src/gromacs/mdlib/domdec.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifdef HAVE_CONFIG_H
  37 #include <config.h>
  38 #endif
  39
  40 #include <stdio.h>
  41 #include <time.h>
  42 #include <math.h>
  43 #include <string.h>
  44 #include <stdlib.h>
  45 #include <assert.h>
  46
  47 #include "typedefs.h"
  48 #include "gromacs/utility/smalloc.h"
  49 #include "gmx_fatal.h"
  50 #include "gmx_fatal_collective.h"
  51 #include "vec.h"
  52 #include "domdec.h"
  53 #include "domdec_network.h"
  54 #include "nrnb.h"
  55 #include "pbc.h"
  56 #include "chargegroup.h"
  57 #include "constr.h"
  58 #include "mdatoms.h"
  59 #include "names.h"
  60 #include "force.h"
  61 #include "pme.h"
  62 #include "mdrun.h"
  63 #include "nsgrid.h"
  64 #include "shellfc.h"
  65 #include "mtop_util.h"
  66 #include "gmx_ga2la.h"
  67 #include "macros.h"
  68 #include "nbnxn_search.h"
  69 #include "bondf.h"
  70 #include "gmx_omp_nthreads.h"
  71 #include "gpu_utils.h"
  72
  73 #include "gromacs/fileio/futil.h"
  74 #include "gromacs/fileio/gmxfio.h"
  75 #include "gromacs/fileio/pdbio.h"
  76 #include "gromacs/timing/wallcycle.h"
  77 #include "gromacs/utility/gmxmpi.h"
  78 #include "gromacs/swap/swapcoords.h"
  79 #include "gromacs/utility/qsort_threadsafe.h"
  80 #include "gromacs/pulling/pull.h"
  81 #include "gromacs/pulling/pull_rotation.h"
  82 #include "gromacs/imd/imd.h"
  83
  84 #define DDRANK(dd, rank)    (rank)
  85 #define DDMASTERRANK(dd)   (dd->masterrank)
  86
  87 typedef struct gmx_domdec_master
  88 {
  89     /* The cell boundaries */
  90     real **cell_x;
  91     /* The global charge group division */
  92     int   *ncg;    /* Number of home charge groups for each node */
  93     int   *index;  /* Index of nnodes+1 into cg */
  94     int   *cg;     /* Global charge group index */
  95     int   *nat;    /* Number of home atoms for each node. */
  96     int   *ibuf;   /* Buffer for communication */
  97     rvec  *vbuf;   /* Buffer for state scattering and gathering */
  98 } gmx_domdec_master_t;
  99
 100 typedef struct
 101 {
 102     /* The numbers of charge groups to send and receive for each cell
 103      * that requires communication, the last entry contains the total
 104      * number of atoms that needs to be communicated.
 105      */
 106     int  nsend[DD_MAXIZONE+2];
 107     int  nrecv[DD_MAXIZONE+2];
 108     /* The charge groups to send */
 109     int *index;
 110     int  nalloc;
 111     /* The atom range for non-in-place communication */
 112     int  cell2at0[DD_MAXIZONE];
 113     int  cell2at1[DD_MAXIZONE];
 114 } gmx_domdec_ind_t;
 115
 116 typedef struct
 117 {
 118     int               np;       /* Number of grid pulses in this dimension */
 119     int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 120     gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 121     int               np_nalloc;
 122     gmx_bool          bInPlace; /* Can we communicate in place?            */
 123 } gmx_domdec_comm_dim_t;
 124
 125 typedef struct
 126 {
 127     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 128     real     *cell_f;      /* State var.: cell boundaries, box relative      */
 129     real     *old_cell_f;  /* Temp. var.: old cell size                      */
 130     real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 131     real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 132     real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 133     real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 134     gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 135     real     *buf_ncd;     /* Temp. var.                                     */
 136 } gmx_domdec_root_t;
 137
 138 #define DD_NLOAD_MAX 9
 139
 140 /* Here floats are accurate enough, since these variables
 141  * only influence the load balancing, not the actual MD results.
 142  */
 143 typedef struct
 144 {
 145     int    nload;
 146     float *load;
 147     float  sum;
 148     float  max;
 149     float  sum_m;
 150     float  cvol_min;
 151     float  mdf;
 152     float  pme;
 153     int    flags;
 154 } gmx_domdec_load_t;
 155
 156 typedef struct
 157 {
 158     int  nsc;
 159     int  ind_gl;
 160     int  ind;
 161 } gmx_cgsort_t;
 162
 163 typedef struct
 164 {
 165     gmx_cgsort_t *sort;
 166     gmx_cgsort_t *sort2;
 167     int           sort_nalloc;
 168     gmx_cgsort_t *sort_new;
 169     int           sort_new_nalloc;
 170     int          *ibuf;
 171     int           ibuf_nalloc;
 172 } gmx_domdec_sort_t;
 173
 174 typedef struct
 175 {
 176     rvec *v;
 177     int   nalloc;
 178 } vec_rvec_t;
 179
 180 /* This enum determines the order of the coordinates.
 181  * ddnatHOME and ddnatZONE should be first and second,
 182  * the others can be ordered as wanted.
 183  */
 184 enum {
 185     ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 186 };
 187
 188 enum {
 189     edlbAUTO, edlbNO, edlbYES, edlbNR
 190 };
 191 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 192
 193 typedef struct
 194 {
 195     int      dim;       /* The dimension                                          */
 196     gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 197     int      nslab;     /* The number of PME slabs in this dimension              */
 198     real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 199     int     *pp_min;    /* The minimum pp node location, size nslab               */
 200     int     *pp_max;    /* The maximum pp node location,size nslab                */
 201     int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 202 } gmx_ddpme_t;
 203
 204 typedef struct
 205 {
 206     real min0;    /* The minimum bottom of this zone                        */
 207     real max1;    /* The maximum top of this zone                           */
 208     real min1;    /* The minimum top of this zone                           */
 209     real mch0;    /* The maximum bottom communicaton height for this zone   */
 210     real mch1;    /* The maximum top communicaton height for this zone      */
 211     real p1_0;    /* The bottom value of the first cell in this zone        */
 212     real p1_1;    /* The top value of the first cell in this zone           */
 213 } gmx_ddzone_t;
 214
 215 typedef struct
 216 {
 217     gmx_domdec_ind_t ind;
 218     int             *ibuf;
 219     int              ibuf_nalloc;
 220     vec_rvec_t       vbuf;
 221     int              nsend;
 222     int              nat;
 223     int              nsend_zone;
 224 } dd_comm_setup_work_t;
 225
 226 typedef struct gmx_domdec_comm
 227 {
 228     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 229      * unless stated otherwise.
 230      */
 231
 232     /* The number of decomposition dimensions for PME, 0: no PME */
 233     int         npmedecompdim;
 234     /* The number of nodes doing PME (PP/PME or only PME) */
 235     int         npmenodes;
 236     int         npmenodes_x;
 237     int         npmenodes_y;
 238     /* The communication setup including the PME only nodes */
 239     gmx_bool    bCartesianPP_PME;
 240     ivec        ntot;
 241     int         cartpmedim;
 242     int        *pmenodes;          /* size npmenodes                         */
 243     int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 244                                     * but with bCartesianPP_PME              */
 245     gmx_ddpme_t ddpme[2];
 246
 247     /* The DD particle-particle nodes only */
 248     gmx_bool bCartesianPP;
 249     int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 250
 251     /* The global charge groups */
 252     t_block cgs_gl;
 253
 254     /* Should we sort the cgs */
 255     int                nstSortCG;
 256     gmx_domdec_sort_t *sort;
 257
 258     /* Are there charge groups? */
 259     gmx_bool bCGs;
 260
 261     /* Are there bonded and multi-body interactions between charge groups? */
 262     gmx_bool bInterCGBondeds;
 263     gmx_bool bInterCGMultiBody;
 264
 265     /* Data for the optional bonded interaction atom communication range */
 266     gmx_bool  bBondComm;
 267     t_blocka *cglink;
 268     char     *bLocalCG;
 269
 270     /* The DLB option */
 271     int      eDLB;
 272     /* Are we actually using DLB? */
 273     gmx_bool bDynLoadBal;
 274
 275     /* Cell sizes for static load balancing, first index cartesian */
 276     real **slb_frac;
 277
 278     /* The width of the communicated boundaries */
 279     real     cutoff_mbody;
 280     real     cutoff;
 281     /* The minimum cell size (including triclinic correction) */
 282     rvec     cellsize_min;
 283     /* For dlb, for use with edlbAUTO */
 284     rvec     cellsize_min_dlb;
 285     /* The lower limit for the DD cell size with DLB */
 286     real     cellsize_limit;
 287     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 288     gmx_bool bVacDLBNoLimit;
 289
 290     /* With PME load balancing we set limits on DLB */
 291     gmx_bool bPMELoadBalDLBLimits;
 292     /* DLB needs to take into account that we want to allow this maximum
 293      * cut-off (for PME load balancing), this could limit cell boundaries.
 294      */
 295     real PMELoadBal_max_cutoff;
 296
 297     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 298     ivec tric_dir;
 299     /* box0 and box_size are required with dim's without pbc and -gcom */
 300     rvec box0;
 301     rvec box_size;
 302
 303     /* The cell boundaries */
 304     rvec cell_x0;
 305     rvec cell_x1;
 306
 307     /* The old location of the cell boundaries, to check cg displacements */
 308     rvec old_cell_x0;
 309     rvec old_cell_x1;
 310
 311     /* The communication setup and charge group boundaries for the zones */
 312     gmx_domdec_zones_t zones;
 313
 314     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 315      * cell boundaries of neighboring cells for dynamic load balancing.
 316      */
 317     gmx_ddzone_t zone_d1[2];
 318     gmx_ddzone_t zone_d2[2][2];
 319
 320     /* The coordinate/force communication setup and indices */
 321     gmx_domdec_comm_dim_t cd[DIM];
 322     /* The maximum number of cells to communicate with in one dimension */
 323     int                   maxpulse;
 324
 325     /* Which cg distribution is stored on the master node */
 326     int master_cg_ddp_count;
 327
 328     /* The number of cg's received from the direct neighbors */
 329     int  zone_ncg1[DD_MAXZONE];
 330
 331     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 332     int  nat[ddnatNR];
 333
 334     /* Array for signalling if atoms have moved to another domain */
 335     int  *moved;
 336     int   moved_nalloc;
 337
 338     /* Communication buffer for general use */
 339     int  *buf_int;
 340     int   nalloc_int;
 341
 342     /* Communication buffer for general use */
 343     vec_rvec_t vbuf;
 344
 345     /* Temporary storage for thread parallel communication setup */
 346     int                   nth;
 347     dd_comm_setup_work_t *dth;
 348
 349     /* Communication buffers only used with multiple grid pulses */
 350     int       *buf_int2;
 351     int        nalloc_int2;
 352     vec_rvec_t vbuf2;
 353
 354     /* Communication buffers for local redistribution */
 355     int  **cggl_flag;
 356     int    cggl_flag_nalloc[DIM*2];
 357     rvec **cgcm_state;
 358     int    cgcm_state_nalloc[DIM*2];
 359
 360     /* Cell sizes for dynamic load balancing */
 361     gmx_domdec_root_t **root;
 362     real               *cell_f_row;
 363     real                cell_f0[DIM];
 364     real                cell_f1[DIM];
 365     real                cell_f_max0[DIM];
 366     real                cell_f_min1[DIM];
 367
 368     /* Stuff for load communication */
 369     gmx_bool           bRecordLoad;
 370     gmx_domdec_load_t *load;
 371     int                nrank_gpu_shared;
 372 #ifdef GMX_MPI
 373     MPI_Comm          *mpi_comm_load;
 374     MPI_Comm           mpi_comm_gpu_shared;
 375 #endif
 376
 377     /* Maximum DLB scaling per load balancing step in percent */
 378     int dlb_scale_lim;
 379
 380     /* Cycle counters */
 381     float  cycl[ddCyclNr];
 382     int    cycl_n[ddCyclNr];
 383     float  cycl_max[ddCyclNr];
 384     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 385     int    eFlop;
 386     double flop;
 387     int    flop_n;
 388     /* Have often have did we have load measurements */
 389     int    n_load_have;
 390     /* Have often have we collected the load measurements */
 391     int    n_load_collect;
 392
 393     /* Statistics */
 394     double sum_nat[ddnatNR-ddnatZONE];
 395     int    ndecomp;
 396     int    nload;
 397     double load_step;
 398     double load_sum;
 399     double load_max;
 400     ivec   load_lim;
 401     double load_mdf;
 402     double load_pme;
 403
 404     /* The last partition step */
 405     gmx_int64_t partition_step;
 406
 407     /* Debugging */
 408     int  nstDDDump;
 409     int  nstDDDumpGrid;
 410     int  DD_debug;
 411 } gmx_domdec_comm_t;
 412
 413 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 414 #define DD_CGIBS 2
 415
 416 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 417 #define DD_FLAG_NRCG  65535
 418 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 419 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 420
 421 /* Zone permutation required to obtain consecutive charge groups
 422  * for neighbor searching.
 423  */
 424 static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 425
 426 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 427  * components see only j zones with that component 0.
 428  */
 429
 430 /* The DD zone order */
 431 static const ivec dd_zo[DD_MAXZONE] =
 432 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 433
 434 /* The 3D setup */
 435 #define dd_z3n  8
 436 #define dd_zp3n 4
 437 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 438
 439 /* The 2D setup */
 440 #define dd_z2n  4
 441 #define dd_zp2n 2
 442 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 443
 444 /* The 1D setup */
 445 #define dd_z1n  2
 446 #define dd_zp1n 1
 447 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 448
 449 /* Factors used to avoid problems due to rounding issues */
 450 #define DD_CELL_MARGIN       1.0001
 451 #define DD_CELL_MARGIN2      1.00005
 452 /* Factor to account for pressure scaling during nstlist steps */
 453 #define DD_PRES_SCALE_MARGIN 1.02
 454
 455 /* Turn on DLB when the load imbalance causes this amount of total loss.
 456  * There is a bit of overhead with DLB and it's difficult to achieve
 457  * a load imbalance of less than 2% with DLB.
 458  */
 459 #define DD_PERF_LOSS_DLB_ON  0.02
 460
 461 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
 462 #define DD_PERF_LOSS_WARN    0.05
 463
 464 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 465
 466 /* Use separate MPI send and receive commands
 467  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 468  * This saves memory (and some copying for small nnodes).
 469  * For high parallelization scatter and gather calls are used.
 470  */
 471 #define GMX_DD_NNODES_SENDRECV 4
 472
 473
 474 /*
 475    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 476
 477    static void index2xyz(ivec nc,int ind,ivec xyz)
 478    {
 479    xyz[XX] = ind % nc[XX];
 480    xyz[YY] = (ind / nc[XX]) % nc[YY];
 481    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 482    }
 483  */
 484
 485 /* This order is required to minimize the coordinate communication in PME
 486  * which uses decomposition in the x direction.
 487  */
 488 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 489
 490 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 491 {
 492     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 493     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 494     xyz[ZZ] = ind % nc[ZZ];
 495 }
 496
 497 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 498 {
 499     int ddindex;
 500     int ddnodeid = -1;
 501
 502     ddindex = dd_index(dd->nc, c);
 503     if (dd->comm->bCartesianPP_PME)
 504     {
 505         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 506     }
 507     else if (dd->comm->bCartesianPP)
 508     {
 509 #ifdef GMX_MPI
 510         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 511 #endif
 512     }
 513     else
 514     {
 515         ddnodeid = ddindex;
 516     }
 517
 518     return ddnodeid;
 519 }
 520
 521 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 522 {
 523     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 524 }
 525
 526 int ddglatnr(gmx_domdec_t *dd, int i)
 527 {
 528     int atnr;
 529
 530     if (dd == NULL)
 531     {
 532         atnr = i + 1;
 533     }
 534     else
 535     {
 536         if (i >= dd->comm->nat[ddnatNR-1])
 537         {
 538             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 539         }
 540         atnr = dd->gatindex[i] + 1;
 541     }
 542
 543     return atnr;
 544 }
 545
 546 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 547 {
 548     return &dd->comm->cgs_gl;
 549 }
 550
 551 static void vec_rvec_init(vec_rvec_t *v)
 552 {
 553     v->nalloc = 0;
 554     v->v      = NULL;
 555 }
 556
 557 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 558 {
 559     if (n > v->nalloc)
 560     {
 561         v->nalloc = over_alloc_dd(n);
 562         srenew(v->v, v->nalloc);
 563     }
 564 }
 565
 566 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 567 {
 568     int i;
 569
 570     if (state->ddp_count != dd->ddp_count)
 571     {
 572         gmx_incons("The state does not the domain decomposition state");
 573     }
 574
 575     state->ncg_gl = dd->ncg_home;
 576     if (state->ncg_gl > state->cg_gl_nalloc)
 577     {
 578         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 579         srenew(state->cg_gl, state->cg_gl_nalloc);
 580     }
 581     for (i = 0; i < state->ncg_gl; i++)
 582     {
 583         state->cg_gl[i] = dd->index_gl[i];
 584     }
 585
 586     state->ddp_count_cg_gl = dd->ddp_count;
 587 }
 588
 589 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 590 {
 591     return &dd->comm->zones;
 592 }
 593
 594 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 595                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 596 {
 597     gmx_domdec_zones_t *zones;
 598     int                 izone, d, dim;
 599
 600     zones = &dd->comm->zones;
 601
 602     izone = 0;
 603     while (icg >= zones->izone[izone].cg1)
 604     {
 605         izone++;
 606     }
 607
 608     if (izone == 0)
 609     {
 610         *jcg0 = icg;
 611     }
 612     else if (izone < zones->nizone)
 613     {
 614         *jcg0 = zones->izone[izone].jcg0;
 615     }
 616     else
 617     {
 618         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 619                   icg, izone, zones->nizone);
 620     }
 621
 622     *jcg1 = zones->izone[izone].jcg1;
 623
 624     for (d = 0; d < dd->ndim; d++)
 625     {
 626         dim         = dd->dim[d];
 627         shift0[dim] = zones->izone[izone].shift0[dim];
 628         shift1[dim] = zones->izone[izone].shift1[dim];
 629         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 630         {
 631             /* A conservative approach, this can be optimized */
 632             shift0[dim] -= 1;
 633             shift1[dim] += 1;
 634         }
 635     }
 636 }
 637
 638 int dd_natoms_vsite(gmx_domdec_t *dd)
 639 {
 640     return dd->comm->nat[ddnatVSITE];
 641 }
 642
 643 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 644 {
 645     *at_start = dd->comm->nat[ddnatCON-1];
 646     *at_end   = dd->comm->nat[ddnatCON];
 647 }
 648
 649 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 650 {
 651     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 652     int                   *index, *cgindex;
 653     gmx_domdec_comm_t     *comm;
 654     gmx_domdec_comm_dim_t *cd;
 655     gmx_domdec_ind_t      *ind;
 656     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 657     gmx_bool               bPBC, bScrew;
 658
 659     comm = dd->comm;
 660
 661     cgindex = dd->cgindex;
 662
 663     buf = comm->vbuf.v;
 664
 665     nzone   = 1;
 666     nat_tot = dd->nat_home;
 667     for (d = 0; d < dd->ndim; d++)
 668     {
 669         bPBC   = (dd->ci[dd->dim[d]] == 0);
 670         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 671         if (bPBC)
 672         {
 673             copy_rvec(box[dd->dim[d]], shift);
 674         }
 675         cd = &comm->cd[d];
 676         for (p = 0; p < cd->np; p++)
 677         {
 678             ind   = &cd->ind[p];
 679             index = ind->index;
 680             n     = 0;
 681             if (!bPBC)
 682             {
 683                 for (i = 0; i < ind->nsend[nzone]; i++)
 684                 {
 685                     at0 = cgindex[index[i]];
 686                     at1 = cgindex[index[i]+1];
 687                     for (j = at0; j < at1; j++)
 688                     {
 689                         copy_rvec(x[j], buf[n]);
 690                         n++;
 691                     }
 692                 }
 693             }
 694             else if (!bScrew)
 695             {
 696                 for (i = 0; i < ind->nsend[nzone]; i++)
 697                 {
 698                     at0 = cgindex[index[i]];
 699                     at1 = cgindex[index[i]+1];
 700                     for (j = at0; j < at1; j++)
 701                     {
 702                         /* We need to shift the coordinates */
 703                         rvec_add(x[j], shift, buf[n]);
 704                         n++;
 705                     }
 706                 }
 707             }
 708             else
 709             {
 710                 for (i = 0; i < ind->nsend[nzone]; i++)
 711                 {
 712                     at0 = cgindex[index[i]];
 713                     at1 = cgindex[index[i]+1];
 714                     for (j = at0; j < at1; j++)
 715                     {
 716                         /* Shift x */
 717                         buf[n][XX] = x[j][XX] + shift[XX];
 718                         /* Rotate y and z.
 719                          * This operation requires a special shift force
 720                          * treatment, which is performed in calc_vir.
 721                          */
 722                         buf[n][YY] = box[YY][YY] - x[j][YY];
 723                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 724                         n++;
 725                     }
 726                 }
 727             }
 728
 729             if (cd->bInPlace)
 730             {
 731                 rbuf = x + nat_tot;
 732             }
 733             else
 734             {
 735                 rbuf = comm->vbuf2.v;
 736             }
 737             /* Send and receive the coordinates */
 738             dd_sendrecv_rvec(dd, d, dddirBackward,
 739                              buf,  ind->nsend[nzone+1],
 740                              rbuf, ind->nrecv[nzone+1]);
 741             if (!cd->bInPlace)
 742             {
 743                 j = 0;
 744                 for (zone = 0; zone < nzone; zone++)
 745                 {
 746                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 747                     {
 748                         copy_rvec(rbuf[j], x[i]);
 749                         j++;
 750                     }
 751                 }
 752             }
 753             nat_tot += ind->nrecv[nzone+1];
 754         }
 755         nzone += nzone;
 756     }
 757 }
 758
 759 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 760 {
 761     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 762     int                   *index, *cgindex;
 763     gmx_domdec_comm_t     *comm;
 764     gmx_domdec_comm_dim_t *cd;
 765     gmx_domdec_ind_t      *ind;
 766     rvec                  *buf, *sbuf;
 767     ivec                   vis;
 768     int                    is;
 769     gmx_bool               bPBC, bScrew;
 770
 771     comm = dd->comm;
 772
 773     cgindex = dd->cgindex;
 774
 775     buf = comm->vbuf.v;
 776
 777     n       = 0;
 778     nzone   = comm->zones.n/2;
 779     nat_tot = dd->nat_tot;
 780     for (d = dd->ndim-1; d >= 0; d--)
 781     {
 782         bPBC   = (dd->ci[dd->dim[d]] == 0);
 783         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 784         if (fshift == NULL && !bScrew)
 785         {
 786             bPBC = FALSE;
 787         }
 788         /* Determine which shift vector we need */
 789         clear_ivec(vis);
 790         vis[dd->dim[d]] = 1;
 791         is              = IVEC2IS(vis);
 792
 793         cd = &comm->cd[d];
 794         for (p = cd->np-1; p >= 0; p--)
 795         {
 796             ind      = &cd->ind[p];
 797             nat_tot -= ind->nrecv[nzone+1];
 798             if (cd->bInPlace)
 799             {
 800                 sbuf = f + nat_tot;
 801             }
 802             else
 803             {
 804                 sbuf = comm->vbuf2.v;
 805                 j    = 0;
 806                 for (zone = 0; zone < nzone; zone++)
 807                 {
 808                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 809                     {
 810                         copy_rvec(f[i], sbuf[j]);
 811                         j++;
 812                     }
 813                 }
 814             }
 815             /* Communicate the forces */
 816             dd_sendrecv_rvec(dd, d, dddirForward,
 817                              sbuf, ind->nrecv[nzone+1],
 818                              buf,  ind->nsend[nzone+1]);
 819             index = ind->index;
 820             /* Add the received forces */
 821             n = 0;
 822             if (!bPBC)
 823             {
 824                 for (i = 0; i < ind->nsend[nzone]; i++)
 825                 {
 826                     at0 = cgindex[index[i]];
 827                     at1 = cgindex[index[i]+1];
 828                     for (j = at0; j < at1; j++)
 829                     {
 830                         rvec_inc(f[j], buf[n]);
 831                         n++;
 832                     }
 833                 }
 834             }
 835             else if (!bScrew)
 836             {
 837                 for (i = 0; i < ind->nsend[nzone]; i++)
 838                 {
 839                     at0 = cgindex[index[i]];
 840                     at1 = cgindex[index[i]+1];
 841                     for (j = at0; j < at1; j++)
 842                     {
 843                         rvec_inc(f[j], buf[n]);
 844                         /* Add this force to the shift force */
 845                         rvec_inc(fshift[is], buf[n]);
 846                         n++;
 847                     }
 848                 }
 849             }
 850             else
 851             {
 852                 for (i = 0; i < ind->nsend[nzone]; i++)
 853                 {
 854                     at0 = cgindex[index[i]];
 855                     at1 = cgindex[index[i]+1];
 856                     for (j = at0; j < at1; j++)
 857                     {
 858                         /* Rotate the force */
 859                         f[j][XX] += buf[n][XX];
 860                         f[j][YY] -= buf[n][YY];
 861                         f[j][ZZ] -= buf[n][ZZ];
 862                         if (fshift)
 863                         {
 864                             /* Add this force to the shift force */
 865                             rvec_inc(fshift[is], buf[n]);
 866                         }
 867                         n++;
 868                     }
 869                 }
 870             }
 871         }
 872         nzone /= 2;
 873     }
 874 }
 875
 876 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 877 {
 878     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 879     int                   *index, *cgindex;
 880     gmx_domdec_comm_t     *comm;
 881     gmx_domdec_comm_dim_t *cd;
 882     gmx_domdec_ind_t      *ind;
 883     real                  *buf, *rbuf;
 884
 885     comm = dd->comm;
 886
 887     cgindex = dd->cgindex;
 888
 889     buf = &comm->vbuf.v[0][0];
 890
 891     nzone   = 1;
 892     nat_tot = dd->nat_home;
 893     for (d = 0; d < dd->ndim; d++)
 894     {
 895         cd = &comm->cd[d];
 896         for (p = 0; p < cd->np; p++)
 897         {
 898             ind   = &cd->ind[p];
 899             index = ind->index;
 900             n     = 0;
 901             for (i = 0; i < ind->nsend[nzone]; i++)
 902             {
 903                 at0 = cgindex[index[i]];
 904                 at1 = cgindex[index[i]+1];
 905                 for (j = at0; j < at1; j++)
 906                 {
 907                     buf[n] = v[j];
 908                     n++;
 909                 }
 910             }
 911
 912             if (cd->bInPlace)
 913             {
 914                 rbuf = v + nat_tot;
 915             }
 916             else
 917             {
 918                 rbuf = &comm->vbuf2.v[0][0];
 919             }
 920             /* Send and receive the coordinates */
 921             dd_sendrecv_real(dd, d, dddirBackward,
 922                              buf,  ind->nsend[nzone+1],
 923                              rbuf, ind->nrecv[nzone+1]);
 924             if (!cd->bInPlace)
 925             {
 926                 j = 0;
 927                 for (zone = 0; zone < nzone; zone++)
 928                 {
 929                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 930                     {
 931                         v[i] = rbuf[j];
 932                         j++;
 933                     }
 934                 }
 935             }
 936             nat_tot += ind->nrecv[nzone+1];
 937         }
 938         nzone += nzone;
 939     }
 940 }
 941
 942 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 943 {
 944     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 945     int                   *index, *cgindex;
 946     gmx_domdec_comm_t     *comm;
 947     gmx_domdec_comm_dim_t *cd;
 948     gmx_domdec_ind_t      *ind;
 949     real                  *buf, *sbuf;
 950
 951     comm = dd->comm;
 952
 953     cgindex = dd->cgindex;
 954
 955     buf = &comm->vbuf.v[0][0];
 956
 957     n       = 0;
 958     nzone   = comm->zones.n/2;
 959     nat_tot = dd->nat_tot;
 960     for (d = dd->ndim-1; d >= 0; d--)
 961     {
 962         cd = &comm->cd[d];
 963         for (p = cd->np-1; p >= 0; p--)
 964         {
 965             ind      = &cd->ind[p];
 966             nat_tot -= ind->nrecv[nzone+1];
 967             if (cd->bInPlace)
 968             {
 969                 sbuf = v + nat_tot;
 970             }
 971             else
 972             {
 973                 sbuf = &comm->vbuf2.v[0][0];
 974                 j    = 0;
 975                 for (zone = 0; zone < nzone; zone++)
 976                 {
 977                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 978                     {
 979                         sbuf[j] = v[i];
 980                         j++;
 981                     }
 982                 }
 983             }
 984             /* Communicate the forces */
 985             dd_sendrecv_real(dd, d, dddirForward,
 986                              sbuf, ind->nrecv[nzone+1],
 987                              buf,  ind->nsend[nzone+1]);
 988             index = ind->index;
 989             /* Add the received forces */
 990             n = 0;
 991             for (i = 0; i < ind->nsend[nzone]; i++)
 992             {
 993                 at0 = cgindex[index[i]];
 994                 at1 = cgindex[index[i]+1];
 995                 for (j = at0; j < at1; j++)
 996                 {
 997                     v[j] += buf[n];
 998                     n++;
 999                 }
1000             }
1001         }
1002         nzone /= 2;
1003     }
1004 }
1005
1006 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
1007 {
1008     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
1009             d, i, j,
1010             zone->min0, zone->max1,
1011             zone->mch0, zone->mch0,
1012             zone->p1_0, zone->p1_1);
1013 }
1014
1015
1016 #define DDZONECOMM_MAXZONE  5
1017 #define DDZONECOMM_BUFSIZE  3
1018
1019 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
1020                                int ddimind, int direction,
1021                                gmx_ddzone_t *buf_s, int n_s,
1022                                gmx_ddzone_t *buf_r, int n_r)
1023 {
1024 #define ZBS  DDZONECOMM_BUFSIZE
1025     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1026     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1027     int  i;
1028
1029     for (i = 0; i < n_s; i++)
1030     {
1031         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1032         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1033         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1034         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1035         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1036         vbuf_s[i*ZBS+1][2] = 0;
1037         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1038         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1039         vbuf_s[i*ZBS+2][2] = 0;
1040     }
1041
1042     dd_sendrecv_rvec(dd, ddimind, direction,
1043                      vbuf_s, n_s*ZBS,
1044                      vbuf_r, n_r*ZBS);
1045
1046     for (i = 0; i < n_r; i++)
1047     {
1048         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1049         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1050         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1051         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1052         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1053         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1054         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1055     }
1056
1057 #undef ZBS
1058 }
1059
1060 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
1061                           rvec cell_ns_x0, rvec cell_ns_x1)
1062 {
1063     int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
1064     gmx_ddzone_t      *zp;
1065     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
1066     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
1067     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
1068     rvec               extr_s[2], extr_r[2];
1069     rvec               dh;
1070     real               dist_d, c = 0, det;
1071     gmx_domdec_comm_t *comm;
1072     gmx_bool           bPBC, bUse;
1073
1074     comm = dd->comm;
1075
1076     for (d = 1; d < dd->ndim; d++)
1077     {
1078         dim      = dd->dim[d];
1079         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1080         zp->min0 = cell_ns_x0[dim];
1081         zp->max1 = cell_ns_x1[dim];
1082         zp->min1 = cell_ns_x1[dim];
1083         zp->mch0 = cell_ns_x0[dim];
1084         zp->mch1 = cell_ns_x1[dim];
1085         zp->p1_0 = cell_ns_x0[dim];
1086         zp->p1_1 = cell_ns_x1[dim];
1087     }
1088
1089     for (d = dd->ndim-2; d >= 0; d--)
1090     {
1091         dim  = dd->dim[d];
1092         bPBC = (dim < ddbox->npbcdim);
1093
1094         /* Use an rvec to store two reals */
1095         extr_s[d][0] = comm->cell_f0[d+1];
1096         extr_s[d][1] = comm->cell_f1[d+1];
1097         extr_s[d][2] = comm->cell_f1[d+1];
1098
1099         pos = 0;
1100         /* Store the extremes in the backward sending buffer,
1101          * so the get updated separately from the forward communication.
1102          */
1103         for (d1 = d; d1 < dd->ndim-1; d1++)
1104         {
1105             /* We invert the order to be able to use the same loop for buf_e */
1106             buf_s[pos].min0 = extr_s[d1][1];
1107             buf_s[pos].max1 = extr_s[d1][0];
1108             buf_s[pos].min1 = extr_s[d1][2];
1109             buf_s[pos].mch0 = 0;
1110             buf_s[pos].mch1 = 0;
1111             /* Store the cell corner of the dimension we communicate along */
1112             buf_s[pos].p1_0 = comm->cell_x0[dim];
1113             buf_s[pos].p1_1 = 0;
1114             pos++;
1115         }
1116
1117         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1118         pos++;
1119
1120         if (dd->ndim == 3 && d == 0)
1121         {
1122             buf_s[pos] = comm->zone_d2[0][1];
1123             pos++;
1124             buf_s[pos] = comm->zone_d1[0];
1125             pos++;
1126         }
1127
1128         /* We only need to communicate the extremes
1129          * in the forward direction
1130          */
1131         npulse = comm->cd[d].np;
1132         if (bPBC)
1133         {
1134             /* Take the minimum to avoid double communication */
1135             npulse_min = min(npulse, dd->nc[dim]-1-npulse);
1136         }
1137         else
1138         {
1139             /* Without PBC we should really not communicate over
1140              * the boundaries, but implementing that complicates
1141              * the communication setup and therefore we simply
1142              * do all communication, but ignore some data.
1143              */
1144             npulse_min = npulse;
1145         }
1146         for (p = 0; p < npulse_min; p++)
1147         {
1148             /* Communicate the extremes forward */
1149             bUse = (bPBC || dd->ci[dim] > 0);
1150
1151             dd_sendrecv_rvec(dd, d, dddirForward,
1152                              extr_s+d, dd->ndim-d-1,
1153                              extr_r+d, dd->ndim-d-1);
1154
1155             if (bUse)
1156             {
1157                 for (d1 = d; d1 < dd->ndim-1; d1++)
1158                 {
1159                     extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
1160                     extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
1161                     extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
1162                 }
1163             }
1164         }
1165
1166         buf_size = pos;
1167         for (p = 0; p < npulse; p++)
1168         {
1169             /* Communicate all the zone information backward */
1170             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1171
1172             dd_sendrecv_ddzone(dd, d, dddirBackward,
1173                                buf_s, buf_size,
1174                                buf_r, buf_size);
1175
1176             clear_rvec(dh);
1177             if (p > 0)
1178             {
1179                 for (d1 = d+1; d1 < dd->ndim; d1++)
1180                 {
1181                     /* Determine the decrease of maximum required
1182                      * communication height along d1 due to the distance along d,
1183                      * this avoids a lot of useless atom communication.
1184                      */
1185                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1186
1187                     if (ddbox->tric_dir[dim])
1188                     {
1189                         /* c is the off-diagonal coupling between the cell planes
1190                          * along directions d and d1.
1191                          */
1192                         c = ddbox->v[dim][dd->dim[d1]][dim];
1193                     }
1194                     else
1195                     {
1196                         c = 0;
1197                     }
1198                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1199                     if (det > 0)
1200                     {
1201                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1202                     }
1203                     else
1204                     {
1205                         /* A negative value signals out of range */
1206                         dh[d1] = -1;
1207                     }
1208                 }
1209             }
1210
1211             /* Accumulate the extremes over all pulses */
1212             for (i = 0; i < buf_size; i++)
1213             {
1214                 if (p == 0)
1215                 {
1216                     buf_e[i] = buf_r[i];
1217                 }
1218                 else
1219                 {
1220                     if (bUse)
1221                     {
1222                         buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
1223                         buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
1224                         buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
1225                     }
1226
1227                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1228                     {
1229                         d1 = 1;
1230                     }
1231                     else
1232                     {
1233                         d1 = d + 1;
1234                     }
1235                     if (bUse && dh[d1] >= 0)
1236                     {
1237                         buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
1238                         buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
1239                     }
1240                 }
1241                 /* Copy the received buffer to the send buffer,
1242                  * to pass the data through with the next pulse.
1243                  */
1244                 buf_s[i] = buf_r[i];
1245             }
1246             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1247                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1248             {
1249                 /* Store the extremes */
1250                 pos = 0;
1251
1252                 for (d1 = d; d1 < dd->ndim-1; d1++)
1253                 {
1254                     extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
1255                     extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
1256                     extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
1257                     pos++;
1258                 }
1259
1260                 if (d == 1 || (d == 0 && dd->ndim == 3))
1261                 {
1262                     for (i = d; i < 2; i++)
1263                     {
1264                         comm->zone_d2[1-d][i] = buf_e[pos];
1265                         pos++;
1266                     }
1267                 }
1268                 if (d == 0)
1269                 {
1270                     comm->zone_d1[1] = buf_e[pos];
1271                     pos++;
1272                 }
1273             }
1274         }
1275     }
1276
1277     if (dd->ndim >= 2)
1278     {
1279         dim = dd->dim[1];
1280         for (i = 0; i < 2; i++)
1281         {
1282             if (debug)
1283             {
1284                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1285             }
1286             cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1287             cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1288         }
1289     }
1290     if (dd->ndim >= 3)
1291     {
1292         dim = dd->dim[2];
1293         for (i = 0; i < 2; i++)
1294         {
1295             for (j = 0; j < 2; j++)
1296             {
1297                 if (debug)
1298                 {
1299                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1300                 }
1301                 cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1302                 cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1303             }
1304         }
1305     }
1306     for (d = 1; d < dd->ndim; d++)
1307     {
1308         comm->cell_f_max0[d] = extr_s[d-1][0];
1309         comm->cell_f_min1[d] = extr_s[d-1][1];
1310         if (debug)
1311         {
1312             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1313                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1314         }
1315     }
1316 }
1317
1318 static void dd_collect_cg(gmx_domdec_t *dd,
1319                           t_state      *state_local)
1320 {
1321     gmx_domdec_master_t *ma = NULL;
1322     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1323
1324     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1325     {
1326         /* The master has the correct distribution */
1327         return;
1328     }
1329
1330     if (state_local->ddp_count == dd->ddp_count)
1331     {
1332         /* The local state and DD are in sync, use the DD indices */
1333         ncg_home = dd->ncg_home;
1334         cg       = dd->index_gl;
1335         nat_home = dd->nat_home;
1336     }
1337     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1338     {
1339         /* The DD is out of sync with the local state, but we have stored
1340          * the cg indices with the local state, so we can use those.
1341          */
1342         t_block *cgs_gl;
1343
1344         cgs_gl = &dd->comm->cgs_gl;
1345
1346         ncg_home = state_local->ncg_gl;
1347         cg       = state_local->cg_gl;
1348         nat_home = 0;
1349         for (i = 0; i < ncg_home; i++)
1350         {
1351             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1352         }
1353     }
1354     else
1355     {
1356         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1357     }
1358
1359     buf2[0] = ncg_home;
1360     buf2[1] = nat_home;
1361     if (DDMASTER(dd))
1362     {
1363         ma   = dd->ma;
1364         ibuf = ma->ibuf;
1365     }
1366     else
1367     {
1368         ibuf = NULL;
1369     }
1370     /* Collect the charge group and atom counts on the master */
1371     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1372
1373     if (DDMASTER(dd))
1374     {
1375         ma->index[0] = 0;
1376         for (i = 0; i < dd->nnodes; i++)
1377         {
1378             ma->ncg[i]     = ma->ibuf[2*i];
1379             ma->nat[i]     = ma->ibuf[2*i+1];
1380             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1381
1382         }
1383         /* Make byte counts and indices */
1384         for (i = 0; i < dd->nnodes; i++)
1385         {
1386             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1387             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1388         }
1389         if (debug)
1390         {
1391             fprintf(debug, "Initial charge group distribution: ");
1392             for (i = 0; i < dd->nnodes; i++)
1393             {
1394                 fprintf(debug, " %d", ma->ncg[i]);
1395             }
1396             fprintf(debug, "\n");
1397         }
1398     }
1399
1400     /* Collect the charge group indices on the master */
1401     dd_gatherv(dd,
1402                ncg_home*sizeof(int), cg,
1403                DDMASTER(dd) ? ma->ibuf : NULL,
1404                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1405                DDMASTER(dd) ? ma->cg : NULL);
1406
1407     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1408 }
1409
1410 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1411                                     rvec *lv, rvec *v)
1412 {
1413     gmx_domdec_master_t *ma;
1414     int                  n, i, c, a, nalloc = 0;
1415     rvec                *buf = NULL;
1416     t_block             *cgs_gl;
1417
1418     ma = dd->ma;
1419
1420     if (!DDMASTER(dd))
1421     {
1422 #ifdef GMX_MPI
1423         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1424                  dd->rank, dd->mpi_comm_all);
1425 #endif
1426     }
1427     else
1428     {
1429         /* Copy the master coordinates to the global array */
1430         cgs_gl = &dd->comm->cgs_gl;
1431
1432         n = DDMASTERRANK(dd);
1433         a = 0;
1434         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1435         {
1436             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1437             {
1438                 copy_rvec(lv[a++], v[c]);
1439             }
1440         }
1441
1442         for (n = 0; n < dd->nnodes; n++)
1443         {
1444             if (n != dd->rank)
1445             {
1446                 if (ma->nat[n] > nalloc)
1447                 {
1448                     nalloc = over_alloc_dd(ma->nat[n]);
1449                     srenew(buf, nalloc);
1450                 }
1451 #ifdef GMX_MPI
1452                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1453                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1454 #endif
1455                 a = 0;
1456                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1457                 {
1458                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1459                     {
1460                         copy_rvec(buf[a++], v[c]);
1461                     }
1462                 }
1463             }
1464         }
1465         sfree(buf);
1466     }
1467 }
1468
1469 static void get_commbuffer_counts(gmx_domdec_t *dd,
1470                                   int **counts, int **disps)
1471 {
1472     gmx_domdec_master_t *ma;
1473     int                  n;
1474
1475     ma = dd->ma;
1476
1477     /* Make the rvec count and displacment arrays */
1478     *counts  = ma->ibuf;
1479     *disps   = ma->ibuf + dd->nnodes;
1480     for (n = 0; n < dd->nnodes; n++)
1481     {
1482         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1483         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1484     }
1485 }
1486
1487 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1488                                    rvec *lv, rvec *v)
1489 {
1490     gmx_domdec_master_t *ma;
1491     int                 *rcounts = NULL, *disps = NULL;
1492     int                  n, i, c, a;
1493     rvec                *buf = NULL;
1494     t_block             *cgs_gl;
1495
1496     ma = dd->ma;
1497
1498     if (DDMASTER(dd))
1499     {
1500         get_commbuffer_counts(dd, &rcounts, &disps);
1501
1502         buf = ma->vbuf;
1503     }
1504
1505     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1506
1507     if (DDMASTER(dd))
1508     {
1509         cgs_gl = &dd->comm->cgs_gl;
1510
1511         a = 0;
1512         for (n = 0; n < dd->nnodes; n++)
1513         {
1514             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1515             {
1516                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1517                 {
1518                     copy_rvec(buf[a++], v[c]);
1519                 }
1520             }
1521         }
1522     }
1523 }
1524
1525 void dd_collect_vec(gmx_domdec_t *dd,
1526                     t_state *state_local, rvec *lv, rvec *v)
1527 {
1528     gmx_domdec_master_t *ma;
1529     int                  n, i, c, a, nalloc = 0;
1530     rvec                *buf = NULL;
1531
1532     dd_collect_cg(dd, state_local);
1533
1534     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1535     {
1536         dd_collect_vec_sendrecv(dd, lv, v);
1537     }
1538     else
1539     {
1540         dd_collect_vec_gatherv(dd, lv, v);
1541     }
1542 }
1543
1544
1545 void dd_collect_state(gmx_domdec_t *dd,
1546                       t_state *state_local, t_state *state)
1547 {
1548     int est, i, j, nh;
1549
1550     nh = state->nhchainlength;
1551
1552     if (DDMASTER(dd))
1553     {
1554         for (i = 0; i < efptNR; i++)
1555         {
1556             state->lambda[i] = state_local->lambda[i];
1557         }
1558         state->fep_state = state_local->fep_state;
1559         state->veta      = state_local->veta;
1560         state->vol0      = state_local->vol0;
1561         copy_mat(state_local->box, state->box);
1562         copy_mat(state_local->boxv, state->boxv);
1563         copy_mat(state_local->svir_prev, state->svir_prev);
1564         copy_mat(state_local->fvir_prev, state->fvir_prev);
1565         copy_mat(state_local->pres_prev, state->pres_prev);
1566
1567         for (i = 0; i < state_local->ngtc; i++)
1568         {
1569             for (j = 0; j < nh; j++)
1570             {
1571                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1572                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1573             }
1574             state->therm_integral[i] = state_local->therm_integral[i];
1575         }
1576         for (i = 0; i < state_local->nnhpres; i++)
1577         {
1578             for (j = 0; j < nh; j++)
1579             {
1580                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1581                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1582             }
1583         }
1584     }
1585     for (est = 0; est < estNR; est++)
1586     {
1587         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1588         {
1589             switch (est)
1590             {
1591                 case estX:
1592                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1593                     break;
1594                 case estV:
1595                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1596                     break;
1597                 case estSDX:
1598                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1599                     break;
1600                 case estCGP:
1601                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1602                     break;
1603                 case estDISRE_INITF:
1604                 case estDISRE_RM3TAV:
1605                 case estORIRE_INITF:
1606                 case estORIRE_DTAV:
1607                     break;
1608                 default:
1609                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1610             }
1611         }
1612     }
1613 }
1614
1615 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1616 {
1617     int est;
1618
1619     if (debug)
1620     {
1621         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1622     }
1623
1624     state->nalloc = over_alloc_dd(nalloc);
1625
1626     for (est = 0; est < estNR; est++)
1627     {
1628         if (EST_DISTR(est) && (state->flags & (1<<est)))
1629         {
1630             switch (est)
1631             {
1632                 case estX:
1633                     srenew(state->x, state->nalloc);
1634                     break;
1635                 case estV:
1636                     srenew(state->v, state->nalloc);
1637                     break;
1638                 case estSDX:
1639                     srenew(state->sd_X, state->nalloc);
1640                     break;
1641                 case estCGP:
1642                     srenew(state->cg_p, state->nalloc);
1643                     break;
1644                 case estDISRE_INITF:
1645                 case estDISRE_RM3TAV:
1646                 case estORIRE_INITF:
1647                 case estORIRE_DTAV:
1648                     /* No reallocation required */
1649                     break;
1650                 default:
1651                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1652             }
1653         }
1654     }
1655
1656     if (f != NULL)
1657     {
1658         srenew(*f, state->nalloc);
1659     }
1660 }
1661
1662 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1663                                int nalloc)
1664 {
1665     if (nalloc > fr->cg_nalloc)
1666     {
1667         if (debug)
1668         {
1669             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1670         }
1671         fr->cg_nalloc = over_alloc_dd(nalloc);
1672         srenew(fr->cginfo, fr->cg_nalloc);
1673         if (fr->cutoff_scheme == ecutsGROUP)
1674         {
1675             srenew(fr->cg_cm, fr->cg_nalloc);
1676         }
1677     }
1678     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1679     {
1680         /* We don't use charge groups, we use x in state to set up
1681          * the atom communication.
1682          */
1683         dd_realloc_state(state, f, nalloc);
1684     }
1685 }
1686
1687 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1688                                        rvec *v, rvec *lv)
1689 {
1690     gmx_domdec_master_t *ma;
1691     int                  n, i, c, a, nalloc = 0;
1692     rvec                *buf = NULL;
1693
1694     if (DDMASTER(dd))
1695     {
1696         ma  = dd->ma;
1697
1698         for (n = 0; n < dd->nnodes; n++)
1699         {
1700             if (n != dd->rank)
1701             {
1702                 if (ma->nat[n] > nalloc)
1703                 {
1704                     nalloc = over_alloc_dd(ma->nat[n]);
1705                     srenew(buf, nalloc);
1706                 }
1707                 /* Use lv as a temporary buffer */
1708                 a = 0;
1709                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1710                 {
1711                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1712                     {
1713                         copy_rvec(v[c], buf[a++]);
1714                     }
1715                 }
1716                 if (a != ma->nat[n])
1717                 {
1718                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1719                               a, ma->nat[n]);
1720                 }
1721
1722 #ifdef GMX_MPI
1723                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1724                          DDRANK(dd, n), n, dd->mpi_comm_all);
1725 #endif
1726             }
1727         }
1728         sfree(buf);
1729         n = DDMASTERRANK(dd);
1730         a = 0;
1731         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1732         {
1733             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1734             {
1735                 copy_rvec(v[c], lv[a++]);
1736             }
1737         }
1738     }
1739     else
1740     {
1741 #ifdef GMX_MPI
1742         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1743                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1744 #endif
1745     }
1746 }
1747
1748 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1749                                        rvec *v, rvec *lv)
1750 {
1751     gmx_domdec_master_t *ma;
1752     int                 *scounts = NULL, *disps = NULL;
1753     int                  n, i, c, a, nalloc = 0;
1754     rvec                *buf = NULL;
1755
1756     if (DDMASTER(dd))
1757     {
1758         ma  = dd->ma;
1759
1760         get_commbuffer_counts(dd, &scounts, &disps);
1761
1762         buf = ma->vbuf;
1763         a   = 0;
1764         for (n = 0; n < dd->nnodes; n++)
1765         {
1766             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1767             {
1768                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1769                 {
1770                     copy_rvec(v[c], buf[a++]);
1771                 }
1772             }
1773         }
1774     }
1775
1776     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1777 }
1778
1779 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1780 {
1781     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1782     {
1783         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1784     }
1785     else
1786     {
1787         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1788     }
1789 }
1790
1791 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1792 {
1793     int i;
1794     dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1795     dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1796     dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1797
1798     if (dfhist->nlambda > 0)
1799     {
1800         int nlam = dfhist->nlambda;
1801         dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1802         dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1803         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1804         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1805         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1806         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1807
1808         for (i = 0; i < nlam; i++)
1809         {
1810             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1811             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1812             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1813             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1814             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1815             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1816         }
1817     }
1818 }
1819
1820 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1821                                 t_state *state, t_state *state_local,
1822                                 rvec **f)
1823 {
1824     int  i, j, nh;
1825
1826     nh = state->nhchainlength;
1827
1828     if (DDMASTER(dd))
1829     {
1830         for (i = 0; i < efptNR; i++)
1831         {
1832             state_local->lambda[i] = state->lambda[i];
1833         }
1834         state_local->fep_state = state->fep_state;
1835         state_local->veta      = state->veta;
1836         state_local->vol0      = state->vol0;
1837         copy_mat(state->box, state_local->box);
1838         copy_mat(state->box_rel, state_local->box_rel);
1839         copy_mat(state->boxv, state_local->boxv);
1840         copy_mat(state->svir_prev, state_local->svir_prev);
1841         copy_mat(state->fvir_prev, state_local->fvir_prev);
1842         copy_df_history(&state_local->dfhist, &state->dfhist);
1843         for (i = 0; i < state_local->ngtc; i++)
1844         {
1845             for (j = 0; j < nh; j++)
1846             {
1847                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1848                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1849             }
1850             state_local->therm_integral[i] = state->therm_integral[i];
1851         }
1852         for (i = 0; i < state_local->nnhpres; i++)
1853         {
1854             for (j = 0; j < nh; j++)
1855             {
1856                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1857                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1858             }
1859         }
1860     }
1861     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1862     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1863     dd_bcast(dd, sizeof(real), &state_local->veta);
1864     dd_bcast(dd, sizeof(real), &state_local->vol0);
1865     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1866     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1867     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1868     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1869     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1870     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1871     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1872     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1873     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1874     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1875
1876     /* communicate df_history -- required for restarting from checkpoint */
1877     dd_distribute_dfhist(dd, &state_local->dfhist);
1878
1879     if (dd->nat_home > state_local->nalloc)
1880     {
1881         dd_realloc_state(state_local, f, dd->nat_home);
1882     }
1883     for (i = 0; i < estNR; i++)
1884     {
1885         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1886         {
1887             switch (i)
1888             {
1889                 case estX:
1890                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1891                     break;
1892                 case estV:
1893                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1894                     break;
1895                 case estSDX:
1896                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1897                     break;
1898                 case estCGP:
1899                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1900                     break;
1901                 case estDISRE_INITF:
1902                 case estDISRE_RM3TAV:
1903                 case estORIRE_INITF:
1904                 case estORIRE_DTAV:
1905                     /* Not implemented yet */
1906                     break;
1907                 default:
1908                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1909             }
1910         }
1911     }
1912 }
1913
1914 static char dim2char(int dim)
1915 {
1916     char c = '?';
1917
1918     switch (dim)
1919     {
1920         case XX: c = 'X'; break;
1921         case YY: c = 'Y'; break;
1922         case ZZ: c = 'Z'; break;
1923         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1924     }
1925
1926     return c;
1927 }
1928
1929 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1930                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1931 {
1932     rvec   grid_s[2], *grid_r = NULL, cx, r;
1933     char   fname[STRLEN], buf[22];
1934     FILE  *out;
1935     int    a, i, d, z, y, x;
1936     matrix tric;
1937     real   vol;
1938
1939     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1940     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1941
1942     if (DDMASTER(dd))
1943     {
1944         snew(grid_r, 2*dd->nnodes);
1945     }
1946
1947     dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
1948
1949     if (DDMASTER(dd))
1950     {
1951         for (d = 0; d < DIM; d++)
1952         {
1953             for (i = 0; i < DIM; i++)
1954             {
1955                 if (d == i)
1956                 {
1957                     tric[d][i] = 1;
1958                 }
1959                 else
1960                 {
1961                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1962                     {
1963                         tric[d][i] = box[i][d]/box[i][i];
1964                     }
1965                     else
1966                     {
1967                         tric[d][i] = 0;
1968                     }
1969                 }
1970             }
1971         }
1972         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1973         out = gmx_fio_fopen(fname, "w");
1974         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1975         a = 1;
1976         for (i = 0; i < dd->nnodes; i++)
1977         {
1978             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1979             for (d = 0; d < DIM; d++)
1980             {
1981                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1982             }
1983             for (z = 0; z < 2; z++)
1984             {
1985                 for (y = 0; y < 2; y++)
1986                 {
1987                     for (x = 0; x < 2; x++)
1988                     {
1989                         cx[XX] = grid_r[i*2+x][XX];
1990                         cx[YY] = grid_r[i*2+y][YY];
1991                         cx[ZZ] = grid_r[i*2+z][ZZ];
1992                         mvmul(tric, cx, r);
1993                         gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1994                                                  10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
1995                     }
1996                 }
1997             }
1998             for (d = 0; d < DIM; d++)
1999             {
2000                 for (x = 0; x < 4; x++)
2001                 {
2002                     switch (d)
2003                     {
2004                         case 0: y = 1 + i*8 + 2*x; break;
2005                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
2006                         case 2: y = 1 + i*8 + x; break;
2007                     }
2008                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2009                 }
2010             }
2011         }
2012         gmx_fio_fclose(out);
2013         sfree(grid_r);
2014     }
2015 }
2016
2017 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
2018                   gmx_mtop_t *mtop, t_commrec *cr,
2019                   int natoms, rvec x[], matrix box)
2020 {
2021     char          fname[STRLEN], buf[22];
2022     FILE         *out;
2023     int           i, ii, resnr, c;
2024     char         *atomname, *resname;
2025     real          b;
2026     gmx_domdec_t *dd;
2027
2028     dd = cr->dd;
2029     if (natoms == -1)
2030     {
2031         natoms = dd->comm->nat[ddnatVSITE];
2032     }
2033
2034     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2035
2036     out = gmx_fio_fopen(fname, "w");
2037
2038     fprintf(out, "TITLE     %s\n", title);
2039     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2040     for (i = 0; i < natoms; i++)
2041     {
2042         ii = dd->gatindex[i];
2043         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2044         if (i < dd->comm->nat[ddnatZONE])
2045         {
2046             c = 0;
2047             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2048             {
2049                 c++;
2050             }
2051             b = c;
2052         }
2053         else if (i < dd->comm->nat[ddnatVSITE])
2054         {
2055             b = dd->comm->zones.n;
2056         }
2057         else
2058         {
2059             b = dd->comm->zones.n + 1;
2060         }
2061         gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
2062                                  10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
2063     }
2064     fprintf(out, "TER\n");
2065
2066     gmx_fio_fclose(out);
2067 }
2068
2069 real dd_cutoff_mbody(gmx_domdec_t *dd)
2070 {
2071     gmx_domdec_comm_t *comm;
2072     int                di;
2073     real               r;
2074
2075     comm = dd->comm;
2076
2077     r = -1;
2078     if (comm->bInterCGBondeds)
2079     {
2080         if (comm->cutoff_mbody > 0)
2081         {
2082             r = comm->cutoff_mbody;
2083         }
2084         else
2085         {
2086             /* cutoff_mbody=0 means we do not have DLB */
2087             r = comm->cellsize_min[dd->dim[0]];
2088             for (di = 1; di < dd->ndim; di++)
2089             {
2090                 r = min(r, comm->cellsize_min[dd->dim[di]]);
2091             }
2092             if (comm->bBondComm)
2093             {
2094                 r = max(r, comm->cutoff_mbody);
2095             }
2096             else
2097             {
2098                 r = min(r, comm->cutoff);
2099             }
2100         }
2101     }
2102
2103     return r;
2104 }
2105
2106 real dd_cutoff_twobody(gmx_domdec_t *dd)
2107 {
2108     real r_mb;
2109
2110     r_mb = dd_cutoff_mbody(dd);
2111
2112     return max(dd->comm->cutoff, r_mb);
2113 }
2114
2115
2116 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2117 {
2118     int nc, ntot;
2119
2120     nc   = dd->nc[dd->comm->cartpmedim];
2121     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2122     copy_ivec(coord, coord_pme);
2123     coord_pme[dd->comm->cartpmedim] =
2124         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2125 }
2126
2127 static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
2128 {
2129     /* Here we assign a PME node to communicate with this DD node
2130      * by assuming that the major index of both is x.
2131      * We add cr->npmenodes/2 to obtain an even distribution.
2132      */
2133     return (ddindex*npme + npme/2)/ndd;
2134 }
2135
2136 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2137 {
2138     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2139 }
2140
2141 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2142 {
2143     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2144 }
2145
2146 static int *dd_pmenodes(t_commrec *cr)
2147 {
2148     int *pmenodes;
2149     int  n, i, p0, p1;
2150
2151     snew(pmenodes, cr->npmenodes);
2152     n = 0;
2153     for (i = 0; i < cr->dd->nnodes; i++)
2154     {
2155         p0 = cr_ddindex2pmeindex(cr, i);
2156         p1 = cr_ddindex2pmeindex(cr, i+1);
2157         if (i+1 == cr->dd->nnodes || p1 > p0)
2158         {
2159             if (debug)
2160             {
2161                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2162             }
2163             pmenodes[n] = i + 1 + n;
2164             n++;
2165         }
2166     }
2167
2168     return pmenodes;
2169 }
2170
2171 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2172 {
2173     gmx_domdec_t *dd;
2174     ivec          coords, coords_pme, nc;
2175     int           slab;
2176
2177     dd = cr->dd;
2178     /*
2179        if (dd->comm->bCartesian) {
2180        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2181        dd_coords2pmecoords(dd,coords,coords_pme);
2182        copy_ivec(dd->ntot,nc);
2183        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2184        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2185
2186        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2187        } else {
2188        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2189        }
2190      */
2191     coords[XX] = x;
2192     coords[YY] = y;
2193     coords[ZZ] = z;
2194     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2195
2196     return slab;
2197 }
2198
2199 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2200 {
2201     gmx_domdec_comm_t *comm;
2202     ivec               coords;
2203     int                ddindex, nodeid = -1;
2204
2205     comm = cr->dd->comm;
2206
2207     coords[XX] = x;
2208     coords[YY] = y;
2209     coords[ZZ] = z;
2210     if (comm->bCartesianPP_PME)
2211     {
2212 #ifdef GMX_MPI
2213         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2214 #endif
2215     }
2216     else
2217     {
2218         ddindex = dd_index(cr->dd->nc, coords);
2219         if (comm->bCartesianPP)
2220         {
2221             nodeid = comm->ddindex2simnodeid[ddindex];
2222         }
2223         else
2224         {
2225             if (comm->pmenodes)
2226             {
2227                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2228             }
2229             else
2230             {
2231                 nodeid = ddindex;
2232             }
2233         }
2234     }
2235
2236     return nodeid;
2237 }
2238
2239 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2240 {
2241     gmx_domdec_t      *dd;
2242     gmx_domdec_comm_t *comm;
2243     ivec               coord, coord_pme;
2244     int                i;
2245     int                pmenode = -1;
2246
2247     dd   = cr->dd;
2248     comm = dd->comm;
2249
2250     /* This assumes a uniform x domain decomposition grid cell size */
2251     if (comm->bCartesianPP_PME)
2252     {
2253 #ifdef GMX_MPI
2254         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2255         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2256         {
2257             /* This is a PP node */
2258             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2259             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2260         }
2261 #endif
2262     }
2263     else if (comm->bCartesianPP)
2264     {
2265         if (sim_nodeid < dd->nnodes)
2266         {
2267             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2268         }
2269     }
2270     else
2271     {
2272         /* This assumes DD cells with identical x coordinates
2273          * are numbered sequentially.
2274          */
2275         if (dd->comm->pmenodes == NULL)
2276         {
2277             if (sim_nodeid < dd->nnodes)
2278             {
2279                 /* The DD index equals the nodeid */
2280                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2281             }
2282         }
2283         else
2284         {
2285             i = 0;
2286             while (sim_nodeid > dd->comm->pmenodes[i])
2287             {
2288                 i++;
2289             }
2290             if (sim_nodeid < dd->comm->pmenodes[i])
2291             {
2292                 pmenode = dd->comm->pmenodes[i];
2293             }
2294         }
2295     }
2296
2297     return pmenode;
2298 }
2299
2300 void get_pme_nnodes(const gmx_domdec_t *dd,
2301                     int *npmenodes_x, int *npmenodes_y)
2302 {
2303     if (dd != NULL)
2304     {
2305         *npmenodes_x = dd->comm->npmenodes_x;
2306         *npmenodes_y = dd->comm->npmenodes_y;
2307     }
2308     else
2309     {
2310         *npmenodes_x = 1;
2311         *npmenodes_y = 1;
2312     }
2313 }
2314
2315 gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
2316 {
2317     gmx_bool bPMEOnlyNode;
2318
2319     if (DOMAINDECOMP(cr))
2320     {
2321         bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
2322     }
2323     else
2324     {
2325         bPMEOnlyNode = FALSE;
2326     }
2327
2328     return bPMEOnlyNode;
2329 }
2330
2331 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2332                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2333 {
2334     gmx_domdec_t *dd;
2335     int           x, y, z;
2336     ivec          coord, coord_pme;
2337
2338     dd = cr->dd;
2339
2340     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2341
2342     *nmy_ddnodes = 0;
2343     for (x = 0; x < dd->nc[XX]; x++)
2344     {
2345         for (y = 0; y < dd->nc[YY]; y++)
2346         {
2347             for (z = 0; z < dd->nc[ZZ]; z++)
2348             {
2349                 if (dd->comm->bCartesianPP_PME)
2350                 {
2351                     coord[XX] = x;
2352                     coord[YY] = y;
2353                     coord[ZZ] = z;
2354                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2355                     if (dd->ci[XX] == coord_pme[XX] &&
2356                         dd->ci[YY] == coord_pme[YY] &&
2357                         dd->ci[ZZ] == coord_pme[ZZ])
2358                     {
2359                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2360                     }
2361                 }
2362                 else
2363                 {
2364                     /* The slab corresponds to the nodeid in the PME group */
2365                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2366                     {
2367                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2368                     }
2369                 }
2370             }
2371         }
2372     }
2373
2374     /* The last PP-only node is the peer node */
2375     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2376
2377     if (debug)
2378     {
2379         fprintf(debug, "Receive coordinates from PP ranks:");
2380         for (x = 0; x < *nmy_ddnodes; x++)
2381         {
2382             fprintf(debug, " %d", (*my_ddnodes)[x]);
2383         }
2384         fprintf(debug, "\n");
2385     }
2386 }
2387
2388 static gmx_bool receive_vir_ener(t_commrec *cr)
2389 {
2390     gmx_domdec_comm_t *comm;
2391     int                pmenode, coords[DIM], rank;
2392     gmx_bool           bReceive;
2393
2394     bReceive = TRUE;
2395     if (cr->npmenodes < cr->dd->nnodes)
2396     {
2397         comm = cr->dd->comm;
2398         if (comm->bCartesianPP_PME)
2399         {
2400             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2401 #ifdef GMX_MPI
2402             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2403             coords[comm->cartpmedim]++;
2404             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2405             {
2406                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2407                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2408                 {
2409                     /* This is not the last PP node for pmenode */
2410                     bReceive = FALSE;
2411                 }
2412             }
2413 #endif
2414         }
2415         else
2416         {
2417             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2418             if (cr->sim_nodeid+1 < cr->nnodes &&
2419                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2420             {
2421                 /* This is not the last PP node for pmenode */
2422                 bReceive = FALSE;
2423             }
2424         }
2425     }
2426
2427     return bReceive;
2428 }
2429
2430 static void set_zones_ncg_home(gmx_domdec_t *dd)
2431 {
2432     gmx_domdec_zones_t *zones;
2433     int                 i;
2434
2435     zones = &dd->comm->zones;
2436
2437     zones->cg_range[0] = 0;
2438     for (i = 1; i < zones->n+1; i++)
2439     {
2440         zones->cg_range[i] = dd->ncg_home;
2441     }
2442     /* zone_ncg1[0] should always be equal to ncg_home */
2443     dd->comm->zone_ncg1[0] = dd->ncg_home;
2444 }
2445
2446 static void rebuild_cgindex(gmx_domdec_t *dd,
2447                             const int *gcgs_index, t_state *state)
2448 {
2449     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2450
2451     ind        = state->cg_gl;
2452     dd_cg_gl   = dd->index_gl;
2453     cgindex    = dd->cgindex;
2454     nat        = 0;
2455     cgindex[0] = nat;
2456     for (i = 0; i < state->ncg_gl; i++)
2457     {
2458         cgindex[i]  = nat;
2459         cg_gl       = ind[i];
2460         dd_cg_gl[i] = cg_gl;
2461         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2462     }
2463     cgindex[i] = nat;
2464
2465     dd->ncg_home = state->ncg_gl;
2466     dd->nat_home = nat;
2467
2468     set_zones_ncg_home(dd);
2469 }
2470
2471 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2472 {
2473     while (cg >= cginfo_mb->cg_end)
2474     {
2475         cginfo_mb++;
2476     }
2477
2478     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2479 }
2480
2481 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2482                           t_forcerec *fr, char *bLocalCG)
2483 {
2484     cginfo_mb_t *cginfo_mb;
2485     int         *cginfo;
2486     int          cg;
2487
2488     if (fr != NULL)
2489     {
2490         cginfo_mb = fr->cginfo_mb;
2491         cginfo    = fr->cginfo;
2492
2493         for (cg = cg0; cg < cg1; cg++)
2494         {
2495             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2496         }
2497     }
2498
2499     if (bLocalCG != NULL)
2500     {
2501         for (cg = cg0; cg < cg1; cg++)
2502         {
2503             bLocalCG[index_gl[cg]] = TRUE;
2504         }
2505     }
2506 }
2507
2508 static void make_dd_indices(gmx_domdec_t *dd,
2509                             const int *gcgs_index, int cg_start)
2510 {
2511     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2512     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2513     gmx_ga2la_t *ga2la;
2514     char        *bLocalCG;
2515     gmx_bool     bCGs;
2516
2517     bLocalCG = dd->comm->bLocalCG;
2518
2519     if (dd->nat_tot > dd->gatindex_nalloc)
2520     {
2521         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2522         srenew(dd->gatindex, dd->gatindex_nalloc);
2523     }
2524
2525     nzone      = dd->comm->zones.n;
2526     zone2cg    = dd->comm->zones.cg_range;
2527     zone_ncg1  = dd->comm->zone_ncg1;
2528     index_gl   = dd->index_gl;
2529     gatindex   = dd->gatindex;
2530     bCGs       = dd->comm->bCGs;
2531
2532     if (zone2cg[1] != dd->ncg_home)
2533     {
2534         gmx_incons("dd->ncg_zone is not up to date");
2535     }
2536
2537     /* Make the local to global and global to local atom index */
2538     a = dd->cgindex[cg_start];
2539     for (zone = 0; zone < nzone; zone++)
2540     {
2541         if (zone == 0)
2542         {
2543             cg0 = cg_start;
2544         }
2545         else
2546         {
2547             cg0 = zone2cg[zone];
2548         }
2549         cg1    = zone2cg[zone+1];
2550         cg1_p1 = cg0 + zone_ncg1[zone];
2551
2552         for (cg = cg0; cg < cg1; cg++)
2553         {
2554             zone1 = zone;
2555             if (cg >= cg1_p1)
2556             {
2557                 /* Signal that this cg is from more than one pulse away */
2558                 zone1 += nzone;
2559             }
2560             cg_gl = index_gl[cg];
2561             if (bCGs)
2562             {
2563                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2564                 {
2565                     gatindex[a] = a_gl;
2566                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2567                     a++;
2568                 }
2569             }
2570             else
2571             {
2572                 gatindex[a] = cg_gl;
2573                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2574                 a++;
2575             }
2576         }
2577     }
2578 }
2579
2580 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2581                           const char *where)
2582 {
2583     int ncg, i, ngl, nerr;
2584
2585     nerr = 0;
2586     if (bLocalCG == NULL)
2587     {
2588         return nerr;
2589     }
2590     for (i = 0; i < dd->ncg_tot; i++)
2591     {
2592         if (!bLocalCG[dd->index_gl[i]])
2593         {
2594             fprintf(stderr,
2595                     "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2596             nerr++;
2597         }
2598     }
2599     ngl = 0;
2600     for (i = 0; i < ncg_sys; i++)
2601     {
2602         if (bLocalCG[i])
2603         {
2604             ngl++;
2605         }
2606     }
2607     if (ngl != dd->ncg_tot)
2608     {
2609         fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2610         nerr++;
2611     }
2612
2613     return nerr;
2614 }
2615
2616 static void check_index_consistency(gmx_domdec_t *dd,
2617                                     int natoms_sys, int ncg_sys,
2618                                     const char *where)
2619 {
2620     int   nerr, ngl, i, a, cell;
2621     int  *have;
2622
2623     nerr = 0;
2624
2625     if (dd->comm->DD_debug > 1)
2626     {
2627         snew(have, natoms_sys);
2628         for (a = 0; a < dd->nat_tot; a++)
2629         {
2630             if (have[dd->gatindex[a]] > 0)
2631             {
2632                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2633             }
2634             else
2635             {
2636                 have[dd->gatindex[a]] = a + 1;
2637             }
2638         }
2639         sfree(have);
2640     }
2641
2642     snew(have, dd->nat_tot);
2643
2644     ngl  = 0;
2645     for (i = 0; i < natoms_sys; i++)
2646     {
2647         if (ga2la_get(dd->ga2la, i, &a, &cell))
2648         {
2649             if (a >= dd->nat_tot)
2650             {
2651                 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2652                 nerr++;
2653             }
2654             else
2655             {
2656                 have[a] = 1;
2657                 if (dd->gatindex[a] != i)
2658                 {
2659                     fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2660                     nerr++;
2661                 }
2662             }
2663             ngl++;
2664         }
2665     }
2666     if (ngl != dd->nat_tot)
2667     {
2668         fprintf(stderr,
2669                 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2670                 dd->rank, where, ngl, dd->nat_tot);
2671     }
2672     for (a = 0; a < dd->nat_tot; a++)
2673     {
2674         if (have[a] == 0)
2675         {
2676             fprintf(stderr,
2677                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
2678                     dd->rank, where, a+1, dd->gatindex[a]+1);
2679         }
2680     }
2681     sfree(have);
2682
2683     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2684
2685     if (nerr > 0)
2686     {
2687         gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2688                   dd->rank, where, nerr);
2689     }
2690 }
2691
2692 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2693 {
2694     int   i;
2695     char *bLocalCG;
2696
2697     if (a_start == 0)
2698     {
2699         /* Clear the whole list without searching */
2700         ga2la_clear(dd->ga2la);
2701     }
2702     else
2703     {
2704         for (i = a_start; i < dd->nat_tot; i++)
2705         {
2706             ga2la_del(dd->ga2la, dd->gatindex[i]);
2707         }
2708     }
2709
2710     bLocalCG = dd->comm->bLocalCG;
2711     if (bLocalCG)
2712     {
2713         for (i = cg_start; i < dd->ncg_tot; i++)
2714         {
2715             bLocalCG[dd->index_gl[i]] = FALSE;
2716         }
2717     }
2718
2719     dd_clear_local_vsite_indices(dd);
2720
2721     if (dd->constraints)
2722     {
2723         dd_clear_local_constraint_indices(dd);
2724     }
2725 }
2726
2727 /* This function should be used for moving the domain boudaries during DLB,
2728  * for obtaining the minimum cell size. It checks the initially set limit
2729  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2730  * and, possibly, a longer cut-off limit set for PME load balancing.
2731  */
2732 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2733 {
2734     real cellsize_min;
2735
2736     cellsize_min = comm->cellsize_min[dim];
2737
2738     if (!comm->bVacDLBNoLimit)
2739     {
2740         /* The cut-off might have changed, e.g. by PME load balacning,
2741          * from the value used to set comm->cellsize_min, so check it.
2742          */
2743         cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2744
2745         if (comm->bPMELoadBalDLBLimits)
2746         {
2747             /* Check for the cut-off limit set by the PME load balancing */
2748             cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2749         }
2750     }
2751
2752     return cellsize_min;
2753 }
2754
2755 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2756                             int dim_ind)
2757 {
2758     real grid_jump_limit;
2759
2760     /* The distance between the boundaries of cells at distance
2761      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2762      * and by the fact that cells should not be shifted by more than
2763      * half their size, such that cg's only shift by one cell
2764      * at redecomposition.
2765      */
2766     grid_jump_limit = comm->cellsize_limit;
2767     if (!comm->bVacDLBNoLimit)
2768     {
2769         if (comm->bPMELoadBalDLBLimits)
2770         {
2771             cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
2772         }
2773         grid_jump_limit = max(grid_jump_limit,
2774                               cutoff/comm->cd[dim_ind].np);
2775     }
2776
2777     return grid_jump_limit;
2778 }
2779
2780 static gmx_bool check_grid_jump(gmx_int64_t     step,
2781                                 gmx_domdec_t   *dd,
2782                                 real            cutoff,
2783                                 gmx_ddbox_t    *ddbox,
2784                                 gmx_bool        bFatal)
2785 {
2786     gmx_domdec_comm_t *comm;
2787     int                d, dim;
2788     real               limit, bfac;
2789     gmx_bool           bInvalid;
2790
2791     bInvalid = FALSE;
2792
2793     comm = dd->comm;
2794
2795     for (d = 1; d < dd->ndim; d++)
2796     {
2797         dim   = dd->dim[d];
2798         limit = grid_jump_limit(comm, cutoff, d);
2799         bfac  = ddbox->box_size[dim];
2800         if (ddbox->tric_dir[dim])
2801         {
2802             bfac *= ddbox->skew_fac[dim];
2803         }
2804         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2805                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2806         {
2807             bInvalid = TRUE;
2808
2809             if (bFatal)
2810             {
2811                 char buf[22];
2812
2813                 /* This error should never be triggered under normal
2814                  * circumstances, but you never know ...
2815                  */
2816                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2817                           gmx_step_str(step, buf),
2818                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2819             }
2820         }
2821     }
2822
2823     return bInvalid;
2824 }
2825
2826 static int dd_load_count(gmx_domdec_comm_t *comm)
2827 {
2828     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2829 }
2830
2831 static float dd_force_load(gmx_domdec_comm_t *comm)
2832 {
2833     float load;
2834
2835     if (comm->eFlop)
2836     {
2837         load = comm->flop;
2838         if (comm->eFlop > 1)
2839         {
2840             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2841         }
2842     }
2843     else
2844     {
2845         load = comm->cycl[ddCyclF];
2846         if (comm->cycl_n[ddCyclF] > 1)
2847         {
2848             /* Subtract the maximum of the last n cycle counts
2849              * to get rid of possible high counts due to other sources,
2850              * for instance system activity, that would otherwise
2851              * affect the dynamic load balancing.
2852              */
2853             load -= comm->cycl_max[ddCyclF];
2854         }
2855
2856 #ifdef GMX_MPI
2857         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2858         {
2859             float gpu_wait, gpu_wait_sum;
2860
2861             gpu_wait = comm->cycl[ddCyclWaitGPU];
2862             if (comm->cycl_n[ddCyclF] > 1)
2863             {
2864                 /* We should remove the WaitGPU time of the same MD step
2865                  * as the one with the maximum F time, since the F time
2866                  * and the wait time are not independent.
2867                  * Furthermore, the step for the max F time should be chosen
2868                  * the same on all ranks that share the same GPU.
2869                  * But to keep the code simple, we remove the average instead.
2870                  * The main reason for artificially long times at some steps
2871                  * is spurious CPU activity or MPI time, so we don't expect
2872                  * that changes in the GPU wait time matter a lot here.
2873                  */
2874                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2875             }
2876             /* Sum the wait times over the ranks that share the same GPU */
2877             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2878                           comm->mpi_comm_gpu_shared);
2879             /* Replace the wait time by the average over the ranks */
2880             load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2881         }
2882 #endif
2883     }
2884
2885     return load;
2886 }
2887
2888 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2889 {
2890     gmx_domdec_comm_t *comm;
2891     int                i;
2892
2893     comm = dd->comm;
2894
2895     snew(*dim_f, dd->nc[dim]+1);
2896     (*dim_f)[0] = 0;
2897     for (i = 1; i < dd->nc[dim]; i++)
2898     {
2899         if (comm->slb_frac[dim])
2900         {
2901             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2902         }
2903         else
2904         {
2905             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2906         }
2907     }
2908     (*dim_f)[dd->nc[dim]] = 1;
2909 }
2910
2911 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2912 {
2913     int  pmeindex, slab, nso, i;
2914     ivec xyz;
2915
2916     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2917     {
2918         ddpme->dim = YY;
2919     }
2920     else
2921     {
2922         ddpme->dim = dimind;
2923     }
2924     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2925
2926     ddpme->nslab = (ddpme->dim == 0 ?
2927                     dd->comm->npmenodes_x :
2928                     dd->comm->npmenodes_y);
2929
2930     if (ddpme->nslab <= 1)
2931     {
2932         return;
2933     }
2934
2935     nso = dd->comm->npmenodes/ddpme->nslab;
2936     /* Determine for each PME slab the PP location range for dimension dim */
2937     snew(ddpme->pp_min, ddpme->nslab);
2938     snew(ddpme->pp_max, ddpme->nslab);
2939     for (slab = 0; slab < ddpme->nslab; slab++)
2940     {
2941         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2942         ddpme->pp_max[slab] = 0;
2943     }
2944     for (i = 0; i < dd->nnodes; i++)
2945     {
2946         ddindex2xyz(dd->nc, i, xyz);
2947         /* For y only use our y/z slab.
2948          * This assumes that the PME x grid size matches the DD grid size.
2949          */
2950         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2951         {
2952             pmeindex = ddindex2pmeindex(dd, i);
2953             if (dimind == 0)
2954             {
2955                 slab = pmeindex/nso;
2956             }
2957             else
2958             {
2959                 slab = pmeindex % ddpme->nslab;
2960             }
2961             ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
2962             ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
2963         }
2964     }
2965
2966     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2967 }
2968
2969 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2970 {
2971     if (dd->comm->ddpme[0].dim == XX)
2972     {
2973         return dd->comm->ddpme[0].maxshift;
2974     }
2975     else
2976     {
2977         return 0;
2978     }
2979 }
2980
2981 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2982 {
2983     if (dd->comm->ddpme[0].dim == YY)
2984     {
2985         return dd->comm->ddpme[0].maxshift;
2986     }
2987     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2988     {
2989         return dd->comm->ddpme[1].maxshift;
2990     }
2991     else
2992     {
2993         return 0;
2994     }
2995 }
2996
2997 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2998                              gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
2999 {
3000     gmx_domdec_comm_t *comm;
3001     int                nc, ns, s;
3002     int               *xmin, *xmax;
3003     real               range, pme_boundary;
3004     int                sh;
3005
3006     comm = dd->comm;
3007     nc   = dd->nc[ddpme->dim];
3008     ns   = ddpme->nslab;
3009
3010     if (!ddpme->dim_match)
3011     {
3012         /* PP decomposition is not along dim: the worst situation */
3013         sh = ns/2;
3014     }
3015     else if (ns <= 3 || (bUniform && ns == nc))
3016     {
3017         /* The optimal situation */
3018         sh = 1;
3019     }
3020     else
3021     {
3022         /* We need to check for all pme nodes which nodes they
3023          * could possibly need to communicate with.
3024          */
3025         xmin = ddpme->pp_min;
3026         xmax = ddpme->pp_max;
3027         /* Allow for atoms to be maximally 2/3 times the cut-off
3028          * out of their DD cell. This is a reasonable balance between
3029          * between performance and support for most charge-group/cut-off
3030          * combinations.
3031          */
3032         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
3033         /* Avoid extra communication when we are exactly at a boundary */
3034         range *= 0.999;
3035
3036         sh = 1;
3037         for (s = 0; s < ns; s++)
3038         {
3039             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
3040             pme_boundary = (real)s/ns;
3041             while (sh+1 < ns &&
3042                    ((s-(sh+1) >= 0 &&
3043                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
3044                     (s-(sh+1) <  0 &&
3045                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
3046             {
3047                 sh++;
3048             }
3049             pme_boundary = (real)(s+1)/ns;
3050             while (sh+1 < ns &&
3051                    ((s+(sh+1) <  ns &&
3052                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
3053                     (s+(sh+1) >= ns &&
3054                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
3055             {
3056                 sh++;
3057             }
3058         }
3059     }
3060
3061     ddpme->maxshift = sh;
3062
3063     if (debug)
3064     {
3065         fprintf(debug, "PME slab communication range for dim %d is %d\n",
3066                 ddpme->dim, ddpme->maxshift);
3067     }
3068 }
3069
3070 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3071 {
3072     int d, dim;
3073
3074     for (d = 0; d < dd->ndim; d++)
3075     {
3076         dim = dd->dim[d];
3077         if (dim < ddbox->nboundeddim &&
3078             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3079             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3080         {
3081             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3082                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3083                       dd->nc[dim], dd->comm->cellsize_limit);
3084         }
3085     }
3086 }
3087
3088 enum {
3089     setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
3090 };
3091
3092 /* Set the domain boundaries. Use for static (or no) load balancing,
3093  * and also for the starting state for dynamic load balancing.
3094  * setmode determine if and where the boundaries are stored, use enum above.
3095  * Returns the number communication pulses in npulse.
3096  */
3097 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3098                                   int setmode, ivec npulse)
3099 {
3100     gmx_domdec_comm_t *comm;
3101     int                d, j;
3102     rvec               cellsize_min;
3103     real              *cell_x, cell_dx, cellsize;
3104
3105     comm = dd->comm;
3106
3107     for (d = 0; d < DIM; d++)
3108     {
3109         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3110         npulse[d]       = 1;
3111         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3112         {
3113             /* Uniform grid */
3114             cell_dx = ddbox->box_size[d]/dd->nc[d];
3115             switch (setmode)
3116             {
3117                 case setcellsizeslbMASTER:
3118                     for (j = 0; j < dd->nc[d]+1; j++)
3119                     {
3120                         dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3121                     }
3122                     break;
3123                 case setcellsizeslbLOCAL:
3124                     comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3125                     comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3126                     break;
3127                 default:
3128                     break;
3129             }
3130             cellsize = cell_dx*ddbox->skew_fac[d];
3131             while (cellsize*npulse[d] < comm->cutoff)
3132             {
3133                 npulse[d]++;
3134             }
3135             cellsize_min[d] = cellsize;
3136         }
3137         else
3138         {
3139             /* Statically load balanced grid */
3140             /* Also when we are not doing a master distribution we determine
3141              * all cell borders in a loop to obtain identical values
3142              * to the master distribution case and to determine npulse.
3143              */
3144             if (setmode == setcellsizeslbMASTER)
3145             {
3146                 cell_x = dd->ma->cell_x[d];
3147             }
3148             else
3149             {
3150                 snew(cell_x, dd->nc[d]+1);
3151             }
3152             cell_x[0] = ddbox->box0[d];
3153             for (j = 0; j < dd->nc[d]; j++)
3154             {
3155                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3156                 cell_x[j+1] = cell_x[j] + cell_dx;
3157                 cellsize    = cell_dx*ddbox->skew_fac[d];
3158                 while (cellsize*npulse[d] < comm->cutoff &&
3159                        npulse[d] < dd->nc[d]-1)
3160                 {
3161                     npulse[d]++;
3162                 }
3163                 cellsize_min[d] = min(cellsize_min[d], cellsize);
3164             }
3165             if (setmode == setcellsizeslbLOCAL)
3166             {
3167                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3168                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3169             }
3170             if (setmode != setcellsizeslbMASTER)
3171             {
3172                 sfree(cell_x);
3173             }
3174         }
3175         /* The following limitation is to avoid that a cell would receive
3176          * some of its own home charge groups back over the periodic boundary.
3177          * Double charge groups cause trouble with the global indices.
3178          */
3179         if (d < ddbox->npbcdim &&
3180             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3181         {
3182             char error_string[STRLEN];
3183
3184             sprintf(error_string,
3185                     "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3186                     dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3187                     comm->cutoff,
3188                     dd->nc[d], dd->nc[d],
3189                     dd->nnodes > dd->nc[d] ? "cells" : "ranks");
3190
3191             if (setmode == setcellsizeslbLOCAL)
3192             {
3193                 gmx_fatal_collective(FARGS, NULL, dd, error_string);
3194             }
3195             else
3196             {
3197                 gmx_fatal(FARGS, error_string);
3198             }
3199         }
3200     }
3201
3202     if (!comm->bDynLoadBal)
3203     {
3204         copy_rvec(cellsize_min, comm->cellsize_min);
3205     }
3206
3207     for (d = 0; d < comm->npmedecompdim; d++)
3208     {
3209         set_pme_maxshift(dd, &comm->ddpme[d],
3210                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3211                          comm->ddpme[d].slb_dim_f);
3212     }
3213 }
3214
3215
3216 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3217                                                   int d, int dim, gmx_domdec_root_t *root,
3218                                                   gmx_ddbox_t *ddbox,
3219                                                   gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
3220 {
3221     gmx_domdec_comm_t *comm;
3222     int                ncd, i, j, nmin, nmin_old;
3223     gmx_bool           bLimLo, bLimHi;
3224     real              *cell_size;
3225     real               fac, halfway, cellsize_limit_f_i, region_size;
3226     gmx_bool           bPBC, bLastHi = FALSE;
3227     int                nrange[] = {range[0], range[1]};
3228
3229     region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
3230
3231     comm = dd->comm;
3232
3233     ncd = dd->nc[dim];
3234
3235     bPBC = (dim < ddbox->npbcdim);
3236
3237     cell_size = root->buf_ncd;
3238
3239     if (debug)
3240     {
3241         fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
3242     }
3243
3244     /* First we need to check if the scaling does not make cells
3245      * smaller than the smallest allowed size.
3246      * We need to do this iteratively, since if a cell is too small,
3247      * it needs to be enlarged, which makes all the other cells smaller,
3248      * which could in turn make another cell smaller than allowed.
3249      */
3250     for (i = range[0]; i < range[1]; i++)
3251     {
3252         root->bCellMin[i] = FALSE;
3253     }
3254     nmin = 0;
3255     do
3256     {
3257         nmin_old = nmin;
3258         /* We need the total for normalization */
3259         fac = 0;
3260         for (i = range[0]; i < range[1]; i++)
3261         {
3262             if (root->bCellMin[i] == FALSE)
3263             {
3264                 fac += cell_size[i];
3265             }
3266         }
3267         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3268         /* Determine the cell boundaries */
3269         for (i = range[0]; i < range[1]; i++)
3270         {
3271             if (root->bCellMin[i] == FALSE)
3272             {
3273                 cell_size[i] *= fac;
3274                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3275                 {
3276                     cellsize_limit_f_i = 0;
3277                 }
3278                 else
3279                 {
3280                     cellsize_limit_f_i = cellsize_limit_f;
3281                 }
3282                 if (cell_size[i] < cellsize_limit_f_i)
3283                 {
3284                     root->bCellMin[i] = TRUE;
3285                     cell_size[i]      = cellsize_limit_f_i;
3286                     nmin++;
3287                 }
3288             }
3289             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3290         }
3291     }
3292     while (nmin > nmin_old);
3293
3294     i            = range[1]-1;
3295     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3296     /* For this check we should not use DD_CELL_MARGIN,
3297      * but a slightly smaller factor,
3298      * since rounding could get use below the limit.
3299      */
3300     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3301     {
3302         char buf[22];
3303         gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3304                   gmx_step_str(step, buf),
3305                   dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3306                   ncd, comm->cellsize_min[dim]);
3307     }
3308
3309     root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3310
3311     if (!bUniform)
3312     {
3313         /* Check if the boundary did not displace more than halfway
3314          * each of the cells it bounds, as this could cause problems,
3315          * especially when the differences between cell sizes are large.
3316          * If changes are applied, they will not make cells smaller
3317          * than the cut-off, as we check all the boundaries which
3318          * might be affected by a change and if the old state was ok,
3319          * the cells will at most be shrunk back to their old size.
3320          */
3321         for (i = range[0]+1; i < range[1]; i++)
3322         {
3323             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3324             if (root->cell_f[i] < halfway)
3325             {
3326                 root->cell_f[i] = halfway;
3327                 /* Check if the change also causes shifts of the next boundaries */
3328                 for (j = i+1; j < range[1]; j++)
3329                 {
3330                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3331                     {
3332                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3333                     }
3334                 }
3335             }
3336             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3337             if (root->cell_f[i] > halfway)
3338             {
3339                 root->cell_f[i] = halfway;
3340                 /* Check if the change also causes shifts of the next boundaries */
3341                 for (j = i-1; j >= range[0]+1; j--)
3342                 {
3343                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3344                     {
3345                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3346                     }
3347                 }
3348             }
3349         }
3350     }
3351
3352     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3353     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3354      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3355      * for a and b nrange is used */
3356     if (d > 0)
3357     {
3358         /* Take care of the staggering of the cell boundaries */
3359         if (bUniform)
3360         {
3361             for (i = range[0]; i < range[1]; i++)
3362             {
3363                 root->cell_f_max0[i] = root->cell_f[i];
3364                 root->cell_f_min1[i] = root->cell_f[i+1];
3365             }
3366         }
3367         else
3368         {
3369             for (i = range[0]+1; i < range[1]; i++)
3370             {
3371                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3372                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3373                 if (bLimLo && bLimHi)
3374                 {
3375                     /* Both limits violated, try the best we can */
3376                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3377                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3378                     nrange[0]       = range[0];
3379                     nrange[1]       = i;
3380                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3381
3382                     nrange[0] = i;
3383                     nrange[1] = range[1];
3384                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3385
3386                     return;
3387                 }
3388                 else if (bLimLo)
3389                 {
3390                     /* root->cell_f[i] = root->bound_min[i]; */
3391                     nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3392                     bLastHi   = FALSE;
3393                 }
3394                 else if (bLimHi && !bLastHi)
3395                 {
3396                     bLastHi = TRUE;
3397                     if (nrange[1] < range[1])   /* found a LimLo before */
3398                     {
3399                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3400                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3401                         nrange[0] = nrange[1];
3402                     }
3403                     root->cell_f[i] = root->bound_max[i];
3404                     nrange[1]       = i;
3405                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3406                     nrange[0] = i;
3407                     nrange[1] = range[1];
3408                 }
3409             }
3410             if (nrange[1] < range[1])   /* found last a LimLo */
3411             {
3412                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3413                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3414                 nrange[0] = nrange[1];
3415                 nrange[1] = range[1];
3416                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3417             }
3418             else if (nrange[0] > range[0]) /* found at least one LimHi */
3419             {
3420                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3421             }
3422         }
3423     }
3424 }
3425
3426
3427 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3428                                        int d, int dim, gmx_domdec_root_t *root,
3429                                        gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3430                                        gmx_bool bUniform, gmx_int64_t step)
3431 {
3432     gmx_domdec_comm_t *comm;
3433     int                ncd, d1, i, j, pos;
3434     real              *cell_size;
3435     real               load_aver, load_i, imbalance, change, change_max, sc;
3436     real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3437     real               change_limit;
3438     real               relax = 0.5;
3439     gmx_bool           bPBC;
3440     int                range[] = { 0, 0 };
3441
3442     comm = dd->comm;
3443
3444     /* Convert the maximum change from the input percentage to a fraction */
3445     change_limit = comm->dlb_scale_lim*0.01;
3446
3447     ncd = dd->nc[dim];
3448
3449     bPBC = (dim < ddbox->npbcdim);
3450
3451     cell_size = root->buf_ncd;
3452
3453     /* Store the original boundaries */
3454     for (i = 0; i < ncd+1; i++)
3455     {
3456         root->old_cell_f[i] = root->cell_f[i];
3457     }
3458     if (bUniform)
3459     {
3460         for (i = 0; i < ncd; i++)
3461         {
3462             cell_size[i] = 1.0/ncd;
3463         }
3464     }
3465     else if (dd_load_count(comm))
3466     {
3467         load_aver  = comm->load[d].sum_m/ncd;
3468         change_max = 0;
3469         for (i = 0; i < ncd; i++)
3470         {
3471             /* Determine the relative imbalance of cell i */
3472             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3473             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3474             /* Determine the change of the cell size using underrelaxation */
3475             change     = -relax*imbalance;
3476             change_max = max(change_max, max(change, -change));
3477         }
3478         /* Limit the amount of scaling.
3479          * We need to use the same rescaling for all cells in one row,
3480          * otherwise the load balancing might not converge.
3481          */
3482         sc = relax;
3483         if (change_max > change_limit)
3484         {
3485             sc *= change_limit/change_max;
3486         }
3487         for (i = 0; i < ncd; i++)
3488         {
3489             /* Determine the relative imbalance of cell i */
3490             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3491             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3492             /* Determine the change of the cell size using underrelaxation */
3493             change       = -sc*imbalance;
3494             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3495         }
3496     }
3497
3498     cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3499     cellsize_limit_f *= DD_CELL_MARGIN;
3500     dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3501     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3502     if (ddbox->tric_dir[dim])
3503     {
3504         cellsize_limit_f /= ddbox->skew_fac[dim];
3505         dist_min_f       /= ddbox->skew_fac[dim];
3506     }
3507     if (bDynamicBox && d > 0)
3508     {
3509         dist_min_f *= DD_PRES_SCALE_MARGIN;
3510     }
3511     if (d > 0 && !bUniform)
3512     {
3513         /* Make sure that the grid is not shifted too much */
3514         for (i = 1; i < ncd; i++)
3515         {
3516             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3517             {
3518                 gmx_incons("Inconsistent DD boundary staggering limits!");
3519             }
3520             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3521             space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3522             if (space > 0)
3523             {
3524                 root->bound_min[i] += 0.5*space;
3525             }
3526             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3527             space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3528             if (space < 0)
3529             {
3530                 root->bound_max[i] += 0.5*space;
3531             }
3532             if (debug)
3533             {
3534                 fprintf(debug,
3535                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3536                         d, i,
3537                         root->cell_f_max0[i-1] + dist_min_f,
3538                         root->bound_min[i], root->cell_f[i], root->bound_max[i],
3539                         root->cell_f_min1[i] - dist_min_f);
3540             }
3541         }
3542     }
3543     range[1]          = ncd;
3544     root->cell_f[0]   = 0;
3545     root->cell_f[ncd] = 1;
3546     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3547
3548
3549     /* After the checks above, the cells should obey the cut-off
3550      * restrictions, but it does not hurt to check.
3551      */
3552     for (i = 0; i < ncd; i++)
3553     {
3554         if (debug)
3555         {
3556             fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
3557                     dim, i, root->cell_f[i], root->cell_f[i+1]);
3558         }
3559
3560         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3561             root->cell_f[i+1] - root->cell_f[i] <
3562             cellsize_limit_f/DD_CELL_MARGIN)
3563         {
3564             char buf[22];
3565             fprintf(stderr,
3566                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3567                     gmx_step_str(step, buf), dim2char(dim), i,
3568                     (root->cell_f[i+1] - root->cell_f[i])
3569                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3570         }
3571     }
3572
3573     pos = ncd + 1;
3574     /* Store the cell boundaries of the lower dimensions at the end */
3575     for (d1 = 0; d1 < d; d1++)
3576     {
3577         root->cell_f[pos++] = comm->cell_f0[d1];
3578         root->cell_f[pos++] = comm->cell_f1[d1];
3579     }
3580
3581     if (d < comm->npmedecompdim)
3582     {
3583         /* The master determines the maximum shift for
3584          * the coordinate communication between separate PME nodes.
3585          */
3586         set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3587     }
3588     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3589     if (d >= 1)
3590     {
3591         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3592     }
3593 }
3594
3595 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3596                                              gmx_ddbox_t *ddbox, int dimind)
3597 {
3598     gmx_domdec_comm_t *comm;
3599     int                dim;
3600
3601     comm = dd->comm;
3602
3603     /* Set the cell dimensions */
3604     dim                = dd->dim[dimind];
3605     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3606     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3607     if (dim >= ddbox->nboundeddim)
3608     {
3609         comm->cell_x0[dim] += ddbox->box0[dim];
3610         comm->cell_x1[dim] += ddbox->box0[dim];
3611     }
3612 }
3613
3614 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3615                                          int d, int dim, real *cell_f_row,
3616                                          gmx_ddbox_t *ddbox)
3617 {
3618     gmx_domdec_comm_t *comm;
3619     int                d1, dim1, pos;
3620
3621     comm = dd->comm;
3622
3623 #ifdef GMX_MPI
3624     /* Each node would only need to know two fractions,
3625      * but it is probably cheaper to broadcast the whole array.
3626      */
3627     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3628               0, comm->mpi_comm_load[d]);
3629 #endif
3630     /* Copy the fractions for this dimension from the buffer */
3631     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3632     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3633     /* The whole array was communicated, so set the buffer position */
3634     pos = dd->nc[dim] + 1;
3635     for (d1 = 0; d1 <= d; d1++)
3636     {
3637         if (d1 < d)
3638         {
3639             /* Copy the cell fractions of the lower dimensions */
3640             comm->cell_f0[d1] = cell_f_row[pos++];
3641             comm->cell_f1[d1] = cell_f_row[pos++];
3642         }
3643         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3644     }
3645     /* Convert the communicated shift from float to int */
3646     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3647     if (d >= 1)
3648     {
3649         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3650     }
3651 }
3652
3653 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3654                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3655                                          gmx_bool bUniform, gmx_int64_t step)
3656 {
3657     gmx_domdec_comm_t *comm;
3658     int                d, dim, d1;
3659     gmx_bool           bRowMember, bRowRoot;
3660     real              *cell_f_row;
3661
3662     comm = dd->comm;
3663
3664     for (d = 0; d < dd->ndim; d++)
3665     {
3666         dim        = dd->dim[d];
3667         bRowMember = TRUE;
3668         bRowRoot   = TRUE;
3669         for (d1 = d; d1 < dd->ndim; d1++)
3670         {
3671             if (dd->ci[dd->dim[d1]] > 0)
3672             {
3673                 if (d1 != d)
3674                 {
3675                     bRowMember = FALSE;
3676                 }
3677                 bRowRoot = FALSE;
3678             }
3679         }
3680         if (bRowMember)
3681         {
3682             if (bRowRoot)
3683             {
3684                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3685                                            ddbox, bDynamicBox, bUniform, step);
3686                 cell_f_row = comm->root[d]->cell_f;
3687             }
3688             else
3689             {
3690                 cell_f_row = comm->cell_f_row;
3691             }
3692             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3693         }
3694     }
3695 }
3696
3697 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3698 {
3699     int d;
3700
3701     /* This function assumes the box is static and should therefore
3702      * not be called when the box has changed since the last
3703      * call to dd_partition_system.
3704      */
3705     for (d = 0; d < dd->ndim; d++)
3706     {
3707         relative_to_absolute_cell_bounds(dd, ddbox, d);
3708     }
3709 }
3710
3711
3712
3713 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3714                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3715                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3716                                   gmx_wallcycle_t wcycle)
3717 {
3718     gmx_domdec_comm_t *comm;
3719     int                dim;
3720
3721     comm = dd->comm;
3722
3723     if (bDoDLB)
3724     {
3725         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3726         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3727         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3728     }
3729     else if (bDynamicBox)
3730     {
3731         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3732     }
3733
3734     /* Set the dimensions for which no DD is used */
3735     for (dim = 0; dim < DIM; dim++)
3736     {
3737         if (dd->nc[dim] == 1)
3738         {
3739             comm->cell_x0[dim] = 0;
3740             comm->cell_x1[dim] = ddbox->box_size[dim];
3741             if (dim >= ddbox->nboundeddim)
3742             {
3743                 comm->cell_x0[dim] += ddbox->box0[dim];
3744                 comm->cell_x1[dim] += ddbox->box0[dim];
3745             }
3746         }
3747     }
3748 }
3749
3750 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3751 {
3752     int                    d, np, i;
3753     gmx_domdec_comm_dim_t *cd;
3754
3755     for (d = 0; d < dd->ndim; d++)
3756     {
3757         cd = &dd->comm->cd[d];
3758         np = npulse[dd->dim[d]];
3759         if (np > cd->np_nalloc)
3760         {
3761             if (debug)
3762             {
3763                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3764                         dim2char(dd->dim[d]), np);
3765             }
3766             if (DDMASTER(dd) && cd->np_nalloc > 0)
3767             {
3768                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3769             }
3770             srenew(cd->ind, np);
3771             for (i = cd->np_nalloc; i < np; i++)
3772             {
3773                 cd->ind[i].index  = NULL;
3774                 cd->ind[i].nalloc = 0;
3775             }
3776             cd->np_nalloc = np;
3777         }
3778         cd->np = np;
3779     }
3780 }
3781
3782
3783 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3784                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3785                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3786                               gmx_wallcycle_t wcycle)
3787 {
3788     gmx_domdec_comm_t *comm;
3789     int                d;
3790     ivec               npulse;
3791
3792     comm = dd->comm;
3793
3794     /* Copy the old cell boundaries for the cg displacement check */
3795     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3796     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3797
3798     if (comm->bDynLoadBal)
3799     {
3800         if (DDMASTER(dd))
3801         {
3802             check_box_size(dd, ddbox);
3803         }
3804         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3805     }
3806     else
3807     {
3808         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3809         realloc_comm_ind(dd, npulse);
3810     }
3811
3812     if (debug)
3813     {
3814         for (d = 0; d < DIM; d++)
3815         {
3816             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3817                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3818         }
3819     }
3820 }
3821
3822 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3823                                   gmx_ddbox_t *ddbox,
3824                                   rvec cell_ns_x0, rvec cell_ns_x1,
3825                                   gmx_int64_t step)
3826 {
3827     gmx_domdec_comm_t *comm;
3828     int                dim_ind, dim;
3829
3830     comm = dd->comm;
3831
3832     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3833     {
3834         dim = dd->dim[dim_ind];
3835
3836         /* Without PBC we don't have restrictions on the outer cells */
3837         if (!(dim >= ddbox->npbcdim &&
3838               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3839             comm->bDynLoadBal &&
3840             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3841             comm->cellsize_min[dim])
3842         {
3843             char buf[22];
3844             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3845                       gmx_step_str(step, buf), dim2char(dim),
3846                       comm->cell_x1[dim] - comm->cell_x0[dim],
3847                       ddbox->skew_fac[dim],
3848                       dd->comm->cellsize_min[dim],
3849                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3850         }
3851     }
3852
3853     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3854     {
3855         /* Communicate the boundaries and update cell_ns_x0/1 */
3856         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3857         if (dd->bGridJump && dd->ndim > 1)
3858         {
3859             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3860         }
3861     }
3862 }
3863
3864 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3865 {
3866     if (YY < npbcdim)
3867     {
3868         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3869     }
3870     else
3871     {
3872         tcm[YY][XX] = 0;
3873     }
3874     if (ZZ < npbcdim)
3875     {
3876         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3877         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3878     }
3879     else
3880     {
3881         tcm[ZZ][XX] = 0;
3882         tcm[ZZ][YY] = 0;
3883     }
3884 }
3885
3886 static void check_screw_box(matrix box)
3887 {
3888     /* Mathematical limitation */
3889     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3890     {
3891         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3892     }
3893
3894     /* Limitation due to the asymmetry of the eighth shell method */
3895     if (box[ZZ][YY] != 0)
3896     {
3897         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3898     }
3899 }
3900
3901 static void distribute_cg(FILE *fplog, gmx_int64_t step,
3902                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3903                           gmx_domdec_t *dd)
3904 {
3905     gmx_domdec_master_t *ma;
3906     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3907     int                  i, icg, j, k, k0, k1, d, npbcdim;
3908     matrix               tcm;
3909     rvec                 box_size, cg_cm;
3910     ivec                 ind;
3911     real                 nrcg, inv_ncg, pos_d;
3912     atom_id             *cgindex;
3913     gmx_bool             bUnbounded, bScrew;
3914
3915     ma = dd->ma;
3916
3917     if (tmp_ind == NULL)
3918     {
3919         snew(tmp_nalloc, dd->nnodes);
3920         snew(tmp_ind, dd->nnodes);
3921         for (i = 0; i < dd->nnodes; i++)
3922         {
3923             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3924             snew(tmp_ind[i], tmp_nalloc[i]);
3925         }
3926     }
3927
3928     /* Clear the count */
3929     for (i = 0; i < dd->nnodes; i++)
3930     {
3931         ma->ncg[i] = 0;
3932         ma->nat[i] = 0;
3933     }
3934
3935     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3936
3937     cgindex = cgs->index;
3938
3939     /* Compute the center of geometry for all charge groups */
3940     for (icg = 0; icg < cgs->nr; icg++)
3941     {
3942         k0      = cgindex[icg];
3943         k1      = cgindex[icg+1];
3944         nrcg    = k1 - k0;
3945         if (nrcg == 1)
3946         {
3947             copy_rvec(pos[k0], cg_cm);
3948         }
3949         else
3950         {
3951             inv_ncg = 1.0/nrcg;
3952
3953             clear_rvec(cg_cm);
3954             for (k = k0; (k < k1); k++)
3955             {
3956                 rvec_inc(cg_cm, pos[k]);
3957             }
3958             for (d = 0; (d < DIM); d++)
3959             {
3960                 cg_cm[d] *= inv_ncg;
3961             }
3962         }
3963         /* Put the charge group in the box and determine the cell index */
3964         for (d = DIM-1; d >= 0; d--)
3965         {
3966             pos_d = cg_cm[d];
3967             if (d < dd->npbcdim)
3968             {
3969                 bScrew = (dd->bScrewPBC && d == XX);
3970                 if (tric_dir[d] && dd->nc[d] > 1)
3971                 {
3972                     /* Use triclinic coordintates for this dimension */
3973                     for (j = d+1; j < DIM; j++)
3974                     {
3975                         pos_d += cg_cm[j]*tcm[j][d];
3976                     }
3977                 }
3978                 while (pos_d >= box[d][d])
3979                 {
3980                     pos_d -= box[d][d];
3981                     rvec_dec(cg_cm, box[d]);
3982                     if (bScrew)
3983                     {
3984                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3985                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3986                     }
3987                     for (k = k0; (k < k1); k++)
3988                     {
3989                         rvec_dec(pos[k], box[d]);
3990                         if (bScrew)
3991                         {
3992                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3993                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3994                         }
3995                     }
3996                 }
3997                 while (pos_d < 0)
3998                 {
3999                     pos_d += box[d][d];
4000                     rvec_inc(cg_cm, box[d]);
4001                     if (bScrew)
4002                     {
4003                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
4004                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
4005                     }
4006                     for (k = k0; (k < k1); k++)
4007                     {
4008                         rvec_inc(pos[k], box[d]);
4009                         if (bScrew)
4010                         {
4011                             pos[k][YY] = box[YY][YY] - pos[k][YY];
4012                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
4013                         }
4014                     }
4015                 }
4016             }
4017             /* This could be done more efficiently */
4018             ind[d] = 0;
4019             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
4020             {
4021                 ind[d]++;
4022             }
4023         }
4024         i = dd_index(dd->nc, ind);
4025         if (ma->ncg[i] == tmp_nalloc[i])
4026         {
4027             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
4028             srenew(tmp_ind[i], tmp_nalloc[i]);
4029         }
4030         tmp_ind[i][ma->ncg[i]] = icg;
4031         ma->ncg[i]++;
4032         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
4033     }
4034
4035     k1 = 0;
4036     for (i = 0; i < dd->nnodes; i++)
4037     {
4038         ma->index[i] = k1;
4039         for (k = 0; k < ma->ncg[i]; k++)
4040         {
4041             ma->cg[k1++] = tmp_ind[i][k];
4042         }
4043     }
4044     ma->index[dd->nnodes] = k1;
4045
4046     for (i = 0; i < dd->nnodes; i++)
4047     {
4048         sfree(tmp_ind[i]);
4049     }
4050     sfree(tmp_ind);
4051     sfree(tmp_nalloc);
4052
4053     if (fplog)
4054     {
4055         char buf[22];
4056         fprintf(fplog, "Charge group distribution at step %s:",
4057                 gmx_step_str(step, buf));
4058         for (i = 0; i < dd->nnodes; i++)
4059         {
4060             fprintf(fplog, " %d", ma->ncg[i]);
4061         }
4062         fprintf(fplog, "\n");
4063     }
4064 }
4065
4066 static void get_cg_distribution(FILE *fplog, gmx_int64_t step, gmx_domdec_t *dd,
4067                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
4068                                 rvec pos[])
4069 {
4070     gmx_domdec_master_t *ma = NULL;
4071     ivec                 npulse;
4072     int                  i, cg_gl;
4073     int                 *ibuf, buf2[2] = { 0, 0 };
4074     gmx_bool             bMaster = DDMASTER(dd);
4075
4076     if (bMaster)
4077     {
4078         ma = dd->ma;
4079
4080         if (dd->bScrewPBC)
4081         {
4082             check_screw_box(box);
4083         }
4084
4085         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
4086
4087         distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
4088         for (i = 0; i < dd->nnodes; i++)
4089         {
4090             ma->ibuf[2*i]   = ma->ncg[i];
4091             ma->ibuf[2*i+1] = ma->nat[i];
4092         }
4093         ibuf = ma->ibuf;
4094     }
4095     else
4096     {
4097         ibuf = NULL;
4098     }
4099     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4100
4101     dd->ncg_home = buf2[0];
4102     dd->nat_home = buf2[1];
4103     dd->ncg_tot  = dd->ncg_home;
4104     dd->nat_tot  = dd->nat_home;
4105     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4106     {
4107         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4108         srenew(dd->index_gl, dd->cg_nalloc);
4109         srenew(dd->cgindex, dd->cg_nalloc+1);
4110     }
4111     if (bMaster)
4112     {
4113         for (i = 0; i < dd->nnodes; i++)
4114         {
4115             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4116             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4117         }
4118     }
4119
4120     dd_scatterv(dd,
4121                 DDMASTER(dd) ? ma->ibuf : NULL,
4122                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4123                 DDMASTER(dd) ? ma->cg : NULL,
4124                 dd->ncg_home*sizeof(int), dd->index_gl);
4125
4126     /* Determine the home charge group sizes */
4127     dd->cgindex[0] = 0;
4128     for (i = 0; i < dd->ncg_home; i++)
4129     {
4130         cg_gl            = dd->index_gl[i];
4131         dd->cgindex[i+1] =
4132             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4133     }
4134
4135     if (debug)
4136     {
4137         fprintf(debug, "Home charge groups:\n");
4138         for (i = 0; i < dd->ncg_home; i++)
4139         {
4140             fprintf(debug, " %d", dd->index_gl[i]);
4141             if (i % 10 == 9)
4142             {
4143                 fprintf(debug, "\n");
4144             }
4145         }
4146         fprintf(debug, "\n");
4147     }
4148 }
4149
4150 static int compact_and_copy_vec_at(int ncg, int *move,
4151                                    int *cgindex,
4152                                    int nvec, int vec,
4153                                    rvec *src, gmx_domdec_comm_t *comm,
4154                                    gmx_bool bCompact)
4155 {
4156     int m, icg, i, i0, i1, nrcg;
4157     int home_pos;
4158     int pos_vec[DIM*2];
4159
4160     home_pos = 0;
4161
4162     for (m = 0; m < DIM*2; m++)
4163     {
4164         pos_vec[m] = 0;
4165     }
4166
4167     i0 = 0;
4168     for (icg = 0; icg < ncg; icg++)
4169     {
4170         i1 = cgindex[icg+1];
4171         m  = move[icg];
4172         if (m == -1)
4173         {
4174             if (bCompact)
4175             {
4176                 /* Compact the home array in place */
4177                 for (i = i0; i < i1; i++)
4178                 {
4179                     copy_rvec(src[i], src[home_pos++]);
4180                 }
4181             }
4182         }
4183         else
4184         {
4185             /* Copy to the communication buffer */
4186             nrcg        = i1 - i0;
4187             pos_vec[m] += 1 + vec*nrcg;
4188             for (i = i0; i < i1; i++)
4189             {
4190                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4191             }
4192             pos_vec[m] += (nvec - vec - 1)*nrcg;
4193         }
4194         if (!bCompact)
4195         {
4196             home_pos += i1 - i0;
4197         }
4198         i0 = i1;
4199     }
4200
4201     return home_pos;
4202 }
4203
4204 static int compact_and_copy_vec_cg(int ncg, int *move,
4205                                    int *cgindex,
4206                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4207                                    gmx_bool bCompact)
4208 {
4209     int m, icg, i0, i1, nrcg;
4210     int home_pos;
4211     int pos_vec[DIM*2];
4212
4213     home_pos = 0;
4214
4215     for (m = 0; m < DIM*2; m++)
4216     {
4217         pos_vec[m] = 0;
4218     }
4219
4220     i0 = 0;
4221     for (icg = 0; icg < ncg; icg++)
4222     {
4223         i1 = cgindex[icg+1];
4224         m  = move[icg];
4225         if (m == -1)
4226         {
4227             if (bCompact)
4228             {
4229                 /* Compact the home array in place */
4230                 copy_rvec(src[icg], src[home_pos++]);
4231             }
4232         }
4233         else
4234         {
4235             nrcg = i1 - i0;
4236             /* Copy to the communication buffer */
4237             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4238             pos_vec[m] += 1 + nrcg*nvec;
4239         }
4240         i0 = i1;
4241     }
4242     if (!bCompact)
4243     {
4244         home_pos = ncg;
4245     }
4246
4247     return home_pos;
4248 }
4249
4250 static int compact_ind(int ncg, int *move,
4251                        int *index_gl, int *cgindex,
4252                        int *gatindex,
4253                        gmx_ga2la_t ga2la, char *bLocalCG,
4254                        int *cginfo)
4255 {
4256     int cg, nat, a0, a1, a, a_gl;
4257     int home_pos;
4258
4259     home_pos = 0;
4260     nat      = 0;
4261     for (cg = 0; cg < ncg; cg++)
4262     {
4263         a0 = cgindex[cg];
4264         a1 = cgindex[cg+1];
4265         if (move[cg] == -1)
4266         {
4267             /* Compact the home arrays in place.
4268              * Anything that can be done here avoids access to global arrays.
4269              */
4270             cgindex[home_pos] = nat;
4271             for (a = a0; a < a1; a++)
4272             {
4273                 a_gl          = gatindex[a];
4274                 gatindex[nat] = a_gl;
4275                 /* The cell number stays 0, so we don't need to set it */
4276                 ga2la_change_la(ga2la, a_gl, nat);
4277                 nat++;
4278             }
4279             index_gl[home_pos] = index_gl[cg];
4280             cginfo[home_pos]   = cginfo[cg];
4281             /* The charge group remains local, so bLocalCG does not change */
4282             home_pos++;
4283         }
4284         else
4285         {
4286             /* Clear the global indices */
4287             for (a = a0; a < a1; a++)
4288             {
4289                 ga2la_del(ga2la, gatindex[a]);
4290             }
4291             if (bLocalCG)
4292             {
4293                 bLocalCG[index_gl[cg]] = FALSE;
4294             }
4295         }
4296     }
4297     cgindex[home_pos] = nat;
4298
4299     return home_pos;
4300 }
4301
4302 static void clear_and_mark_ind(int ncg, int *move,
4303                                int *index_gl, int *cgindex, int *gatindex,
4304                                gmx_ga2la_t ga2la, char *bLocalCG,
4305                                int *cell_index)
4306 {
4307     int cg, a0, a1, a;
4308
4309     for (cg = 0; cg < ncg; cg++)
4310     {
4311         if (move[cg] >= 0)
4312         {
4313             a0 = cgindex[cg];
4314             a1 = cgindex[cg+1];
4315             /* Clear the global indices */
4316             for (a = a0; a < a1; a++)
4317             {
4318                 ga2la_del(ga2la, gatindex[a]);
4319             }
4320             if (bLocalCG)
4321             {
4322                 bLocalCG[index_gl[cg]] = FALSE;
4323             }
4324             /* Signal that this cg has moved using the ns cell index.
4325              * Here we set it to -1. fill_grid will change it
4326              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4327              */
4328             cell_index[cg] = -1;
4329         }
4330     }
4331 }
4332
4333 static void print_cg_move(FILE *fplog,
4334                           gmx_domdec_t *dd,
4335                           gmx_int64_t step, int cg, int dim, int dir,
4336                           gmx_bool bHaveCgcmOld, real limitd,
4337                           rvec cm_old, rvec cm_new, real pos_d)
4338 {
4339     gmx_domdec_comm_t *comm;
4340     char               buf[22];
4341
4342     comm = dd->comm;
4343
4344     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4345     if (limitd > 0)
4346     {
4347         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4348                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4349                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4350     }
4351     else
4352     {
4353         /* We don't have a limiting distance available: don't print it */
4354         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4355                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4356                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4357     }
4358     fprintf(fplog, "distance out of cell %f\n",
4359             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4360     if (bHaveCgcmOld)
4361     {
4362         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4363                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4364     }
4365     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4366             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4367     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4368             dim2char(dim),
4369             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4370     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4371             dim2char(dim),
4372             comm->cell_x0[dim], comm->cell_x1[dim]);
4373 }
4374
4375 static void cg_move_error(FILE *fplog,
4376                           gmx_domdec_t *dd,
4377                           gmx_int64_t step, int cg, int dim, int dir,
4378                           gmx_bool bHaveCgcmOld, real limitd,
4379                           rvec cm_old, rvec cm_new, real pos_d)
4380 {
4381     if (fplog)
4382     {
4383         print_cg_move(fplog, dd, step, cg, dim, dir,
4384                       bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4385     }
4386     print_cg_move(stderr, dd, step, cg, dim, dir,
4387                   bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4388     gmx_fatal(FARGS,
4389               "%s moved too far between two domain decomposition steps\n"
4390               "This usually means that your system is not well equilibrated",
4391               dd->comm->bCGs ? "A charge group" : "An atom");
4392 }
4393
4394 static void rotate_state_atom(t_state *state, int a)
4395 {
4396     int est;
4397
4398     for (est = 0; est < estNR; est++)
4399     {
4400         if (EST_DISTR(est) && (state->flags & (1<<est)))
4401         {
4402             switch (est)
4403             {
4404                 case estX:
4405                     /* Rotate the complete state; for a rectangular box only */
4406                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4407                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4408                     break;
4409                 case estV:
4410                     state->v[a][YY] = -state->v[a][YY];
4411                     state->v[a][ZZ] = -state->v[a][ZZ];
4412                     break;
4413                 case estSDX:
4414                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4415                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4416                     break;
4417                 case estCGP:
4418                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4419                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4420                     break;
4421                 case estDISRE_INITF:
4422                 case estDISRE_RM3TAV:
4423                 case estORIRE_INITF:
4424                 case estORIRE_DTAV:
4425                     /* These are distances, so not affected by rotation */
4426                     break;
4427                 default:
4428                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4429             }
4430         }
4431     }
4432 }
4433
4434 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4435 {
4436     if (natoms > comm->moved_nalloc)
4437     {
4438         /* Contents should be preserved here */
4439         comm->moved_nalloc = over_alloc_dd(natoms);
4440         srenew(comm->moved, comm->moved_nalloc);
4441     }
4442
4443     return comm->moved;
4444 }
4445
4446 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4447                          gmx_domdec_t *dd,
4448                          t_state *state,
4449                          ivec tric_dir, matrix tcm,
4450                          rvec cell_x0, rvec cell_x1,
4451                          rvec limitd, rvec limit0, rvec limit1,
4452                          const int *cgindex,
4453                          int cg_start, int cg_end,
4454                          rvec *cg_cm,
4455                          int *move)
4456 {
4457     int      npbcdim;
4458     int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4459     int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4460     int      flag;
4461     gmx_bool bScrew;
4462     ivec     dev;
4463     real     inv_ncg, pos_d;
4464     rvec     cm_new;
4465
4466     npbcdim = dd->npbcdim;
4467
4468     for (cg = cg_start; cg < cg_end; cg++)
4469     {
4470         k0   = cgindex[cg];
4471         k1   = cgindex[cg+1];
4472         nrcg = k1 - k0;
4473         if (nrcg == 1)
4474         {
4475             copy_rvec(state->x[k0], cm_new);
4476         }
4477         else
4478         {
4479             inv_ncg = 1.0/nrcg;
4480
4481             clear_rvec(cm_new);
4482             for (k = k0; (k < k1); k++)
4483             {
4484                 rvec_inc(cm_new, state->x[k]);
4485             }
4486             for (d = 0; (d < DIM); d++)
4487             {
4488                 cm_new[d] = inv_ncg*cm_new[d];
4489             }
4490         }
4491
4492         clear_ivec(dev);
4493         /* Do pbc and check DD cell boundary crossings */
4494         for (d = DIM-1; d >= 0; d--)
4495         {
4496             if (dd->nc[d] > 1)
4497             {
4498                 bScrew = (dd->bScrewPBC && d == XX);
4499                 /* Determine the location of this cg in lattice coordinates */
4500                 pos_d = cm_new[d];
4501                 if (tric_dir[d])
4502                 {
4503                     for (d2 = d+1; d2 < DIM; d2++)
4504                     {
4505                         pos_d += cm_new[d2]*tcm[d2][d];
4506                     }
4507                 }
4508                 /* Put the charge group in the triclinic unit-cell */
4509                 if (pos_d >= cell_x1[d])
4510                 {
4511                     if (pos_d >= limit1[d])
4512                     {
4513                         cg_move_error(fplog, dd, step, cg, d, 1,
4514                                       cg_cm != state->x, limitd[d],
4515                                       cg_cm[cg], cm_new, pos_d);
4516                     }
4517                     dev[d] = 1;
4518                     if (dd->ci[d] == dd->nc[d] - 1)
4519                     {
4520                         rvec_dec(cm_new, state->box[d]);
4521                         if (bScrew)
4522                         {
4523                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4524                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4525                         }
4526                         for (k = k0; (k < k1); k++)
4527                         {
4528                             rvec_dec(state->x[k], state->box[d]);
4529                             if (bScrew)
4530                             {
4531                                 rotate_state_atom(state, k);
4532                             }
4533                         }
4534                     }
4535                 }
4536                 else if (pos_d < cell_x0[d])
4537                 {
4538                     if (pos_d < limit0[d])
4539                     {
4540                         cg_move_error(fplog, dd, step, cg, d, -1,
4541                                       cg_cm != state->x, limitd[d],
4542                                       cg_cm[cg], cm_new, pos_d);
4543                     }
4544                     dev[d] = -1;
4545                     if (dd->ci[d] == 0)
4546                     {
4547                         rvec_inc(cm_new, state->box[d]);
4548                         if (bScrew)
4549                         {
4550                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4551                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4552                         }
4553                         for (k = k0; (k < k1); k++)
4554                         {
4555                             rvec_inc(state->x[k], state->box[d]);
4556                             if (bScrew)
4557                             {
4558                                 rotate_state_atom(state, k);
4559                             }
4560                         }
4561                     }
4562                 }
4563             }
4564             else if (d < npbcdim)
4565             {
4566                 /* Put the charge group in the rectangular unit-cell */
4567                 while (cm_new[d] >= state->box[d][d])
4568                 {
4569                     rvec_dec(cm_new, state->box[d]);
4570                     for (k = k0; (k < k1); k++)
4571                     {
4572                         rvec_dec(state->x[k], state->box[d]);
4573                     }
4574                 }
4575                 while (cm_new[d] < 0)
4576                 {
4577                     rvec_inc(cm_new, state->box[d]);
4578                     for (k = k0; (k < k1); k++)
4579                     {
4580                         rvec_inc(state->x[k], state->box[d]);
4581                     }
4582                 }
4583             }
4584         }
4585
4586         copy_rvec(cm_new, cg_cm[cg]);
4587
4588         /* Determine where this cg should go */
4589         flag = 0;
4590         mc   = -1;
4591         for (d = 0; d < dd->ndim; d++)
4592         {
4593             dim = dd->dim[d];
4594             if (dev[dim] == 1)
4595             {
4596                 flag |= DD_FLAG_FW(d);
4597                 if (mc == -1)
4598                 {
4599                     mc = d*2;
4600                 }
4601             }
4602             else if (dev[dim] == -1)
4603             {
4604                 flag |= DD_FLAG_BW(d);
4605                 if (mc == -1)
4606                 {
4607                     if (dd->nc[dim] > 2)
4608                     {
4609                         mc = d*2 + 1;
4610                     }
4611                     else
4612                     {
4613                         mc = d*2;
4614                     }
4615                 }
4616             }
4617         }
4618         /* Temporarily store the flag in move */
4619         move[cg] = mc + flag;
4620     }
4621 }
4622
4623 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4624                                gmx_domdec_t *dd, ivec tric_dir,
4625                                t_state *state, rvec **f,
4626                                t_forcerec *fr,
4627                                gmx_bool bCompact,
4628                                t_nrnb *nrnb,
4629                                int *ncg_stay_home,
4630                                int *ncg_moved)
4631 {
4632     int               *move;
4633     int                npbcdim;
4634     int                ncg[DIM*2], nat[DIM*2];
4635     int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4636     int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4637     int                sbuf[2], rbuf[2];
4638     int                home_pos_cg, home_pos_at, buf_pos;
4639     int                flag;
4640     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4641     gmx_bool           bScrew;
4642     ivec               dev;
4643     real               inv_ncg, pos_d;
4644     matrix             tcm;
4645     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
4646     atom_id           *cgindex;
4647     cginfo_mb_t       *cginfo_mb;
4648     gmx_domdec_comm_t *comm;
4649     int               *moved;
4650     int                nthread, thread;
4651
4652     if (dd->bScrewPBC)
4653     {
4654         check_screw_box(state->box);
4655     }
4656
4657     comm  = dd->comm;
4658     if (fr->cutoff_scheme == ecutsGROUP)
4659     {
4660         cg_cm = fr->cg_cm;
4661     }
4662
4663     for (i = 0; i < estNR; i++)
4664     {
4665         if (EST_DISTR(i))
4666         {
4667             switch (i)
4668             {
4669                 case estX: /* Always present */ break;
4670                 case estV:   bV   = (state->flags & (1<<i)); break;
4671                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4672                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4673                 case estLD_RNG:
4674                 case estLD_RNGI:
4675                 case estDISRE_INITF:
4676                 case estDISRE_RM3TAV:
4677                 case estORIRE_INITF:
4678                 case estORIRE_DTAV:
4679                     /* No processing required */
4680                     break;
4681                 default:
4682                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4683             }
4684         }
4685     }
4686
4687     if (dd->ncg_tot > comm->nalloc_int)
4688     {
4689         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4690         srenew(comm->buf_int, comm->nalloc_int);
4691     }
4692     move = comm->buf_int;
4693
4694     /* Clear the count */
4695     for (c = 0; c < dd->ndim*2; c++)
4696     {
4697         ncg[c] = 0;
4698         nat[c] = 0;
4699     }
4700
4701     npbcdim = dd->npbcdim;
4702
4703     for (d = 0; (d < DIM); d++)
4704     {
4705         limitd[d] = dd->comm->cellsize_min[d];
4706         if (d >= npbcdim && dd->ci[d] == 0)
4707         {
4708             cell_x0[d] = -GMX_FLOAT_MAX;
4709         }
4710         else
4711         {
4712             cell_x0[d] = comm->cell_x0[d];
4713         }
4714         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4715         {
4716             cell_x1[d] = GMX_FLOAT_MAX;
4717         }
4718         else
4719         {
4720             cell_x1[d] = comm->cell_x1[d];
4721         }
4722         if (d < npbcdim)
4723         {
4724             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4725             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4726         }
4727         else
4728         {
4729             /* We check after communication if a charge group moved
4730              * more than one cell. Set the pre-comm check limit to float_max.
4731              */
4732             limit0[d] = -GMX_FLOAT_MAX;
4733             limit1[d] =  GMX_FLOAT_MAX;
4734         }
4735     }
4736
4737     make_tric_corr_matrix(npbcdim, state->box, tcm);
4738
4739     cgindex = dd->cgindex;
4740
4741     nthread = gmx_omp_nthreads_get(emntDomdec);
4742
4743     /* Compute the center of geometry for all home charge groups
4744      * and put them in the box and determine where they should go.
4745      */
4746 #pragma omp parallel for num_threads(nthread) schedule(static)
4747     for (thread = 0; thread < nthread; thread++)
4748     {
4749         calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4750                      cell_x0, cell_x1, limitd, limit0, limit1,
4751                      cgindex,
4752                      ( thread   *dd->ncg_home)/nthread,
4753                      ((thread+1)*dd->ncg_home)/nthread,
4754                      fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4755                      move);
4756     }
4757
4758     for (cg = 0; cg < dd->ncg_home; cg++)
4759     {
4760         if (move[cg] >= 0)
4761         {
4762             mc       = move[cg];
4763             flag     = mc & ~DD_FLAG_NRCG;
4764             mc       = mc & DD_FLAG_NRCG;
4765             move[cg] = mc;
4766
4767             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4768             {
4769                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4770                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4771             }
4772             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4773             /* We store the cg size in the lower 16 bits
4774              * and the place where the charge group should go
4775              * in the next 6 bits. This saves some communication volume.
4776              */
4777             nrcg = cgindex[cg+1] - cgindex[cg];
4778             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4779             ncg[mc] += 1;
4780             nat[mc] += nrcg;
4781         }
4782     }
4783
4784     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4785     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4786
4787     *ncg_moved = 0;
4788     for (i = 0; i < dd->ndim*2; i++)
4789     {
4790         *ncg_moved += ncg[i];
4791     }
4792
4793     nvec = 1;
4794     if (bV)
4795     {
4796         nvec++;
4797     }
4798     if (bSDX)
4799     {
4800         nvec++;
4801     }
4802     if (bCGP)
4803     {
4804         nvec++;
4805     }
4806
4807     /* Make sure the communication buffers are large enough */
4808     for (mc = 0; mc < dd->ndim*2; mc++)
4809     {
4810         nvr = ncg[mc] + nat[mc]*nvec;
4811         if (nvr > comm->cgcm_state_nalloc[mc])
4812         {
4813             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4814             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4815         }
4816     }
4817
4818     switch (fr->cutoff_scheme)
4819     {
4820         case ecutsGROUP:
4821             /* Recalculating cg_cm might be cheaper than communicating,
4822              * but that could give rise to rounding issues.
4823              */
4824             home_pos_cg =
4825                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4826                                         nvec, cg_cm, comm, bCompact);
4827             break;
4828         case ecutsVERLET:
4829             /* Without charge groups we send the moved atom coordinates
4830              * over twice. This is so the code below can be used without
4831              * many conditionals for both for with and without charge groups.
4832              */
4833             home_pos_cg =
4834                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4835                                         nvec, state->x, comm, FALSE);
4836             if (bCompact)
4837             {
4838                 home_pos_cg -= *ncg_moved;
4839             }
4840             break;
4841         default:
4842             gmx_incons("unimplemented");
4843             home_pos_cg = 0;
4844     }
4845
4846     vec         = 0;
4847     home_pos_at =
4848         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4849                                 nvec, vec++, state->x, comm, bCompact);
4850     if (bV)
4851     {
4852         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4853                                 nvec, vec++, state->v, comm, bCompact);
4854     }
4855     if (bSDX)
4856     {
4857         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4858                                 nvec, vec++, state->sd_X, comm, bCompact);
4859     }
4860     if (bCGP)
4861     {
4862         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4863                                 nvec, vec++, state->cg_p, comm, bCompact);
4864     }
4865
4866     if (bCompact)
4867     {
4868         compact_ind(dd->ncg_home, move,
4869                     dd->index_gl, dd->cgindex, dd->gatindex,
4870                     dd->ga2la, comm->bLocalCG,
4871                     fr->cginfo);
4872     }
4873     else
4874     {
4875         if (fr->cutoff_scheme == ecutsVERLET)
4876         {
4877             moved = get_moved(comm, dd->ncg_home);
4878
4879             for (k = 0; k < dd->ncg_home; k++)
4880             {
4881                 moved[k] = 0;
4882             }
4883         }
4884         else
4885         {
4886             moved = fr->ns.grid->cell_index;
4887         }
4888
4889         clear_and_mark_ind(dd->ncg_home, move,
4890                            dd->index_gl, dd->cgindex, dd->gatindex,
4891                            dd->ga2la, comm->bLocalCG,
4892                            moved);
4893     }
4894
4895     cginfo_mb = fr->cginfo_mb;
4896
4897     *ncg_stay_home = home_pos_cg;
4898     for (d = 0; d < dd->ndim; d++)
4899     {
4900         dim      = dd->dim[d];
4901         ncg_recv = 0;
4902         nat_recv = 0;
4903         nvr      = 0;
4904         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4905         {
4906             cdd = d*2 + dir;
4907             /* Communicate the cg and atom counts */
4908             sbuf[0] = ncg[cdd];
4909             sbuf[1] = nat[cdd];
4910             if (debug)
4911             {
4912                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4913                         d, dir, sbuf[0], sbuf[1]);
4914             }
4915             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4916
4917             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4918             {
4919                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4920                 srenew(comm->buf_int, comm->nalloc_int);
4921             }
4922
4923             /* Communicate the charge group indices, sizes and flags */
4924             dd_sendrecv_int(dd, d, dir,
4925                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4926                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4927
4928             nvs = ncg[cdd] + nat[cdd]*nvec;
4929             i   = rbuf[0]  + rbuf[1] *nvec;
4930             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4931
4932             /* Communicate cgcm and state */
4933             dd_sendrecv_rvec(dd, d, dir,
4934                              comm->cgcm_state[cdd], nvs,
4935                              comm->vbuf.v+nvr, i);
4936             ncg_recv += rbuf[0];
4937             nat_recv += rbuf[1];
4938             nvr      += i;
4939         }
4940
4941         /* Process the received charge groups */
4942         buf_pos = 0;
4943         for (cg = 0; cg < ncg_recv; cg++)
4944         {
4945             flag = comm->buf_int[cg*DD_CGIBS+1];
4946
4947             if (dim >= npbcdim && dd->nc[dim] > 2)
4948             {
4949                 /* No pbc in this dim and more than one domain boundary.
4950                  * We do a separate check if a charge group didn't move too far.
4951                  */
4952                 if (((flag & DD_FLAG_FW(d)) &&
4953                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4954                     ((flag & DD_FLAG_BW(d)) &&
4955                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4956                 {
4957                     cg_move_error(fplog, dd, step, cg, dim,
4958                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4959                                   fr->cutoff_scheme == ecutsGROUP, 0,
4960                                   comm->vbuf.v[buf_pos],
4961                                   comm->vbuf.v[buf_pos],
4962                                   comm->vbuf.v[buf_pos][dim]);
4963                 }
4964             }
4965
4966             mc = -1;
4967             if (d < dd->ndim-1)
4968             {
4969                 /* Check which direction this cg should go */
4970                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4971                 {
4972                     if (dd->bGridJump)
4973                     {
4974                         /* The cell boundaries for dimension d2 are not equal
4975                          * for each cell row of the lower dimension(s),
4976                          * therefore we might need to redetermine where
4977                          * this cg should go.
4978                          */
4979                         dim2 = dd->dim[d2];
4980                         /* If this cg crosses the box boundary in dimension d2
4981                          * we can use the communicated flag, so we do not
4982                          * have to worry about pbc.
4983                          */
4984                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4985                                (flag & DD_FLAG_FW(d2))) ||
4986                               (dd->ci[dim2] == 0 &&
4987                                (flag & DD_FLAG_BW(d2)))))
4988                         {
4989                             /* Clear the two flags for this dimension */
4990                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4991                             /* Determine the location of this cg
4992                              * in lattice coordinates
4993                              */
4994                             pos_d = comm->vbuf.v[buf_pos][dim2];
4995                             if (tric_dir[dim2])
4996                             {
4997                                 for (d3 = dim2+1; d3 < DIM; d3++)
4998                                 {
4999                                     pos_d +=
5000                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
5001                                 }
5002                             }
5003                             /* Check of we are not at the box edge.
5004                              * pbc is only handled in the first step above,
5005                              * but this check could move over pbc while
5006                              * the first step did not due to different rounding.
5007                              */
5008                             if (pos_d >= cell_x1[dim2] &&
5009                                 dd->ci[dim2] != dd->nc[dim2]-1)
5010                             {
5011                                 flag |= DD_FLAG_FW(d2);
5012                             }
5013                             else if (pos_d < cell_x0[dim2] &&
5014                                      dd->ci[dim2] != 0)
5015                             {
5016                                 flag |= DD_FLAG_BW(d2);
5017                             }
5018                             comm->buf_int[cg*DD_CGIBS+1] = flag;
5019                         }
5020                     }
5021                     /* Set to which neighboring cell this cg should go */
5022                     if (flag & DD_FLAG_FW(d2))
5023                     {
5024                         mc = d2*2;
5025                     }
5026                     else if (flag & DD_FLAG_BW(d2))
5027                     {
5028                         if (dd->nc[dd->dim[d2]] > 2)
5029                         {
5030                             mc = d2*2+1;
5031                         }
5032                         else
5033                         {
5034                             mc = d2*2;
5035                         }
5036                     }
5037                 }
5038             }
5039
5040             nrcg = flag & DD_FLAG_NRCG;
5041             if (mc == -1)
5042             {
5043                 if (home_pos_cg+1 > dd->cg_nalloc)
5044                 {
5045                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
5046                     srenew(dd->index_gl, dd->cg_nalloc);
5047                     srenew(dd->cgindex, dd->cg_nalloc+1);
5048                 }
5049                 /* Set the global charge group index and size */
5050                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
5051                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
5052                 /* Copy the state from the buffer */
5053                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
5054                 if (fr->cutoff_scheme == ecutsGROUP)
5055                 {
5056                     cg_cm = fr->cg_cm;
5057                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
5058                 }
5059                 buf_pos++;
5060
5061                 /* Set the cginfo */
5062                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
5063                                                    dd->index_gl[home_pos_cg]);
5064                 if (comm->bLocalCG)
5065                 {
5066                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
5067                 }
5068
5069                 if (home_pos_at+nrcg > state->nalloc)
5070                 {
5071                     dd_realloc_state(state, f, home_pos_at+nrcg);
5072                 }
5073                 for (i = 0; i < nrcg; i++)
5074                 {
5075                     copy_rvec(comm->vbuf.v[buf_pos++],
5076                               state->x[home_pos_at+i]);
5077                 }
5078                 if (bV)
5079                 {
5080                     for (i = 0; i < nrcg; i++)
5081                     {
5082                         copy_rvec(comm->vbuf.v[buf_pos++],
5083                                   state->v[home_pos_at+i]);
5084                     }
5085                 }
5086                 if (bSDX)
5087                 {
5088                     for (i = 0; i < nrcg; i++)
5089                     {
5090                         copy_rvec(comm->vbuf.v[buf_pos++],
5091                                   state->sd_X[home_pos_at+i]);
5092                     }
5093                 }
5094                 if (bCGP)
5095                 {
5096                     for (i = 0; i < nrcg; i++)
5097                     {
5098                         copy_rvec(comm->vbuf.v[buf_pos++],
5099                                   state->cg_p[home_pos_at+i]);
5100                     }
5101                 }
5102                 home_pos_cg += 1;
5103                 home_pos_at += nrcg;
5104             }
5105             else
5106             {
5107                 /* Reallocate the buffers if necessary  */
5108                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
5109                 {
5110                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
5111                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
5112                 }
5113                 nvr = ncg[mc] + nat[mc]*nvec;
5114                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5115                 {
5116                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5117                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
5118                 }
5119                 /* Copy from the receive to the send buffers */
5120                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5121                        comm->buf_int + cg*DD_CGIBS,
5122                        DD_CGIBS*sizeof(int));
5123                 memcpy(comm->cgcm_state[mc][nvr],
5124                        comm->vbuf.v[buf_pos],
5125                        (1+nrcg*nvec)*sizeof(rvec));
5126                 buf_pos += 1 + nrcg*nvec;
5127                 ncg[mc] += 1;
5128                 nat[mc] += nrcg;
5129             }
5130         }
5131     }
5132
5133     /* With sorting (!bCompact) the indices are now only partially up to date
5134      * and ncg_home and nat_home are not the real count, since there are
5135      * "holes" in the arrays for the charge groups that moved to neighbors.
5136      */
5137     if (fr->cutoff_scheme == ecutsVERLET)
5138     {
5139         moved = get_moved(comm, home_pos_cg);
5140
5141         for (i = dd->ncg_home; i < home_pos_cg; i++)
5142         {
5143             moved[i] = 0;
5144         }
5145     }
5146     dd->ncg_home = home_pos_cg;
5147     dd->nat_home = home_pos_at;
5148
5149     if (debug)
5150     {
5151         fprintf(debug,
5152                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5153                 *ncg_moved, dd->ncg_home-*ncg_moved);
5154
5155     }
5156 }
5157
5158 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5159 {
5160     dd->comm->cycl[ddCycl] += cycles;
5161     dd->comm->cycl_n[ddCycl]++;
5162     if (cycles > dd->comm->cycl_max[ddCycl])
5163     {
5164         dd->comm->cycl_max[ddCycl] = cycles;
5165     }
5166 }
5167
5168 static double force_flop_count(t_nrnb *nrnb)
5169 {
5170     int         i;
5171     double      sum;
5172     const char *name;
5173
5174     sum = 0;
5175     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5176     {
5177         /* To get closer to the real timings, we half the count
5178          * for the normal loops and again half it for water loops.
5179          */
5180         name = nrnb_str(i);
5181         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5182         {
5183             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5184         }
5185         else
5186         {
5187             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5188         }
5189     }
5190     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5191     {
5192         name = nrnb_str(i);
5193         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5194         {
5195             sum += nrnb->n[i]*cost_nrnb(i);
5196         }
5197     }
5198     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5199     {
5200         sum += nrnb->n[i]*cost_nrnb(i);
5201     }
5202
5203     return sum;
5204 }
5205
5206 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5207 {
5208     if (dd->comm->eFlop)
5209     {
5210         dd->comm->flop -= force_flop_count(nrnb);
5211     }
5212 }
5213 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5214 {
5215     if (dd->comm->eFlop)
5216     {
5217         dd->comm->flop += force_flop_count(nrnb);
5218         dd->comm->flop_n++;
5219     }
5220 }
5221
5222 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5223 {
5224     int i;
5225
5226     for (i = 0; i < ddCyclNr; i++)
5227     {
5228         dd->comm->cycl[i]     = 0;
5229         dd->comm->cycl_n[i]   = 0;
5230         dd->comm->cycl_max[i] = 0;
5231     }
5232     dd->comm->flop   = 0;
5233     dd->comm->flop_n = 0;
5234 }
5235
5236 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
5237 {
5238     gmx_domdec_comm_t *comm;
5239     gmx_domdec_load_t *load;
5240     gmx_domdec_root_t *root = NULL;
5241     int                d, dim, cid, i, pos;
5242     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
5243     gmx_bool           bSepPME;
5244
5245     if (debug)
5246     {
5247         fprintf(debug, "get_load_distribution start\n");
5248     }
5249
5250     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5251
5252     comm = dd->comm;
5253
5254     bSepPME = (dd->pme_nodeid >= 0);
5255
5256     for (d = dd->ndim-1; d >= 0; d--)
5257     {
5258         dim = dd->dim[d];
5259         /* Check if we participate in the communication in this dimension */
5260         if (d == dd->ndim-1 ||
5261             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5262         {
5263             load = &comm->load[d];
5264             if (dd->bGridJump)
5265             {
5266                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5267             }
5268             pos = 0;
5269             if (d == dd->ndim-1)
5270             {
5271                 sbuf[pos++] = dd_force_load(comm);
5272                 sbuf[pos++] = sbuf[0];
5273                 if (dd->bGridJump)
5274                 {
5275                     sbuf[pos++] = sbuf[0];
5276                     sbuf[pos++] = cell_frac;
5277                     if (d > 0)
5278                     {
5279                         sbuf[pos++] = comm->cell_f_max0[d];
5280                         sbuf[pos++] = comm->cell_f_min1[d];
5281                     }
5282                 }
5283                 if (bSepPME)
5284                 {
5285                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5286                     sbuf[pos++] = comm->cycl[ddCyclPME];
5287                 }
5288             }
5289             else
5290             {
5291                 sbuf[pos++] = comm->load[d+1].sum;
5292                 sbuf[pos++] = comm->load[d+1].max;
5293                 if (dd->bGridJump)
5294                 {
5295                     sbuf[pos++] = comm->load[d+1].sum_m;
5296                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5297                     sbuf[pos++] = comm->load[d+1].flags;
5298                     if (d > 0)
5299                     {
5300                         sbuf[pos++] = comm->cell_f_max0[d];
5301                         sbuf[pos++] = comm->cell_f_min1[d];
5302                     }
5303                 }
5304                 if (bSepPME)
5305                 {
5306                     sbuf[pos++] = comm->load[d+1].mdf;
5307                     sbuf[pos++] = comm->load[d+1].pme;
5308                 }
5309             }
5310             load->nload = pos;
5311             /* Communicate a row in DD direction d.
5312              * The communicators are setup such that the root always has rank 0.
5313              */
5314 #ifdef GMX_MPI
5315             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5316                        load->load, load->nload*sizeof(float), MPI_BYTE,
5317                        0, comm->mpi_comm_load[d]);
5318 #endif
5319             if (dd->ci[dim] == dd->master_ci[dim])
5320             {
5321                 /* We are the root, process this row */
5322                 if (comm->bDynLoadBal)
5323                 {
5324                     root = comm->root[d];
5325                 }
5326                 load->sum      = 0;
5327                 load->max      = 0;
5328                 load->sum_m    = 0;
5329                 load->cvol_min = 1;
5330                 load->flags    = 0;
5331                 load->mdf      = 0;
5332                 load->pme      = 0;
5333                 pos            = 0;
5334                 for (i = 0; i < dd->nc[dim]; i++)
5335                 {
5336                     load->sum += load->load[pos++];
5337                     load->max  = max(load->max, load->load[pos]);
5338                     pos++;
5339                     if (dd->bGridJump)
5340                     {
5341                         if (root->bLimited)
5342                         {
5343                             /* This direction could not be load balanced properly,
5344                              * therefore we need to use the maximum iso the average load.
5345                              */
5346                             load->sum_m = max(load->sum_m, load->load[pos]);
5347                         }
5348                         else
5349                         {
5350                             load->sum_m += load->load[pos];
5351                         }
5352                         pos++;
5353                         load->cvol_min = min(load->cvol_min, load->load[pos]);
5354                         pos++;
5355                         if (d < dd->ndim-1)
5356                         {
5357                             load->flags = (int)(load->load[pos++] + 0.5);
5358                         }
5359                         if (d > 0)
5360                         {
5361                             root->cell_f_max0[i] = load->load[pos++];
5362                             root->cell_f_min1[i] = load->load[pos++];
5363                         }
5364                     }
5365                     if (bSepPME)
5366                     {
5367                         load->mdf = max(load->mdf, load->load[pos]);
5368                         pos++;
5369                         load->pme = max(load->pme, load->load[pos]);
5370                         pos++;
5371                     }
5372                 }
5373                 if (comm->bDynLoadBal && root->bLimited)
5374                 {
5375                     load->sum_m *= dd->nc[dim];
5376                     load->flags |= (1<<d);
5377                 }
5378             }
5379         }
5380     }
5381
5382     if (DDMASTER(dd))
5383     {
5384         comm->nload      += dd_load_count(comm);
5385         comm->load_step  += comm->cycl[ddCyclStep];
5386         comm->load_sum   += comm->load[0].sum;
5387         comm->load_max   += comm->load[0].max;
5388         if (comm->bDynLoadBal)
5389         {
5390             for (d = 0; d < dd->ndim; d++)
5391             {
5392                 if (comm->load[0].flags & (1<<d))
5393                 {
5394                     comm->load_lim[d]++;
5395                 }
5396             }
5397         }
5398         if (bSepPME)
5399         {
5400             comm->load_mdf += comm->load[0].mdf;
5401             comm->load_pme += comm->load[0].pme;
5402         }
5403     }
5404
5405     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5406
5407     if (debug)
5408     {
5409         fprintf(debug, "get_load_distribution finished\n");
5410     }
5411 }
5412
5413 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5414 {
5415     /* Return the relative performance loss on the total run time
5416      * due to the force calculation load imbalance.
5417      */
5418     if (dd->comm->nload > 0)
5419     {
5420         return
5421             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5422             (dd->comm->load_step*dd->nnodes);
5423     }
5424     else
5425     {
5426         return 0;
5427     }
5428 }
5429
5430 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5431 {
5432     char               buf[STRLEN];
5433     int                npp, npme, nnodes, d, limp;
5434     float              imbal, pme_f_ratio, lossf, lossp = 0;
5435     gmx_bool           bLim;
5436     gmx_domdec_comm_t *comm;
5437
5438     comm = dd->comm;
5439     if (DDMASTER(dd) && comm->nload > 0)
5440     {
5441         npp    = dd->nnodes;
5442         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5443         nnodes = npp + npme;
5444         imbal  = comm->load_max*npp/comm->load_sum - 1;
5445         lossf  = dd_force_imb_perf_loss(dd);
5446         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5447         fprintf(fplog, "%s", buf);
5448         fprintf(stderr, "\n");
5449         fprintf(stderr, "%s", buf);
5450         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5451         fprintf(fplog, "%s", buf);
5452         fprintf(stderr, "%s", buf);
5453         bLim = FALSE;
5454         if (comm->bDynLoadBal)
5455         {
5456             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5457             for (d = 0; d < dd->ndim; d++)
5458             {
5459                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5460                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5461                 if (limp >= 50)
5462                 {
5463                     bLim = TRUE;
5464                 }
5465             }
5466             sprintf(buf+strlen(buf), "\n");
5467             fprintf(fplog, "%s", buf);
5468             fprintf(stderr, "%s", buf);
5469         }
5470         if (npme > 0)
5471         {
5472             pme_f_ratio = comm->load_pme/comm->load_mdf;
5473             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5474             if (lossp <= 0)
5475             {
5476                 lossp *= (float)npme/(float)nnodes;
5477             }
5478             else
5479             {
5480                 lossp *= (float)npp/(float)nnodes;
5481             }
5482             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5483             fprintf(fplog, "%s", buf);
5484             fprintf(stderr, "%s", buf);
5485             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5486             fprintf(fplog, "%s", buf);
5487             fprintf(stderr, "%s", buf);
5488         }
5489         fprintf(fplog, "\n");
5490         fprintf(stderr, "\n");
5491
5492         if (lossf >= DD_PERF_LOSS_WARN)
5493         {
5494             sprintf(buf,
5495                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5496                     "      in the domain decomposition.\n", lossf*100);
5497             if (!comm->bDynLoadBal)
5498             {
5499                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5500             }
5501             else if (bLim)
5502             {
5503                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5504             }
5505             fprintf(fplog, "%s\n", buf);
5506             fprintf(stderr, "%s\n", buf);
5507         }
5508         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
5509         {
5510             sprintf(buf,
5511                     "NOTE: %.1f %% performance was lost because the PME ranks\n"
5512                     "      had %s work to do than the PP ranks.\n"
5513                     "      You might want to %s the number of PME ranks\n"
5514                     "      or %s the cut-off and the grid spacing.\n",
5515                     fabs(lossp*100),
5516                     (lossp < 0) ? "less"     : "more",
5517                     (lossp < 0) ? "decrease" : "increase",
5518                     (lossp < 0) ? "decrease" : "increase");
5519             fprintf(fplog, "%s\n", buf);
5520             fprintf(stderr, "%s\n", buf);
5521         }
5522     }
5523 }
5524
5525 static float dd_vol_min(gmx_domdec_t *dd)
5526 {
5527     return dd->comm->load[0].cvol_min*dd->nnodes;
5528 }
5529
5530 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5531 {
5532     return dd->comm->load[0].flags;
5533 }
5534
5535 static float dd_f_imbal(gmx_domdec_t *dd)
5536 {
5537     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5538 }
5539
5540 float dd_pme_f_ratio(gmx_domdec_t *dd)
5541 {
5542     if (dd->comm->cycl_n[ddCyclPME] > 0)
5543     {
5544         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5545     }
5546     else
5547     {
5548         return -1.0;
5549     }
5550 }
5551
5552 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5553 {
5554     int  flags, d;
5555     char buf[22];
5556
5557     flags = dd_load_flags(dd);
5558     if (flags)
5559     {
5560         fprintf(fplog,
5561                 "DD  load balancing is limited by minimum cell size in dimension");
5562         for (d = 0; d < dd->ndim; d++)
5563         {
5564             if (flags & (1<<d))
5565             {
5566                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5567             }
5568         }
5569         fprintf(fplog, "\n");
5570     }
5571     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5572     if (dd->comm->bDynLoadBal)
5573     {
5574         fprintf(fplog, "  vol min/aver %5.3f%c",
5575                 dd_vol_min(dd), flags ? '!' : ' ');
5576     }
5577     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5578     if (dd->comm->cycl_n[ddCyclPME])
5579     {
5580         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5581     }
5582     fprintf(fplog, "\n\n");
5583 }
5584
5585 static void dd_print_load_verbose(gmx_domdec_t *dd)
5586 {
5587     if (dd->comm->bDynLoadBal)
5588     {
5589         fprintf(stderr, "vol %4.2f%c ",
5590                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5591     }
5592     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5593     if (dd->comm->cycl_n[ddCyclPME])
5594     {
5595         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5596     }
5597 }
5598
5599 #ifdef GMX_MPI
5600 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5601 {
5602     MPI_Comm           c_row;
5603     int                dim, i, rank;
5604     ivec               loc_c;
5605     gmx_domdec_root_t *root;
5606     gmx_bool           bPartOfGroup = FALSE;
5607
5608     dim = dd->dim[dim_ind];
5609     copy_ivec(loc, loc_c);
5610     for (i = 0; i < dd->nc[dim]; i++)
5611     {
5612         loc_c[dim] = i;
5613         rank       = dd_index(dd->nc, loc_c);
5614         if (rank == dd->rank)
5615         {
5616             /* This process is part of the group */
5617             bPartOfGroup = TRUE;
5618         }
5619     }
5620     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5621                    &c_row);
5622     if (bPartOfGroup)
5623     {
5624         dd->comm->mpi_comm_load[dim_ind] = c_row;
5625         if (dd->comm->eDLB != edlbNO)
5626         {
5627             if (dd->ci[dim] == dd->master_ci[dim])
5628             {
5629                 /* This is the root process of this row */
5630                 snew(dd->comm->root[dim_ind], 1);
5631                 root = dd->comm->root[dim_ind];
5632                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5633                 snew(root->old_cell_f, dd->nc[dim]+1);
5634                 snew(root->bCellMin, dd->nc[dim]);
5635                 if (dim_ind > 0)
5636                 {
5637                     snew(root->cell_f_max0, dd->nc[dim]);
5638                     snew(root->cell_f_min1, dd->nc[dim]);
5639                     snew(root->bound_min, dd->nc[dim]);
5640                     snew(root->bound_max, dd->nc[dim]);
5641                 }
5642                 snew(root->buf_ncd, dd->nc[dim]);
5643             }
5644             else
5645             {
5646                 /* This is not a root process, we only need to receive cell_f */
5647                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5648             }
5649         }
5650         if (dd->ci[dim] == dd->master_ci[dim])
5651         {
5652             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5653         }
5654     }
5655 }
5656 #endif
5657
5658 void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
5659                                    const gmx_hw_info_t gmx_unused *hwinfo,
5660                                    const gmx_hw_opt_t  gmx_unused *hw_opt)
5661 {
5662 #ifdef GMX_MPI
5663     int           physicalnode_id_hash;
5664     int           gpu_id;
5665     gmx_domdec_t *dd;
5666     MPI_Comm      mpi_comm_pp_physicalnode;
5667
5668     if (!(cr->duty & DUTY_PP) ||
5669         hw_opt->gpu_opt.ncuda_dev_use == 0)
5670     {
5671         /* Only PP nodes (currently) use GPUs.
5672          * If we don't have GPUs, there are no resources to share.
5673          */
5674         return;
5675     }
5676
5677     physicalnode_id_hash = gmx_physicalnode_id_hash();
5678
5679     gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5680
5681     dd = cr->dd;
5682
5683     if (debug)
5684     {
5685         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5686         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5687                 dd->rank, physicalnode_id_hash, gpu_id);
5688     }
5689     /* Split the PP communicator over the physical nodes */
5690     /* TODO: See if we should store this (before), as it's also used for
5691      * for the nodecomm summution.
5692      */
5693     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5694                    &mpi_comm_pp_physicalnode);
5695     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5696                    &dd->comm->mpi_comm_gpu_shared);
5697     MPI_Comm_free(&mpi_comm_pp_physicalnode);
5698     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5699
5700     if (debug)
5701     {
5702         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5703     }
5704
5705     /* Note that some ranks could share a GPU, while others don't */
5706
5707     if (dd->comm->nrank_gpu_shared == 1)
5708     {
5709         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5710     }
5711 #endif
5712 }
5713
5714 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5715 {
5716 #ifdef GMX_MPI
5717     int  dim0, dim1, i, j;
5718     ivec loc;
5719
5720     if (debug)
5721     {
5722         fprintf(debug, "Making load communicators\n");
5723     }
5724
5725     snew(dd->comm->load, dd->ndim);
5726     snew(dd->comm->mpi_comm_load, dd->ndim);
5727
5728     clear_ivec(loc);
5729     make_load_communicator(dd, 0, loc);
5730     if (dd->ndim > 1)
5731     {
5732         dim0 = dd->dim[0];
5733         for (i = 0; i < dd->nc[dim0]; i++)
5734         {
5735             loc[dim0] = i;
5736             make_load_communicator(dd, 1, loc);
5737         }
5738     }
5739     if (dd->ndim > 2)
5740     {
5741         dim0 = dd->dim[0];
5742         for (i = 0; i < dd->nc[dim0]; i++)
5743         {
5744             loc[dim0] = i;
5745             dim1      = dd->dim[1];
5746             for (j = 0; j < dd->nc[dim1]; j++)
5747             {
5748                 loc[dim1] = j;
5749                 make_load_communicator(dd, 2, loc);
5750             }
5751         }
5752     }
5753
5754     if (debug)
5755     {
5756         fprintf(debug, "Finished making load communicators\n");
5757     }
5758 #endif
5759 }
5760
5761 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5762 {
5763     gmx_bool                bZYX;
5764     int                     d, dim, i, j, m;
5765     ivec                    tmp, s;
5766     int                     nzone, nzonep;
5767     ivec                    dd_zp[DD_MAXIZONE];
5768     gmx_domdec_zones_t     *zones;
5769     gmx_domdec_ns_ranges_t *izone;
5770
5771     for (d = 0; d < dd->ndim; d++)
5772     {
5773         dim = dd->dim[d];
5774         copy_ivec(dd->ci, tmp);
5775         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5776         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5777         copy_ivec(dd->ci, tmp);
5778         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5779         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5780         if (debug)
5781         {
5782             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5783                     dd->rank, dim,
5784                     dd->neighbor[d][0],
5785                     dd->neighbor[d][1]);
5786         }
5787     }
5788
5789     if (fplog)
5790     {
5791         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5792                 dd->ndim,
5793                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5794                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5795     }
5796     switch (dd->ndim)
5797     {
5798         case 3:
5799             nzone  = dd_z3n;
5800             nzonep = dd_zp3n;
5801             for (i = 0; i < nzonep; i++)
5802             {
5803                 copy_ivec(dd_zp3[i], dd_zp[i]);
5804             }
5805             break;
5806         case 2:
5807             nzone  = dd_z2n;
5808             nzonep = dd_zp2n;
5809             for (i = 0; i < nzonep; i++)
5810             {
5811                 copy_ivec(dd_zp2[i], dd_zp[i]);
5812             }
5813             break;
5814         case 1:
5815             nzone  = dd_z1n;
5816             nzonep = dd_zp1n;
5817             for (i = 0; i < nzonep; i++)
5818             {
5819                 copy_ivec(dd_zp1[i], dd_zp[i]);
5820             }
5821             break;
5822         default:
5823             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5824             nzone  = 0;
5825             nzonep = 0;
5826     }
5827
5828     zones = &dd->comm->zones;
5829
5830     for (i = 0; i < nzone; i++)
5831     {
5832         m = 0;
5833         clear_ivec(zones->shift[i]);
5834         for (d = 0; d < dd->ndim; d++)
5835         {
5836             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5837         }
5838     }
5839
5840     zones->n = nzone;
5841     for (i = 0; i < nzone; i++)
5842     {
5843         for (d = 0; d < DIM; d++)
5844         {
5845             s[d] = dd->ci[d] - zones->shift[i][d];
5846             if (s[d] < 0)
5847             {
5848                 s[d] += dd->nc[d];
5849             }
5850             else if (s[d] >= dd->nc[d])
5851             {
5852                 s[d] -= dd->nc[d];
5853             }
5854         }
5855     }
5856     zones->nizone = nzonep;
5857     for (i = 0; i < zones->nizone; i++)
5858     {
5859         if (dd_zp[i][0] != i)
5860         {
5861             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5862         }
5863         izone     = &zones->izone[i];
5864         izone->j0 = dd_zp[i][1];
5865         izone->j1 = dd_zp[i][2];
5866         for (dim = 0; dim < DIM; dim++)
5867         {
5868             if (dd->nc[dim] == 1)
5869             {
5870                 /* All shifts should be allowed */
5871                 izone->shift0[dim] = -1;
5872                 izone->shift1[dim] = 1;
5873             }
5874             else
5875             {
5876                 /*
5877                    izone->shift0[d] = 0;
5878                    izone->shift1[d] = 0;
5879                    for(j=izone->j0; j<izone->j1; j++) {
5880                    if (dd->shift[j][d] > dd->shift[i][d])
5881                    izone->shift0[d] = -1;
5882                    if (dd->shift[j][d] < dd->shift[i][d])
5883                    izone->shift1[d] = 1;
5884                    }
5885                  */
5886
5887                 int shift_diff;
5888
5889                 /* Assume the shift are not more than 1 cell */
5890                 izone->shift0[dim] = 1;
5891                 izone->shift1[dim] = -1;
5892                 for (j = izone->j0; j < izone->j1; j++)
5893                 {
5894                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5895                     if (shift_diff < izone->shift0[dim])
5896                     {
5897                         izone->shift0[dim] = shift_diff;
5898                     }
5899                     if (shift_diff > izone->shift1[dim])
5900                     {
5901                         izone->shift1[dim] = shift_diff;
5902                     }
5903                 }
5904             }
5905         }
5906     }
5907
5908     if (dd->comm->eDLB != edlbNO)
5909     {
5910         snew(dd->comm->root, dd->ndim);
5911     }
5912
5913     if (dd->comm->bRecordLoad)
5914     {
5915         make_load_communicators(dd);
5916     }
5917 }
5918
5919 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
5920 {
5921     gmx_domdec_t      *dd;
5922     gmx_domdec_comm_t *comm;
5923     int                i, rank, *buf;
5924     ivec               periods;
5925 #ifdef GMX_MPI
5926     MPI_Comm           comm_cart;
5927 #endif
5928
5929     dd   = cr->dd;
5930     comm = dd->comm;
5931
5932 #ifdef GMX_MPI
5933     if (comm->bCartesianPP)
5934     {
5935         /* Set up cartesian communication for the particle-particle part */
5936         if (fplog)
5937         {
5938             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5939                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5940         }
5941
5942         for (i = 0; i < DIM; i++)
5943         {
5944             periods[i] = TRUE;
5945         }
5946         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5947                         &comm_cart);
5948         /* We overwrite the old communicator with the new cartesian one */
5949         cr->mpi_comm_mygroup = comm_cart;
5950     }
5951
5952     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5953     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5954
5955     if (comm->bCartesianPP_PME)
5956     {
5957         /* Since we want to use the original cartesian setup for sim,
5958          * and not the one after split, we need to make an index.
5959          */
5960         snew(comm->ddindex2ddnodeid, dd->nnodes);
5961         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5962         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5963         /* Get the rank of the DD master,
5964          * above we made sure that the master node is a PP node.
5965          */
5966         if (MASTER(cr))
5967         {
5968             rank = dd->rank;
5969         }
5970         else
5971         {
5972             rank = 0;
5973         }
5974         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5975     }
5976     else if (comm->bCartesianPP)
5977     {
5978         if (cr->npmenodes == 0)
5979         {
5980             /* The PP communicator is also
5981              * the communicator for this simulation
5982              */
5983             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5984         }
5985         cr->nodeid = dd->rank;
5986
5987         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5988
5989         /* We need to make an index to go from the coordinates
5990          * to the nodeid of this simulation.
5991          */
5992         snew(comm->ddindex2simnodeid, dd->nnodes);
5993         snew(buf, dd->nnodes);
5994         if (cr->duty & DUTY_PP)
5995         {
5996             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5997         }
5998         /* Communicate the ddindex to simulation nodeid index */
5999         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6000                       cr->mpi_comm_mysim);
6001         sfree(buf);
6002
6003         /* Determine the master coordinates and rank.
6004          * The DD master should be the same node as the master of this sim.
6005          */
6006         for (i = 0; i < dd->nnodes; i++)
6007         {
6008             if (comm->ddindex2simnodeid[i] == 0)
6009             {
6010                 ddindex2xyz(dd->nc, i, dd->master_ci);
6011                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
6012             }
6013         }
6014         if (debug)
6015         {
6016             fprintf(debug, "The master rank is %d\n", dd->masterrank);
6017         }
6018     }
6019     else
6020     {
6021         /* No Cartesian communicators */
6022         /* We use the rank in dd->comm->all as DD index */
6023         ddindex2xyz(dd->nc, dd->rank, dd->ci);
6024         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
6025         dd->masterrank = 0;
6026         clear_ivec(dd->master_ci);
6027     }
6028 #endif
6029
6030     if (fplog)
6031     {
6032         fprintf(fplog,
6033                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
6034                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6035     }
6036     if (debug)
6037     {
6038         fprintf(debug,
6039                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
6040                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6041     }
6042 }
6043
6044 static void receive_ddindex2simnodeid(t_commrec *cr)
6045 {
6046     gmx_domdec_t      *dd;
6047
6048     gmx_domdec_comm_t *comm;
6049     int               *buf;
6050
6051     dd   = cr->dd;
6052     comm = dd->comm;
6053
6054 #ifdef GMX_MPI
6055     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
6056     {
6057         snew(comm->ddindex2simnodeid, dd->nnodes);
6058         snew(buf, dd->nnodes);
6059         if (cr->duty & DUTY_PP)
6060         {
6061             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
6062         }
6063 #ifdef GMX_MPI
6064         /* Communicate the ddindex to simulation nodeid index */
6065         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6066                       cr->mpi_comm_mysim);
6067 #endif
6068         sfree(buf);
6069     }
6070 #endif
6071 }
6072
6073 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
6074                                                      int ncg, int natoms)
6075 {
6076     gmx_domdec_master_t *ma;
6077     int                  i;
6078
6079     snew(ma, 1);
6080
6081     snew(ma->ncg, dd->nnodes);
6082     snew(ma->index, dd->nnodes+1);
6083     snew(ma->cg, ncg);
6084     snew(ma->nat, dd->nnodes);
6085     snew(ma->ibuf, dd->nnodes*2);
6086     snew(ma->cell_x, DIM);
6087     for (i = 0; i < DIM; i++)
6088     {
6089         snew(ma->cell_x[i], dd->nc[i]+1);
6090     }
6091
6092     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
6093     {
6094         ma->vbuf = NULL;
6095     }
6096     else
6097     {
6098         snew(ma->vbuf, natoms);
6099     }
6100
6101     return ma;
6102 }
6103
6104 static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
6105                                int gmx_unused reorder)
6106 {
6107     gmx_domdec_t      *dd;
6108     gmx_domdec_comm_t *comm;
6109     int                i, rank;
6110     gmx_bool           bDiv[DIM];
6111     ivec               periods;
6112 #ifdef GMX_MPI
6113     MPI_Comm           comm_cart;
6114 #endif
6115
6116     dd   = cr->dd;
6117     comm = dd->comm;
6118
6119     if (comm->bCartesianPP)
6120     {
6121         for (i = 1; i < DIM; i++)
6122         {
6123             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
6124         }
6125         if (bDiv[YY] || bDiv[ZZ])
6126         {
6127             comm->bCartesianPP_PME = TRUE;
6128             /* If we have 2D PME decomposition, which is always in x+y,
6129              * we stack the PME only nodes in z.
6130              * Otherwise we choose the direction that provides the thinnest slab
6131              * of PME only nodes as this will have the least effect
6132              * on the PP communication.
6133              * But for the PME communication the opposite might be better.
6134              */
6135             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
6136                              !bDiv[YY] ||
6137                              dd->nc[YY] > dd->nc[ZZ]))
6138             {
6139                 comm->cartpmedim = ZZ;
6140             }
6141             else
6142             {
6143                 comm->cartpmedim = YY;
6144             }
6145             comm->ntot[comm->cartpmedim]
6146                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
6147         }
6148         else if (fplog)
6149         {
6150             fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
6151             fprintf(fplog,
6152                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
6153         }
6154     }
6155
6156 #ifdef GMX_MPI
6157     if (comm->bCartesianPP_PME)
6158     {
6159         if (fplog)
6160         {
6161             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
6162         }
6163
6164         for (i = 0; i < DIM; i++)
6165         {
6166             periods[i] = TRUE;
6167         }
6168         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
6169                         &comm_cart);
6170
6171         MPI_Comm_rank(comm_cart, &rank);
6172         if (MASTERNODE(cr) && rank != 0)
6173         {
6174             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
6175         }
6176
6177         /* With this assigment we loose the link to the original communicator
6178          * which will usually be MPI_COMM_WORLD, unless have multisim.
6179          */
6180         cr->mpi_comm_mysim = comm_cart;
6181         cr->sim_nodeid     = rank;
6182
6183         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
6184
6185         if (fplog)
6186         {
6187             fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
6188                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6189         }
6190
6191         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6192         {
6193             cr->duty = DUTY_PP;
6194         }
6195         if (cr->npmenodes == 0 ||
6196             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6197         {
6198             cr->duty = DUTY_PME;
6199         }
6200
6201         /* Split the sim communicator into PP and PME only nodes */
6202         MPI_Comm_split(cr->mpi_comm_mysim,
6203                        cr->duty,
6204                        dd_index(comm->ntot, dd->ci),
6205                        &cr->mpi_comm_mygroup);
6206     }
6207     else
6208     {
6209         switch (dd_node_order)
6210         {
6211             case ddnoPP_PME:
6212                 if (fplog)
6213                 {
6214                     fprintf(fplog, "Order of the ranks: PP first, PME last\n");
6215                 }
6216                 break;
6217             case ddnoINTERLEAVE:
6218                 /* Interleave the PP-only and PME-only nodes,
6219                  * as on clusters with dual-core machines this will double
6220                  * the communication bandwidth of the PME processes
6221                  * and thus speed up the PP <-> PME and inter PME communication.
6222                  */
6223                 if (fplog)
6224                 {
6225                     fprintf(fplog, "Interleaving PP and PME ranks\n");
6226                 }
6227                 comm->pmenodes = dd_pmenodes(cr);
6228                 break;
6229             case ddnoCARTESIAN:
6230                 break;
6231             default:
6232                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
6233         }
6234
6235         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6236         {
6237             cr->duty = DUTY_PME;
6238         }
6239         else
6240         {
6241             cr->duty = DUTY_PP;
6242         }
6243
6244         /* Split the sim communicator into PP and PME only nodes */
6245         MPI_Comm_split(cr->mpi_comm_mysim,
6246                        cr->duty,
6247                        cr->nodeid,
6248                        &cr->mpi_comm_mygroup);
6249         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6250     }
6251 #endif
6252
6253     if (fplog)
6254     {
6255         fprintf(fplog, "This rank does only %s work.\n\n",
6256                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6257     }
6258 }
6259
6260 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6261 {
6262     gmx_domdec_t      *dd;
6263     gmx_domdec_comm_t *comm;
6264     int                CartReorder;
6265
6266     dd   = cr->dd;
6267     comm = dd->comm;
6268
6269     copy_ivec(dd->nc, comm->ntot);
6270
6271     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6272     comm->bCartesianPP_PME = FALSE;
6273
6274     /* Reorder the nodes by default. This might change the MPI ranks.
6275      * Real reordering is only supported on very few architectures,
6276      * Blue Gene is one of them.
6277      */
6278     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6279
6280     if (cr->npmenodes > 0)
6281     {
6282         /* Split the communicator into a PP and PME part */
6283         split_communicator(fplog, cr, dd_node_order, CartReorder);
6284         if (comm->bCartesianPP_PME)
6285         {
6286             /* We (possibly) reordered the nodes in split_communicator,
6287              * so it is no longer required in make_pp_communicator.
6288              */
6289             CartReorder = FALSE;
6290         }
6291     }
6292     else
6293     {
6294         /* All nodes do PP and PME */
6295 #ifdef GMX_MPI
6296         /* We do not require separate communicators */
6297         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6298 #endif
6299     }
6300
6301     if (cr->duty & DUTY_PP)
6302     {
6303         /* Copy or make a new PP communicator */
6304         make_pp_communicator(fplog, cr, CartReorder);
6305     }
6306     else
6307     {
6308         receive_ddindex2simnodeid(cr);
6309     }
6310
6311     if (!(cr->duty & DUTY_PME))
6312     {
6313         /* Set up the commnuication to our PME node */
6314         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6315         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6316         if (debug)
6317         {
6318             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6319                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6320         }
6321     }
6322     else
6323     {
6324         dd->pme_nodeid = -1;
6325     }
6326
6327     if (DDMASTER(dd))
6328     {
6329         dd->ma = init_gmx_domdec_master_t(dd,
6330                                           comm->cgs_gl.nr,
6331                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6332     }
6333 }
6334
6335 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6336 {
6337     real  *slb_frac, tot;
6338     int    i, n;
6339     double dbl;
6340
6341     slb_frac = NULL;
6342     if (nc > 1 && size_string != NULL)
6343     {
6344         if (fplog)
6345         {
6346             fprintf(fplog, "Using static load balancing for the %s direction\n",
6347                     dir);
6348         }
6349         snew(slb_frac, nc);
6350         tot = 0;
6351         for (i = 0; i < nc; i++)
6352         {
6353             dbl = 0;
6354             sscanf(size_string, "%lf%n", &dbl, &n);
6355             if (dbl == 0)
6356             {
6357                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6358             }
6359             slb_frac[i]  = dbl;
6360             size_string += n;
6361             tot         += slb_frac[i];
6362         }
6363         /* Normalize */
6364         if (fplog)
6365         {
6366             fprintf(fplog, "Relative cell sizes:");
6367         }
6368         for (i = 0; i < nc; i++)
6369         {
6370             slb_frac[i] /= tot;
6371             if (fplog)
6372             {
6373                 fprintf(fplog, " %5.3f", slb_frac[i]);
6374             }
6375         }
6376         if (fplog)
6377         {
6378             fprintf(fplog, "\n");
6379         }
6380     }
6381
6382     return slb_frac;
6383 }
6384
6385 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6386 {
6387     int                  n, nmol, ftype;
6388     gmx_mtop_ilistloop_t iloop;
6389     t_ilist             *il;
6390
6391     n     = 0;
6392     iloop = gmx_mtop_ilistloop_init(mtop);
6393     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6394     {
6395         for (ftype = 0; ftype < F_NRE; ftype++)
6396         {
6397             if ((interaction_function[ftype].flags & IF_BOND) &&
6398                 NRAL(ftype) >  2)
6399             {
6400                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6401             }
6402         }
6403     }
6404
6405     return n;
6406 }
6407
6408 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6409 {
6410     char *val;
6411     int   nst;
6412
6413     nst = def;
6414     val = getenv(env_var);
6415     if (val)
6416     {
6417         if (sscanf(val, "%d", &nst) <= 0)
6418         {
6419             nst = 1;
6420         }
6421         if (fplog)
6422         {
6423             fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6424                     env_var, val, nst);
6425         }
6426     }
6427
6428     return nst;
6429 }
6430
6431 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6432 {
6433     if (MASTER(cr))
6434     {
6435         fprintf(stderr, "\n%s\n", warn_string);
6436     }
6437     if (fplog)
6438     {
6439         fprintf(fplog, "\n%s\n", warn_string);
6440     }
6441 }
6442
6443 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6444                                   t_inputrec *ir, FILE *fplog)
6445 {
6446     if (ir->ePBC == epbcSCREW &&
6447         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6448     {
6449         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6450     }
6451
6452     if (ir->ns_type == ensSIMPLE)
6453     {
6454         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6455     }
6456
6457     if (ir->nstlist == 0)
6458     {
6459         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6460     }
6461
6462     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6463     {
6464         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6465     }
6466 }
6467
6468 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6469 {
6470     int  di, d;
6471     real r;
6472
6473     r = ddbox->box_size[XX];
6474     for (di = 0; di < dd->ndim; di++)
6475     {
6476         d = dd->dim[di];
6477         /* Check using the initial average cell size */
6478         r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6479     }
6480
6481     return r;
6482 }
6483
6484 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6485                              const char *dlb_opt, gmx_bool bRecordLoad,
6486                              unsigned long Flags, t_inputrec *ir)
6487 {
6488     gmx_domdec_t *dd;
6489     int           eDLB = -1;
6490     char          buf[STRLEN];
6491
6492     switch (dlb_opt[0])
6493     {
6494         case 'a': eDLB = edlbAUTO; break;
6495         case 'n': eDLB = edlbNO;   break;
6496         case 'y': eDLB = edlbYES;  break;
6497         default: gmx_incons("Unknown dlb_opt");
6498     }
6499
6500     if (Flags & MD_RERUN)
6501     {
6502         return edlbNO;
6503     }
6504
6505     if (!EI_DYNAMICS(ir->eI))
6506     {
6507         if (eDLB == edlbYES)
6508         {
6509             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6510             dd_warning(cr, fplog, buf);
6511         }
6512
6513         return edlbNO;
6514     }
6515
6516     if (!bRecordLoad)
6517     {
6518         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6519
6520         return edlbNO;
6521     }
6522
6523     if (Flags & MD_REPRODUCIBLE)
6524     {
6525         switch (eDLB)
6526         {
6527             case edlbNO:
6528                 break;
6529             case edlbAUTO:
6530                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6531                 eDLB = edlbNO;
6532                 break;
6533             case edlbYES:
6534                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6535                 break;
6536             default:
6537                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6538                 break;
6539         }
6540     }
6541
6542     return eDLB;
6543 }
6544
6545 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6546 {
6547     int dim;
6548
6549     dd->ndim = 0;
6550     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6551     {
6552         /* Decomposition order z,y,x */
6553         if (fplog)
6554         {
6555             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6556         }
6557         for (dim = DIM-1; dim >= 0; dim--)
6558         {
6559             if (dd->nc[dim] > 1)
6560             {
6561                 dd->dim[dd->ndim++] = dim;
6562             }
6563         }
6564     }
6565     else
6566     {
6567         /* Decomposition order x,y,z */
6568         for (dim = 0; dim < DIM; dim++)
6569         {
6570             if (dd->nc[dim] > 1)
6571             {
6572                 dd->dim[dd->ndim++] = dim;
6573             }
6574         }
6575     }
6576 }
6577
6578 static gmx_domdec_comm_t *init_dd_comm()
6579 {
6580     gmx_domdec_comm_t *comm;
6581     int                i;
6582
6583     snew(comm, 1);
6584     snew(comm->cggl_flag, DIM*2);
6585     snew(comm->cgcm_state, DIM*2);
6586     for (i = 0; i < DIM*2; i++)
6587     {
6588         comm->cggl_flag_nalloc[i]  = 0;
6589         comm->cgcm_state_nalloc[i] = 0;
6590     }
6591
6592     comm->nalloc_int = 0;
6593     comm->buf_int    = NULL;
6594
6595     vec_rvec_init(&comm->vbuf);
6596
6597     comm->n_load_have    = 0;
6598     comm->n_load_collect = 0;
6599
6600     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6601     {
6602         comm->sum_nat[i] = 0;
6603     }
6604     comm->ndecomp   = 0;
6605     comm->nload     = 0;
6606     comm->load_step = 0;
6607     comm->load_sum  = 0;
6608     comm->load_max  = 0;
6609     clear_ivec(comm->load_lim);
6610     comm->load_mdf  = 0;
6611     comm->load_pme  = 0;
6612
6613     return comm;
6614 }
6615
6616 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6617                                         unsigned long Flags,
6618                                         ivec nc,
6619                                         real comm_distance_min, real rconstr,
6620                                         const char *dlb_opt, real dlb_scale,
6621                                         const char *sizex, const char *sizey, const char *sizez,
6622                                         gmx_mtop_t *mtop, t_inputrec *ir,
6623                                         matrix box, rvec *x,
6624                                         gmx_ddbox_t *ddbox,
6625                                         int *npme_x, int *npme_y)
6626 {
6627     gmx_domdec_t      *dd;
6628     gmx_domdec_comm_t *comm;
6629     int                recload;
6630     int                d, i, j;
6631     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6632     gmx_bool           bC;
6633     char               buf[STRLEN];
6634
6635     if (fplog)
6636     {
6637         fprintf(fplog,
6638                 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
6639     }
6640
6641     snew(dd, 1);
6642
6643     dd->comm = init_dd_comm();
6644     comm     = dd->comm;
6645     snew(comm->cggl_flag, DIM*2);
6646     snew(comm->cgcm_state, DIM*2);
6647
6648     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6649     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6650
6651     dd->bSendRecv2      = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
6652     comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
6653     comm->eFlop         = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
6654     recload             = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
6655     comm->nstSortCG     = dd_getenv(fplog, "GMX_DD_NST_SORT_CHARGE_GROUPS", 1);
6656     comm->nstDDDump     = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
6657     comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
6658     comm->DD_debug      = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
6659
6660     dd->pme_recv_f_alloc = 0;
6661     dd->pme_recv_f_buf   = NULL;
6662
6663     if (dd->bSendRecv2 && fplog)
6664     {
6665         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6666     }
6667     if (comm->eFlop)
6668     {
6669         if (fplog)
6670         {
6671             fprintf(fplog, "Will load balance based on FLOP count\n");
6672         }
6673         if (comm->eFlop > 1)
6674         {
6675             srand(1+cr->nodeid);
6676         }
6677         comm->bRecordLoad = TRUE;
6678     }
6679     else
6680     {
6681         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6682
6683     }
6684
6685     /* Initialize to GPU share count to 0, might change later */
6686     comm->nrank_gpu_shared = 0;
6687
6688     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6689
6690     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6691     if (fplog)
6692     {
6693         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6694     }
6695     dd->bGridJump              = comm->bDynLoadBal;
6696     comm->bPMELoadBalDLBLimits = FALSE;
6697
6698     if (comm->nstSortCG)
6699     {
6700         if (fplog)
6701         {
6702             if (comm->nstSortCG == 1)
6703             {
6704                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6705             }
6706             else
6707             {
6708                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6709                         comm->nstSortCG);
6710             }
6711         }
6712         snew(comm->sort, 1);
6713     }
6714     else
6715     {
6716         if (fplog)
6717         {
6718             fprintf(fplog, "Will not sort the charge groups\n");
6719         }
6720     }
6721
6722     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6723
6724     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6725     if (comm->bInterCGBondeds)
6726     {
6727         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6728     }
6729     else
6730     {
6731         comm->bInterCGMultiBody = FALSE;
6732     }
6733
6734     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6735     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6736
6737     if (ir->rlistlong == 0)
6738     {
6739         /* Set the cut-off to some very large value,
6740          * so we don't need if statements everywhere in the code.
6741          * We use sqrt, since the cut-off is squared in some places.
6742          */
6743         comm->cutoff   = GMX_CUTOFF_INF;
6744     }
6745     else
6746     {
6747         comm->cutoff   = ir->rlistlong;
6748     }
6749     comm->cutoff_mbody = 0;
6750
6751     comm->cellsize_limit = 0;
6752     comm->bBondComm      = FALSE;
6753
6754     /* Atoms should be able to move by up to half the list buffer size (if > 0)
6755      * within nstlist steps. Since boundaries are allowed to displace by half
6756      * a cell size, DD cells should be at least the size of the list buffer.
6757      */
6758     comm->cellsize_limit = max(comm->cellsize_limit,
6759                                ir->rlistlong - max(ir->rvdw, ir->rcoulomb));
6760
6761     if (comm->bInterCGBondeds)
6762     {
6763         if (comm_distance_min > 0)
6764         {
6765             comm->cutoff_mbody = comm_distance_min;
6766             if (Flags & MD_DDBONDCOMM)
6767             {
6768                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6769             }
6770             else
6771             {
6772                 comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
6773             }
6774             r_bonded_limit = comm->cutoff_mbody;
6775         }
6776         else if (ir->bPeriodicMols)
6777         {
6778             /* Can not easily determine the required cut-off */
6779             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6780             comm->cutoff_mbody = comm->cutoff/2;
6781             r_bonded_limit     = comm->cutoff_mbody;
6782         }
6783         else
6784         {
6785             if (MASTER(cr))
6786             {
6787                 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6788                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6789             }
6790             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6791             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6792
6793             /* We use an initial margin of 10% for the minimum cell size,
6794              * except when we are just below the non-bonded cut-off.
6795              */
6796             if (Flags & MD_DDBONDCOMM)
6797             {
6798                 if (max(r_2b, r_mb) > comm->cutoff)
6799                 {
6800                     r_bonded        = max(r_2b, r_mb);
6801                     r_bonded_limit  = 1.1*r_bonded;
6802                     comm->bBondComm = TRUE;
6803                 }
6804                 else
6805                 {
6806                     r_bonded       = r_mb;
6807                     r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
6808                 }
6809                 /* We determine cutoff_mbody later */
6810             }
6811             else
6812             {
6813                 /* No special bonded communication,
6814                  * simply increase the DD cut-off.
6815                  */
6816                 r_bonded_limit     = 1.1*max(r_2b, r_mb);
6817                 comm->cutoff_mbody = r_bonded_limit;
6818                 comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
6819             }
6820         }
6821         comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
6822         if (fplog)
6823         {
6824             fprintf(fplog,
6825                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6826                     comm->cellsize_limit);
6827         }
6828     }
6829
6830     if (dd->bInterCGcons && rconstr <= 0)
6831     {
6832         /* There is a cell size limit due to the constraints (P-LINCS) */
6833         rconstr = constr_r_max(fplog, mtop, ir);
6834         if (fplog)
6835         {
6836             fprintf(fplog,
6837                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6838                     rconstr);
6839             if (rconstr > comm->cellsize_limit)
6840             {
6841                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6842             }
6843         }
6844     }
6845     else if (rconstr > 0 && fplog)
6846     {
6847         /* Here we do not check for dd->bInterCGcons,
6848          * because one can also set a cell size limit for virtual sites only
6849          * and at this point we don't know yet if there are intercg v-sites.
6850          */
6851         fprintf(fplog,
6852                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6853                 rconstr);
6854     }
6855     comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
6856
6857     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6858
6859     if (nc[XX] > 0)
6860     {
6861         copy_ivec(nc, dd->nc);
6862         set_dd_dim(fplog, dd);
6863         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6864
6865         if (cr->npmenodes == -1)
6866         {
6867             cr->npmenodes = 0;
6868         }
6869         acs = average_cellsize_min(dd, ddbox);
6870         if (acs < comm->cellsize_limit)
6871         {
6872             if (fplog)
6873             {
6874                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6875             }
6876             gmx_fatal_collective(FARGS, cr, NULL,
6877                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6878                                  acs, comm->cellsize_limit);
6879         }
6880     }
6881     else
6882     {
6883         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6884
6885         /* We need to choose the optimal DD grid and possibly PME nodes */
6886         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6887                                comm->eDLB != edlbNO, dlb_scale,
6888                                comm->cellsize_limit, comm->cutoff,
6889                                comm->bInterCGBondeds);
6890
6891         if (dd->nc[XX] == 0)
6892         {
6893             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6894             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6895                     !bC ? "-rdd" : "-rcon",
6896                     comm->eDLB != edlbNO ? " or -dds" : "",
6897                     bC ? " or your LINCS settings" : "");
6898
6899             gmx_fatal_collective(FARGS, cr, NULL,
6900                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6901                                  "%s\n"
6902                                  "Look in the log file for details on the domain decomposition",
6903                                  cr->nnodes-cr->npmenodes, limit, buf);
6904         }
6905         set_dd_dim(fplog, dd);
6906     }
6907
6908     if (fplog)
6909     {
6910         fprintf(fplog,
6911                 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6912                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6913     }
6914
6915     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6916     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6917     {
6918         gmx_fatal_collective(FARGS, cr, NULL,
6919                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6920                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6921     }
6922     if (cr->npmenodes > dd->nnodes)
6923     {
6924         gmx_fatal_collective(FARGS, cr, NULL,
6925                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6926     }
6927     if (cr->npmenodes > 0)
6928     {
6929         comm->npmenodes = cr->npmenodes;
6930     }
6931     else
6932     {
6933         comm->npmenodes = dd->nnodes;
6934     }
6935
6936     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6937     {
6938         /* The following choices should match those
6939          * in comm_cost_est in domdec_setup.c.
6940          * Note that here the checks have to take into account
6941          * that the decomposition might occur in a different order than xyz
6942          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6943          * in which case they will not match those in comm_cost_est,
6944          * but since that is mainly for testing purposes that's fine.
6945          */
6946         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6947             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6948             getenv("GMX_PMEONEDD") == NULL)
6949         {
6950             comm->npmedecompdim = 2;
6951             comm->npmenodes_x   = dd->nc[XX];
6952             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6953         }
6954         else
6955         {
6956             /* In case nc is 1 in both x and y we could still choose to
6957              * decompose pme in y instead of x, but we use x for simplicity.
6958              */
6959             comm->npmedecompdim = 1;
6960             if (dd->dim[0] == YY)
6961             {
6962                 comm->npmenodes_x = 1;
6963                 comm->npmenodes_y = comm->npmenodes;
6964             }
6965             else
6966             {
6967                 comm->npmenodes_x = comm->npmenodes;
6968                 comm->npmenodes_y = 1;
6969             }
6970         }
6971         if (fplog)
6972         {
6973             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6974                     comm->npmenodes_x, comm->npmenodes_y, 1);
6975         }
6976     }
6977     else
6978     {
6979         comm->npmedecompdim = 0;
6980         comm->npmenodes_x   = 0;
6981         comm->npmenodes_y   = 0;
6982     }
6983
6984     /* Technically we don't need both of these,
6985      * but it simplifies code not having to recalculate it.
6986      */
6987     *npme_x = comm->npmenodes_x;
6988     *npme_y = comm->npmenodes_y;
6989
6990     snew(comm->slb_frac, DIM);
6991     if (comm->eDLB == edlbNO)
6992     {
6993         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6994         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6995         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6996     }
6997
6998     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6999     {
7000         if (comm->bBondComm || comm->eDLB != edlbNO)
7001         {
7002             /* Set the bonded communication distance to halfway
7003              * the minimum and the maximum,
7004              * since the extra communication cost is nearly zero.
7005              */
7006             acs                = average_cellsize_min(dd, ddbox);
7007             comm->cutoff_mbody = 0.5*(r_bonded + acs);
7008             if (comm->eDLB != edlbNO)
7009             {
7010                 /* Check if this does not limit the scaling */
7011                 comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
7012             }
7013             if (!comm->bBondComm)
7014             {
7015                 /* Without bBondComm do not go beyond the n.b. cut-off */
7016                 comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
7017                 if (comm->cellsize_limit >= comm->cutoff)
7018                 {
7019                     /* We don't loose a lot of efficieny
7020                      * when increasing it to the n.b. cut-off.
7021                      * It can even be slightly faster, because we need
7022                      * less checks for the communication setup.
7023                      */
7024                     comm->cutoff_mbody = comm->cutoff;
7025                 }
7026             }
7027             /* Check if we did not end up below our original limit */
7028             comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
7029
7030             if (comm->cutoff_mbody > comm->cellsize_limit)
7031             {
7032                 comm->cellsize_limit = comm->cutoff_mbody;
7033             }
7034         }
7035         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
7036     }
7037
7038     if (debug)
7039     {
7040         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
7041                 "cellsize limit %f\n",
7042                 comm->bBondComm, comm->cellsize_limit);
7043     }
7044
7045     if (MASTER(cr))
7046     {
7047         check_dd_restrictions(cr, dd, ir, fplog);
7048     }
7049
7050     comm->partition_step = INT_MIN;
7051     dd->ddp_count        = 0;
7052
7053     clear_dd_cycle_counts(dd);
7054
7055     return dd;
7056 }
7057
7058 static void set_dlb_limits(gmx_domdec_t *dd)
7059
7060 {
7061     int d;
7062
7063     for (d = 0; d < dd->ndim; d++)
7064     {
7065         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
7066         dd->comm->cellsize_min[dd->dim[d]] =
7067             dd->comm->cellsize_min_dlb[dd->dim[d]];
7068     }
7069 }
7070
7071
7072 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
7073 {
7074     gmx_domdec_t      *dd;
7075     gmx_domdec_comm_t *comm;
7076     real               cellsize_min;
7077     int                d, nc, i;
7078     char               buf[STRLEN];
7079
7080     dd   = cr->dd;
7081     comm = dd->comm;
7082
7083     if (fplog)
7084     {
7085         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
7086     }
7087
7088     cellsize_min = comm->cellsize_min[dd->dim[0]];
7089     for (d = 1; d < dd->ndim; d++)
7090     {
7091         cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
7092     }
7093
7094     if (cellsize_min < comm->cellsize_limit*1.05)
7095     {
7096         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
7097
7098         /* Change DLB from "auto" to "no". */
7099         comm->eDLB = edlbNO;
7100
7101         return;
7102     }
7103
7104     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
7105     comm->bDynLoadBal = TRUE;
7106     dd->bGridJump     = TRUE;
7107
7108     set_dlb_limits(dd);
7109
7110     /* We can set the required cell size info here,
7111      * so we do not need to communicate this.
7112      * The grid is completely uniform.
7113      */
7114     for (d = 0; d < dd->ndim; d++)
7115     {
7116         if (comm->root[d])
7117         {
7118             comm->load[d].sum_m = comm->load[d].sum;
7119
7120             nc = dd->nc[dd->dim[d]];
7121             for (i = 0; i < nc; i++)
7122             {
7123                 comm->root[d]->cell_f[i]    = i/(real)nc;
7124                 if (d > 0)
7125                 {
7126                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
7127                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
7128                 }
7129             }
7130             comm->root[d]->cell_f[nc] = 1.0;
7131         }
7132     }
7133 }
7134
7135 static char *init_bLocalCG(gmx_mtop_t *mtop)
7136 {
7137     int   ncg, cg;
7138     char *bLocalCG;
7139
7140     ncg = ncg_mtop(mtop);
7141     snew(bLocalCG, ncg);
7142     for (cg = 0; cg < ncg; cg++)
7143     {
7144         bLocalCG[cg] = FALSE;
7145     }
7146
7147     return bLocalCG;
7148 }
7149
7150 void dd_init_bondeds(FILE *fplog,
7151                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7152                      gmx_vsite_t *vsite,
7153                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7154 {
7155     gmx_domdec_comm_t *comm;
7156     gmx_bool           bBondComm;
7157     int                d;
7158
7159     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
7160
7161     comm = dd->comm;
7162
7163     if (comm->bBondComm)
7164     {
7165         /* Communicate atoms beyond the cut-off for bonded interactions */
7166         comm = dd->comm;
7167
7168         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7169
7170         comm->bLocalCG = init_bLocalCG(mtop);
7171     }
7172     else
7173     {
7174         /* Only communicate atoms based on cut-off */
7175         comm->cglink   = NULL;
7176         comm->bLocalCG = NULL;
7177     }
7178 }
7179
7180 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7181                               t_inputrec *ir,
7182                               gmx_bool bDynLoadBal, real dlb_scale,
7183                               gmx_ddbox_t *ddbox)
7184 {
7185     gmx_domdec_comm_t *comm;
7186     int                d;
7187     ivec               np;
7188     real               limit, shrink;
7189     char               buf[64];
7190
7191     if (fplog == NULL)
7192     {
7193         return;
7194     }
7195
7196     comm = dd->comm;
7197
7198     if (bDynLoadBal)
7199     {
7200         fprintf(fplog, "The maximum number of communication pulses is:");
7201         for (d = 0; d < dd->ndim; d++)
7202         {
7203             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7204         }
7205         fprintf(fplog, "\n");
7206         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7207         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7208         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7209         for (d = 0; d < DIM; d++)
7210         {
7211             if (dd->nc[d] > 1)
7212             {
7213                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7214                 {
7215                     shrink = 0;
7216                 }
7217                 else
7218                 {
7219                     shrink =
7220                         comm->cellsize_min_dlb[d]/
7221                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7222                 }
7223                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7224             }
7225         }
7226         fprintf(fplog, "\n");
7227     }
7228     else
7229     {
7230         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
7231         fprintf(fplog, "The initial number of communication pulses is:");
7232         for (d = 0; d < dd->ndim; d++)
7233         {
7234             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7235         }
7236         fprintf(fplog, "\n");
7237         fprintf(fplog, "The initial domain decomposition cell size is:");
7238         for (d = 0; d < DIM; d++)
7239         {
7240             if (dd->nc[d] > 1)
7241             {
7242                 fprintf(fplog, " %c %.2f nm",
7243                         dim2char(d), dd->comm->cellsize_min[d]);
7244             }
7245         }
7246         fprintf(fplog, "\n\n");
7247     }
7248
7249     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7250     {
7251         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7252         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7253                 "non-bonded interactions", "", comm->cutoff);
7254
7255         if (bDynLoadBal)
7256         {
7257             limit = dd->comm->cellsize_limit;
7258         }
7259         else
7260         {
7261             if (dynamic_dd_box(ddbox, ir))
7262             {
7263                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7264             }
7265             limit = dd->comm->cellsize_min[XX];
7266             for (d = 1; d < DIM; d++)
7267             {
7268                 limit = min(limit, dd->comm->cellsize_min[d]);
7269             }
7270         }
7271
7272         if (comm->bInterCGBondeds)
7273         {
7274             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7275                     "two-body bonded interactions", "(-rdd)",
7276                     max(comm->cutoff, comm->cutoff_mbody));
7277             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7278                     "multi-body bonded interactions", "(-rdd)",
7279                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
7280         }
7281         if (dd->vsite_comm)
7282         {
7283             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7284                     "virtual site constructions", "(-rcon)", limit);
7285         }
7286         if (dd->constraint_comm)
7287         {
7288             sprintf(buf, "atoms separated by up to %d constraints",
7289                     1+ir->nProjOrder);
7290             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7291                     buf, "(-rcon)", limit);
7292         }
7293         fprintf(fplog, "\n");
7294     }
7295
7296     fflush(fplog);
7297 }
7298
7299 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7300                                 real               dlb_scale,
7301                                 const t_inputrec  *ir,
7302                                 const gmx_ddbox_t *ddbox)
7303 {
7304     gmx_domdec_comm_t *comm;
7305     int                d, dim, npulse, npulse_d_max, npulse_d;
7306     gmx_bool           bNoCutOff;
7307
7308     comm = dd->comm;
7309
7310     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7311
7312     /* Determine the maximum number of comm. pulses in one dimension */
7313
7314     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7315
7316     /* Determine the maximum required number of grid pulses */
7317     if (comm->cellsize_limit >= comm->cutoff)
7318     {
7319         /* Only a single pulse is required */
7320         npulse = 1;
7321     }
7322     else if (!bNoCutOff && comm->cellsize_limit > 0)
7323     {
7324         /* We round down slightly here to avoid overhead due to the latency
7325          * of extra communication calls when the cut-off
7326          * would be only slightly longer than the cell size.
7327          * Later cellsize_limit is redetermined,
7328          * so we can not miss interactions due to this rounding.
7329          */
7330         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7331     }
7332     else
7333     {
7334         /* There is no cell size limit */
7335         npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7336     }
7337
7338     if (!bNoCutOff && npulse > 1)
7339     {
7340         /* See if we can do with less pulses, based on dlb_scale */
7341         npulse_d_max = 0;
7342         for (d = 0; d < dd->ndim; d++)
7343         {
7344             dim      = dd->dim[d];
7345             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7346                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7347             npulse_d_max = max(npulse_d_max, npulse_d);
7348         }
7349         npulse = min(npulse, npulse_d_max);
7350     }
7351
7352     /* This env var can override npulse */
7353     d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7354     if (d > 0)
7355     {
7356         npulse = d;
7357     }
7358
7359     comm->maxpulse       = 1;
7360     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7361     for (d = 0; d < dd->ndim; d++)
7362     {
7363         comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
7364         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7365         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7366         comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
7367         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7368         {
7369             comm->bVacDLBNoLimit = FALSE;
7370         }
7371     }
7372
7373     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7374     if (!comm->bVacDLBNoLimit)
7375     {
7376         comm->cellsize_limit = max(comm->cellsize_limit,
7377                                    comm->cutoff/comm->maxpulse);
7378     }
7379     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7380     /* Set the minimum cell size for each DD dimension */
7381     for (d = 0; d < dd->ndim; d++)
7382     {
7383         if (comm->bVacDLBNoLimit ||
7384             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7385         {
7386             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7387         }
7388         else
7389         {
7390             comm->cellsize_min_dlb[dd->dim[d]] =
7391                 comm->cutoff/comm->cd[d].np_dlb;
7392         }
7393     }
7394     if (comm->cutoff_mbody <= 0)
7395     {
7396         comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
7397     }
7398     if (comm->bDynLoadBal)
7399     {
7400         set_dlb_limits(dd);
7401     }
7402 }
7403
7404 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7405 {
7406     /* If each molecule is a single charge group
7407      * or we use domain decomposition for each periodic dimension,
7408      * we do not need to take pbc into account for the bonded interactions.
7409      */
7410     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7411             !(dd->nc[XX] > 1 &&
7412               dd->nc[YY] > 1 &&
7413               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7414 }
7415
7416 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7417                        t_inputrec *ir, gmx_ddbox_t *ddbox)
7418 {
7419     gmx_domdec_comm_t *comm;
7420     int                natoms_tot;
7421     real               vol_frac;
7422
7423     comm = dd->comm;
7424
7425     /* Initialize the thread data.
7426      * This can not be done in init_domain_decomposition,
7427      * as the numbers of threads is determined later.
7428      */
7429     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7430     if (comm->nth > 1)
7431     {
7432         snew(comm->dth, comm->nth);
7433     }
7434
7435     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7436     {
7437         init_ddpme(dd, &comm->ddpme[0], 0);
7438         if (comm->npmedecompdim >= 2)
7439         {
7440             init_ddpme(dd, &comm->ddpme[1], 1);
7441         }
7442     }
7443     else
7444     {
7445         comm->npmenodes = 0;
7446         if (dd->pme_nodeid >= 0)
7447         {
7448             gmx_fatal_collective(FARGS, NULL, dd,
7449                                  "Can not have separate PME ranks without PME electrostatics");
7450         }
7451     }
7452
7453     if (debug)
7454     {
7455         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7456     }
7457     if (comm->eDLB != edlbNO)
7458     {
7459         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7460     }
7461
7462     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7463     if (comm->eDLB == edlbAUTO)
7464     {
7465         if (fplog)
7466         {
7467             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7468         }
7469         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7470     }
7471
7472     if (ir->ePBC == epbcNONE)
7473     {
7474         vol_frac = 1 - 1/(double)dd->nnodes;
7475     }
7476     else
7477     {
7478         vol_frac =
7479             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7480     }
7481     if (debug)
7482     {
7483         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7484     }
7485     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7486
7487     dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
7488 }
7489
7490 static gmx_bool test_dd_cutoff(t_commrec *cr,
7491                                t_state *state, t_inputrec *ir,
7492                                real cutoff_req)
7493 {
7494     gmx_domdec_t *dd;
7495     gmx_ddbox_t   ddbox;
7496     int           d, dim, np;
7497     real          inv_cell_size;
7498     int           LocallyLimited;
7499
7500     dd = cr->dd;
7501
7502     set_ddbox(dd, FALSE, cr, ir, state->box,
7503               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7504
7505     LocallyLimited = 0;
7506
7507     for (d = 0; d < dd->ndim; d++)
7508     {
7509         dim = dd->dim[d];
7510
7511         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7512         if (dynamic_dd_box(&ddbox, ir))
7513         {
7514             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7515         }
7516
7517         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7518
7519         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7520             dd->comm->cd[d].np_dlb > 0)
7521         {
7522             if (np > dd->comm->cd[d].np_dlb)
7523             {
7524                 return FALSE;
7525             }
7526
7527             /* If a current local cell size is smaller than the requested
7528              * cut-off, we could still fix it, but this gets very complicated.
7529              * Without fixing here, we might actually need more checks.
7530              */
7531             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7532             {
7533                 LocallyLimited = 1;
7534             }
7535         }
7536     }
7537
7538     if (dd->comm->eDLB != edlbNO)
7539     {
7540         /* If DLB is not active yet, we don't need to check the grid jumps.
7541          * Actually we shouldn't, because then the grid jump data is not set.
7542          */
7543         if (dd->comm->bDynLoadBal &&
7544             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7545         {
7546             LocallyLimited = 1;
7547         }
7548
7549         gmx_sumi(1, &LocallyLimited, cr);
7550
7551         if (LocallyLimited > 0)
7552         {
7553             return FALSE;
7554         }
7555     }
7556
7557     return TRUE;
7558 }
7559
7560 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7561                           real cutoff_req)
7562 {
7563     gmx_bool bCutoffAllowed;
7564
7565     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7566
7567     if (bCutoffAllowed)
7568     {
7569         cr->dd->comm->cutoff = cutoff_req;
7570     }
7571
7572     return bCutoffAllowed;
7573 }
7574
7575 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7576 {
7577     gmx_domdec_comm_t *comm;
7578
7579     comm = cr->dd->comm;
7580
7581     /* Turn on the DLB limiting (might have been on already) */
7582     comm->bPMELoadBalDLBLimits = TRUE;
7583
7584     /* Change the cut-off limit */
7585     comm->PMELoadBal_max_cutoff = comm->cutoff;
7586 }
7587
7588 static void merge_cg_buffers(int ncell,
7589                              gmx_domdec_comm_dim_t *cd, int pulse,
7590                              int  *ncg_cell,
7591                              int  *index_gl, int  *recv_i,
7592                              rvec *cg_cm,    rvec *recv_vr,
7593                              int *cgindex,
7594                              cginfo_mb_t *cginfo_mb, int *cginfo)
7595 {
7596     gmx_domdec_ind_t *ind, *ind_p;
7597     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
7598     int               shift, shift_at;
7599
7600     ind = &cd->ind[pulse];
7601
7602     /* First correct the already stored data */
7603     shift = ind->nrecv[ncell];
7604     for (cell = ncell-1; cell >= 0; cell--)
7605     {
7606         shift -= ind->nrecv[cell];
7607         if (shift > 0)
7608         {
7609             /* Move the cg's present from previous grid pulses */
7610             cg0                = ncg_cell[ncell+cell];
7611             cg1                = ncg_cell[ncell+cell+1];
7612             cgindex[cg1+shift] = cgindex[cg1];
7613             for (cg = cg1-1; cg >= cg0; cg--)
7614             {
7615                 index_gl[cg+shift] = index_gl[cg];
7616                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7617                 cgindex[cg+shift] = cgindex[cg];
7618                 cginfo[cg+shift]  = cginfo[cg];
7619             }
7620             /* Correct the already stored send indices for the shift */
7621             for (p = 1; p <= pulse; p++)
7622             {
7623                 ind_p = &cd->ind[p];
7624                 cg0   = 0;
7625                 for (c = 0; c < cell; c++)
7626                 {
7627                     cg0 += ind_p->nsend[c];
7628                 }
7629                 cg1 = cg0 + ind_p->nsend[cell];
7630                 for (cg = cg0; cg < cg1; cg++)
7631                 {
7632                     ind_p->index[cg] += shift;
7633                 }
7634             }
7635         }
7636     }
7637
7638     /* Merge in the communicated buffers */
7639     shift    = 0;
7640     shift_at = 0;
7641     cg0      = 0;
7642     for (cell = 0; cell < ncell; cell++)
7643     {
7644         cg1 = ncg_cell[ncell+cell+1] + shift;
7645         if (shift_at > 0)
7646         {
7647             /* Correct the old cg indices */
7648             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7649             {
7650                 cgindex[cg+1] += shift_at;
7651             }
7652         }
7653         for (cg = 0; cg < ind->nrecv[cell]; cg++)
7654         {
7655             /* Copy this charge group from the buffer */
7656             index_gl[cg1] = recv_i[cg0];
7657             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7658             /* Add it to the cgindex */
7659             cg_gl          = index_gl[cg1];
7660             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
7661             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
7662             cgindex[cg1+1] = cgindex[cg1] + nat;
7663             cg0++;
7664             cg1++;
7665             shift_at += nat;
7666         }
7667         shift                 += ind->nrecv[cell];
7668         ncg_cell[ncell+cell+1] = cg1;
7669     }
7670 }
7671
7672 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7673                                int nzone, int cg0, const int *cgindex)
7674 {
7675     int cg, zone, p;
7676
7677     /* Store the atom block boundaries for easy copying of communication buffers
7678      */
7679     cg = cg0;
7680     for (zone = 0; zone < nzone; zone++)
7681     {
7682         for (p = 0; p < cd->np; p++)
7683         {
7684             cd->ind[p].cell2at0[zone] = cgindex[cg];
7685             cg += cd->ind[p].nrecv[zone];
7686             cd->ind[p].cell2at1[zone] = cgindex[cg];
7687         }
7688     }
7689 }
7690
7691 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7692 {
7693     int      i;
7694     gmx_bool bMiss;
7695
7696     bMiss = FALSE;
7697     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7698     {
7699         if (!bLocalCG[link->a[i]])
7700         {
7701             bMiss = TRUE;
7702         }
7703     }
7704
7705     return bMiss;
7706 }
7707
7708 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7709 typedef struct {
7710     real c[DIM][4]; /* the corners for the non-bonded communication */
7711     real cr0;       /* corner for rounding */
7712     real cr1[4];    /* corners for rounding */
7713     real bc[DIM];   /* corners for bounded communication */
7714     real bcr1;      /* corner for rounding for bonded communication */
7715 } dd_corners_t;
7716
7717 /* Determine the corners of the domain(s) we are communicating with */
7718 static void
7719 set_dd_corners(const gmx_domdec_t *dd,
7720                int dim0, int dim1, int dim2,
7721                gmx_bool bDistMB,
7722                dd_corners_t *c)
7723 {
7724     const gmx_domdec_comm_t  *comm;
7725     const gmx_domdec_zones_t *zones;
7726     int i, j;
7727
7728     comm = dd->comm;
7729
7730     zones = &comm->zones;
7731
7732     /* Keep the compiler happy */
7733     c->cr0  = 0;
7734     c->bcr1 = 0;
7735
7736     /* The first dimension is equal for all cells */
7737     c->c[0][0] = comm->cell_x0[dim0];
7738     if (bDistMB)
7739     {
7740         c->bc[0] = c->c[0][0];
7741     }
7742     if (dd->ndim >= 2)
7743     {
7744         dim1 = dd->dim[1];
7745         /* This cell row is only seen from the first row */
7746         c->c[1][0] = comm->cell_x0[dim1];
7747         /* All rows can see this row */
7748         c->c[1][1] = comm->cell_x0[dim1];
7749         if (dd->bGridJump)
7750         {
7751             c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7752             if (bDistMB)
7753             {
7754                 /* For the multi-body distance we need the maximum */
7755                 c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7756             }
7757         }
7758         /* Set the upper-right corner for rounding */
7759         c->cr0 = comm->cell_x1[dim0];
7760
7761         if (dd->ndim >= 3)
7762         {
7763             dim2 = dd->dim[2];
7764             for (j = 0; j < 4; j++)
7765             {
7766                 c->c[2][j] = comm->cell_x0[dim2];
7767             }
7768             if (dd->bGridJump)
7769             {
7770                 /* Use the maximum of the i-cells that see a j-cell */
7771                 for (i = 0; i < zones->nizone; i++)
7772                 {
7773                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7774                     {
7775                         if (j >= 4)
7776                         {
7777                             c->c[2][j-4] =
7778                                 max(c->c[2][j-4],
7779                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7780                         }
7781                     }
7782                 }
7783                 if (bDistMB)
7784                 {
7785                     /* For the multi-body distance we need the maximum */
7786                     c->bc[2] = comm->cell_x0[dim2];
7787                     for (i = 0; i < 2; i++)
7788                     {
7789                         for (j = 0; j < 2; j++)
7790                         {
7791                             c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
7792                         }
7793                     }
7794                 }
7795             }
7796
7797             /* Set the upper-right corner for rounding */
7798             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7799              * Only cell (0,0,0) can see cell 7 (1,1,1)
7800              */
7801             c->cr1[0] = comm->cell_x1[dim1];
7802             c->cr1[3] = comm->cell_x1[dim1];
7803             if (dd->bGridJump)
7804             {
7805                 c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7806                 if (bDistMB)
7807                 {
7808                     /* For the multi-body distance we need the maximum */
7809                     c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7810                 }
7811             }
7812         }
7813     }
7814 }
7815
7816 /* Determine which cg's we need to send in this pulse from this zone */
7817 static void
7818 get_zone_pulse_cgs(gmx_domdec_t *dd,
7819                    int zonei, int zone,
7820                    int cg0, int cg1,
7821                    const int *index_gl,
7822                    const int *cgindex,
7823                    int dim, int dim_ind,
7824                    int dim0, int dim1, int dim2,
7825                    real r_comm2, real r_bcomm2,
7826                    matrix box,
7827                    ivec tric_dist,
7828                    rvec *normal,
7829                    real skew_fac2_d, real skew_fac_01,
7830                    rvec *v_d, rvec *v_0, rvec *v_1,
7831                    const dd_corners_t *c,
7832                    rvec sf2_round,
7833                    gmx_bool bDistBonded,
7834                    gmx_bool bBondComm,
7835                    gmx_bool bDist2B,
7836                    gmx_bool bDistMB,
7837                    rvec *cg_cm,
7838                    int *cginfo,
7839                    gmx_domdec_ind_t *ind,
7840                    int **ibuf, int *ibuf_nalloc,
7841                    vec_rvec_t *vbuf,
7842                    int *nsend_ptr,
7843                    int *nat_ptr,
7844                    int *nsend_z_ptr)
7845 {
7846     gmx_domdec_comm_t *comm;
7847     gmx_bool           bScrew;
7848     gmx_bool           bDistMB_pulse;
7849     int                cg, i;
7850     real               r2, rb2, r, tric_sh;
7851     rvec               rn, rb;
7852     int                dimd;
7853     int                nsend_z, nsend, nat;
7854
7855     comm = dd->comm;
7856
7857     bScrew = (dd->bScrewPBC && dim == XX);
7858
7859     bDistMB_pulse = (bDistMB && bDistBonded);
7860
7861     nsend_z = 0;
7862     nsend   = *nsend_ptr;
7863     nat     = *nat_ptr;
7864
7865     for (cg = cg0; cg < cg1; cg++)
7866     {
7867         r2  = 0;
7868         rb2 = 0;
7869         if (tric_dist[dim_ind] == 0)
7870         {
7871             /* Rectangular direction, easy */
7872             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7873             if (r > 0)
7874             {
7875                 r2 += r*r;
7876             }
7877             if (bDistMB_pulse)
7878             {
7879                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7880                 if (r > 0)
7881                 {
7882                     rb2 += r*r;
7883                 }
7884             }
7885             /* Rounding gives at most a 16% reduction
7886              * in communicated atoms
7887              */
7888             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7889             {
7890                 r = cg_cm[cg][dim0] - c->cr0;
7891                 /* This is the first dimension, so always r >= 0 */
7892                 r2 += r*r;
7893                 if (bDistMB_pulse)
7894                 {
7895                     rb2 += r*r;
7896                 }
7897             }
7898             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7899             {
7900                 r = cg_cm[cg][dim1] - c->cr1[zone];
7901                 if (r > 0)
7902                 {
7903                     r2 += r*r;
7904                 }
7905                 if (bDistMB_pulse)
7906                 {
7907                     r = cg_cm[cg][dim1] - c->bcr1;
7908                     if (r > 0)
7909                     {
7910                         rb2 += r*r;
7911                     }
7912                 }
7913             }
7914         }
7915         else
7916         {
7917             /* Triclinic direction, more complicated */
7918             clear_rvec(rn);
7919             clear_rvec(rb);
7920             /* Rounding, conservative as the skew_fac multiplication
7921              * will slightly underestimate the distance.
7922              */
7923             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7924             {
7925                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7926                 for (i = dim0+1; i < DIM; i++)
7927                 {
7928                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7929                 }
7930                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7931                 if (bDistMB_pulse)
7932                 {
7933                     rb[dim0] = rn[dim0];
7934                     rb2      = r2;
7935                 }
7936                 /* Take care that the cell planes along dim0 might not
7937                  * be orthogonal to those along dim1 and dim2.
7938                  */
7939                 for (i = 1; i <= dim_ind; i++)
7940                 {
7941                     dimd = dd->dim[i];
7942                     if (normal[dim0][dimd] > 0)
7943                     {
7944                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7945                         if (bDistMB_pulse)
7946                         {
7947                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7948                         }
7949                     }
7950                 }
7951             }
7952             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7953             {
7954                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7955                 tric_sh   = 0;
7956                 for (i = dim1+1; i < DIM; i++)
7957                 {
7958                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7959                 }
7960                 rn[dim1] += tric_sh;
7961                 if (rn[dim1] > 0)
7962                 {
7963                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7964                     /* Take care of coupling of the distances
7965                      * to the planes along dim0 and dim1 through dim2.
7966                      */
7967                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7968                     /* Take care that the cell planes along dim1
7969                      * might not be orthogonal to that along dim2.
7970                      */
7971                     if (normal[dim1][dim2] > 0)
7972                     {
7973                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7974                     }
7975                 }
7976                 if (bDistMB_pulse)
7977                 {
7978                     rb[dim1] +=
7979                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7980                     if (rb[dim1] > 0)
7981                     {
7982                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7983                         /* Take care of coupling of the distances
7984                          * to the planes along dim0 and dim1 through dim2.
7985                          */
7986                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7987                         /* Take care that the cell planes along dim1
7988                          * might not be orthogonal to that along dim2.
7989                          */
7990                         if (normal[dim1][dim2] > 0)
7991                         {
7992                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7993                         }
7994                     }
7995                 }
7996             }
7997             /* The distance along the communication direction */
7998             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7999             tric_sh  = 0;
8000             for (i = dim+1; i < DIM; i++)
8001             {
8002                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
8003             }
8004             rn[dim] += tric_sh;
8005             if (rn[dim] > 0)
8006             {
8007                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
8008                 /* Take care of coupling of the distances
8009                  * to the planes along dim0 and dim1 through dim2.
8010                  */
8011                 if (dim_ind == 1 && zonei == 1)
8012                 {
8013                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
8014                 }
8015             }
8016             if (bDistMB_pulse)
8017             {
8018                 clear_rvec(rb);
8019                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
8020                 if (rb[dim] > 0)
8021                 {
8022                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
8023                     /* Take care of coupling of the distances
8024                      * to the planes along dim0 and dim1 through dim2.
8025                      */
8026                     if (dim_ind == 1 && zonei == 1)
8027                     {
8028                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
8029                     }
8030                 }
8031             }
8032         }
8033
8034         if (r2 < r_comm2 ||
8035             (bDistBonded &&
8036              ((bDistMB && rb2 < r_bcomm2) ||
8037               (bDist2B && r2  < r_bcomm2)) &&
8038              (!bBondComm ||
8039               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
8040                missing_link(comm->cglink, index_gl[cg],
8041                             comm->bLocalCG)))))
8042         {
8043             /* Make an index to the local charge groups */
8044             if (nsend+1 > ind->nalloc)
8045             {
8046                 ind->nalloc = over_alloc_large(nsend+1);
8047                 srenew(ind->index, ind->nalloc);
8048             }
8049             if (nsend+1 > *ibuf_nalloc)
8050             {
8051                 *ibuf_nalloc = over_alloc_large(nsend+1);
8052                 srenew(*ibuf, *ibuf_nalloc);
8053             }
8054             ind->index[nsend] = cg;
8055             (*ibuf)[nsend]    = index_gl[cg];
8056             nsend_z++;
8057             vec_rvec_check_alloc(vbuf, nsend+1);
8058
8059             if (dd->ci[dim] == 0)
8060             {
8061                 /* Correct cg_cm for pbc */
8062                 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
8063                 if (bScrew)
8064                 {
8065                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
8066                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
8067                 }
8068             }
8069             else
8070             {
8071                 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
8072             }
8073             nsend++;
8074             nat += cgindex[cg+1] - cgindex[cg];
8075         }
8076     }
8077
8078     *nsend_ptr   = nsend;
8079     *nat_ptr     = nat;
8080     *nsend_z_ptr = nsend_z;
8081 }
8082
8083 static void setup_dd_communication(gmx_domdec_t *dd,
8084                                    matrix box, gmx_ddbox_t *ddbox,
8085                                    t_forcerec *fr, t_state *state, rvec **f)
8086 {
8087     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
8088     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
8089     int                    c, i, j, cg, cg_gl, nrcg;
8090     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
8091     gmx_domdec_comm_t     *comm;
8092     gmx_domdec_zones_t    *zones;
8093     gmx_domdec_comm_dim_t *cd;
8094     gmx_domdec_ind_t      *ind;
8095     cginfo_mb_t           *cginfo_mb;
8096     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
8097     real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
8098     dd_corners_t           corners;
8099     ivec                   tric_dist;
8100     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
8101     real                   skew_fac2_d, skew_fac_01;
8102     rvec                   sf2_round;
8103     int                    nsend, nat;
8104     int                    th;
8105
8106     if (debug)
8107     {
8108         fprintf(debug, "Setting up DD communication\n");
8109     }
8110
8111     comm  = dd->comm;
8112
8113     switch (fr->cutoff_scheme)
8114     {
8115         case ecutsGROUP:
8116             cg_cm = fr->cg_cm;
8117             break;
8118         case ecutsVERLET:
8119             cg_cm = state->x;
8120             break;
8121         default:
8122             gmx_incons("unimplemented");
8123             cg_cm = NULL;
8124     }
8125
8126     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8127     {
8128         dim = dd->dim[dim_ind];
8129
8130         /* Check if we need to use triclinic distances */
8131         tric_dist[dim_ind] = 0;
8132         for (i = 0; i <= dim_ind; i++)
8133         {
8134             if (ddbox->tric_dir[dd->dim[i]])
8135             {
8136                 tric_dist[dim_ind] = 1;
8137             }
8138         }
8139     }
8140
8141     bBondComm = comm->bBondComm;
8142
8143     /* Do we need to determine extra distances for multi-body bondeds? */
8144     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8145
8146     /* Do we need to determine extra distances for only two-body bondeds? */
8147     bDist2B = (bBondComm && !bDistMB);
8148
8149     r_comm2  = sqr(comm->cutoff);
8150     r_bcomm2 = sqr(comm->cutoff_mbody);
8151
8152     if (debug)
8153     {
8154         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8155     }
8156
8157     zones = &comm->zones;
8158
8159     dim0 = dd->dim[0];
8160     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8161     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8162
8163     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8164
8165     /* Triclinic stuff */
8166     normal      = ddbox->normal;
8167     skew_fac_01 = 0;
8168     if (dd->ndim >= 2)
8169     {
8170         v_0 = ddbox->v[dim0];
8171         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8172         {
8173             /* Determine the coupling coefficient for the distances
8174              * to the cell planes along dim0 and dim1 through dim2.
8175              * This is required for correct rounding.
8176              */
8177             skew_fac_01 =
8178                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8179             if (debug)
8180             {
8181                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8182             }
8183         }
8184     }
8185     if (dd->ndim >= 3)
8186     {
8187         v_1 = ddbox->v[dim1];
8188     }
8189
8190     zone_cg_range = zones->cg_range;
8191     index_gl      = dd->index_gl;
8192     cgindex       = dd->cgindex;
8193     cginfo_mb     = fr->cginfo_mb;
8194
8195     zone_cg_range[0]   = 0;
8196     zone_cg_range[1]   = dd->ncg_home;
8197     comm->zone_ncg1[0] = dd->ncg_home;
8198     pos_cg             = dd->ncg_home;
8199
8200     nat_tot = dd->nat_home;
8201     nzone   = 1;
8202     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8203     {
8204         dim = dd->dim[dim_ind];
8205         cd  = &comm->cd[dim_ind];
8206
8207         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8208         {
8209             /* No pbc in this dimension, the first node should not comm. */
8210             nzone_send = 0;
8211         }
8212         else
8213         {
8214             nzone_send = nzone;
8215         }
8216
8217         v_d         = ddbox->v[dim];
8218         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8219
8220         cd->bInPlace = TRUE;
8221         for (p = 0; p < cd->np; p++)
8222         {
8223             /* Only atoms communicated in the first pulse are used
8224              * for multi-body bonded interactions or for bBondComm.
8225              */
8226             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8227
8228             ind   = &cd->ind[p];
8229             nsend = 0;
8230             nat   = 0;
8231             for (zone = 0; zone < nzone_send; zone++)
8232             {
8233                 if (tric_dist[dim_ind] && dim_ind > 0)
8234                 {
8235                     /* Determine slightly more optimized skew_fac's
8236                      * for rounding.
8237                      * This reduces the number of communicated atoms
8238                      * by about 10% for 3D DD of rhombic dodecahedra.
8239                      */
8240                     for (dimd = 0; dimd < dim; dimd++)
8241                     {
8242                         sf2_round[dimd] = 1;
8243                         if (ddbox->tric_dir[dimd])
8244                         {
8245                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8246                             {
8247                                 /* If we are shifted in dimension i
8248                                  * and the cell plane is tilted forward
8249                                  * in dimension i, skip this coupling.
8250                                  */
8251                                 if (!(zones->shift[nzone+zone][i] &&
8252                                       ddbox->v[dimd][i][dimd] >= 0))
8253                                 {
8254                                     sf2_round[dimd] +=
8255                                         sqr(ddbox->v[dimd][i][dimd]);
8256                                 }
8257                             }
8258                             sf2_round[dimd] = 1/sf2_round[dimd];
8259                         }
8260                     }
8261                 }
8262
8263                 zonei = zone_perm[dim_ind][zone];
8264                 if (p == 0)
8265                 {
8266                     /* Here we permutate the zones to obtain a convenient order
8267                      * for neighbor searching
8268                      */
8269                     cg0 = zone_cg_range[zonei];
8270                     cg1 = zone_cg_range[zonei+1];
8271                 }
8272                 else
8273                 {
8274                     /* Look only at the cg's received in the previous grid pulse
8275                      */
8276                     cg1 = zone_cg_range[nzone+zone+1];
8277                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8278                 }
8279
8280 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8281                 for (th = 0; th < comm->nth; th++)
8282                 {
8283                     gmx_domdec_ind_t *ind_p;
8284                     int             **ibuf_p, *ibuf_nalloc_p;
8285                     vec_rvec_t       *vbuf_p;
8286                     int              *nsend_p, *nat_p;
8287                     int              *nsend_zone_p;
8288                     int               cg0_th, cg1_th;
8289
8290                     if (th == 0)
8291                     {
8292                         /* Thread 0 writes in the comm buffers */
8293                         ind_p         = ind;
8294                         ibuf_p        = &comm->buf_int;
8295                         ibuf_nalloc_p = &comm->nalloc_int;
8296                         vbuf_p        = &comm->vbuf;
8297                         nsend_p       = &nsend;
8298                         nat_p         = &nat;
8299                         nsend_zone_p  = &ind->nsend[zone];
8300                     }
8301                     else
8302                     {
8303                         /* Other threads write into temp buffers */
8304                         ind_p         = &comm->dth[th].ind;
8305                         ibuf_p        = &comm->dth[th].ibuf;
8306                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8307                         vbuf_p        = &comm->dth[th].vbuf;
8308                         nsend_p       = &comm->dth[th].nsend;
8309                         nat_p         = &comm->dth[th].nat;
8310                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8311
8312                         comm->dth[th].nsend      = 0;
8313                         comm->dth[th].nat        = 0;
8314                         comm->dth[th].nsend_zone = 0;
8315                     }
8316
8317                     if (comm->nth == 1)
8318                     {
8319                         cg0_th = cg0;
8320                         cg1_th = cg1;
8321                     }
8322                     else
8323                     {
8324                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8325                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8326                     }
8327
8328                     /* Get the cg's for this pulse in this zone */
8329                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8330                                        index_gl, cgindex,
8331                                        dim, dim_ind, dim0, dim1, dim2,
8332                                        r_comm2, r_bcomm2,
8333                                        box, tric_dist,
8334                                        normal, skew_fac2_d, skew_fac_01,
8335                                        v_d, v_0, v_1, &corners, sf2_round,
8336                                        bDistBonded, bBondComm,
8337                                        bDist2B, bDistMB,
8338                                        cg_cm, fr->cginfo,
8339                                        ind_p,
8340                                        ibuf_p, ibuf_nalloc_p,
8341                                        vbuf_p,
8342                                        nsend_p, nat_p,
8343                                        nsend_zone_p);
8344                 }
8345
8346                 /* Append data of threads>=1 to the communication buffers */
8347                 for (th = 1; th < comm->nth; th++)
8348                 {
8349                     dd_comm_setup_work_t *dth;
8350                     int                   i, ns1;
8351
8352                     dth = &comm->dth[th];
8353
8354                     ns1 = nsend + dth->nsend_zone;
8355                     if (ns1 > ind->nalloc)
8356                     {
8357                         ind->nalloc = over_alloc_dd(ns1);
8358                         srenew(ind->index, ind->nalloc);
8359                     }
8360                     if (ns1 > comm->nalloc_int)
8361                     {
8362                         comm->nalloc_int = over_alloc_dd(ns1);
8363                         srenew(comm->buf_int, comm->nalloc_int);
8364                     }
8365                     if (ns1 > comm->vbuf.nalloc)
8366                     {
8367                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8368                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8369                     }
8370
8371                     for (i = 0; i < dth->nsend_zone; i++)
8372                     {
8373                         ind->index[nsend]    = dth->ind.index[i];
8374                         comm->buf_int[nsend] = dth->ibuf[i];
8375                         copy_rvec(dth->vbuf.v[i],
8376                                   comm->vbuf.v[nsend]);
8377                         nsend++;
8378                     }
8379                     nat              += dth->nat;
8380                     ind->nsend[zone] += dth->nsend_zone;
8381                 }
8382             }
8383             /* Clear the counts in case we do not have pbc */
8384             for (zone = nzone_send; zone < nzone; zone++)
8385             {
8386                 ind->nsend[zone] = 0;
8387             }
8388             ind->nsend[nzone]   = nsend;
8389             ind->nsend[nzone+1] = nat;
8390             /* Communicate the number of cg's and atoms to receive */
8391             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8392                             ind->nsend, nzone+2,
8393                             ind->nrecv, nzone+2);
8394
8395             /* The rvec buffer is also required for atom buffers of size nsend
8396              * in dd_move_x and dd_move_f.
8397              */
8398             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8399
8400             if (p > 0)
8401             {
8402                 /* We can receive in place if only the last zone is not empty */
8403                 for (zone = 0; zone < nzone-1; zone++)
8404                 {
8405                     if (ind->nrecv[zone] > 0)
8406                     {
8407                         cd->bInPlace = FALSE;
8408                     }
8409                 }
8410                 if (!cd->bInPlace)
8411                 {
8412                     /* The int buffer is only required here for the cg indices */
8413                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8414                     {
8415                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8416                         srenew(comm->buf_int2, comm->nalloc_int2);
8417                     }
8418                     /* The rvec buffer is also required for atom buffers
8419                      * of size nrecv in dd_move_x and dd_move_f.
8420                      */
8421                     i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8422                     vec_rvec_check_alloc(&comm->vbuf2, i);
8423                 }
8424             }
8425
8426             /* Make space for the global cg indices */
8427             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8428                 || dd->cg_nalloc == 0)
8429             {
8430                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8431                 srenew(index_gl, dd->cg_nalloc);
8432                 srenew(cgindex, dd->cg_nalloc+1);
8433             }
8434             /* Communicate the global cg indices */
8435             if (cd->bInPlace)
8436             {
8437                 recv_i = index_gl + pos_cg;
8438             }
8439             else
8440             {
8441                 recv_i = comm->buf_int2;
8442             }
8443             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8444                             comm->buf_int, nsend,
8445                             recv_i,        ind->nrecv[nzone]);
8446
8447             /* Make space for cg_cm */
8448             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8449             if (fr->cutoff_scheme == ecutsGROUP)
8450             {
8451                 cg_cm = fr->cg_cm;
8452             }
8453             else
8454             {
8455                 cg_cm = state->x;
8456             }
8457             /* Communicate cg_cm */
8458             if (cd->bInPlace)
8459             {
8460                 recv_vr = cg_cm + pos_cg;
8461             }
8462             else
8463             {
8464                 recv_vr = comm->vbuf2.v;
8465             }
8466             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8467                              comm->vbuf.v, nsend,
8468                              recv_vr,      ind->nrecv[nzone]);
8469
8470             /* Make the charge group index */
8471             if (cd->bInPlace)
8472             {
8473                 zone = (p == 0 ? 0 : nzone - 1);
8474                 while (zone < nzone)
8475                 {
8476                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8477                     {
8478                         cg_gl              = index_gl[pos_cg];
8479                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8480                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8481                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8482                         if (bBondComm)
8483                         {
8484                             /* Update the charge group presence,
8485                              * so we can use it in the next pass of the loop.
8486                              */
8487                             comm->bLocalCG[cg_gl] = TRUE;
8488                         }
8489                         pos_cg++;
8490                     }
8491                     if (p == 0)
8492                     {
8493                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8494                     }
8495                     zone++;
8496                     zone_cg_range[nzone+zone] = pos_cg;
8497                 }
8498             }
8499             else
8500             {
8501                 /* This part of the code is never executed with bBondComm. */
8502                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8503                                  index_gl, recv_i, cg_cm, recv_vr,
8504                                  cgindex, fr->cginfo_mb, fr->cginfo);
8505                 pos_cg += ind->nrecv[nzone];
8506             }
8507             nat_tot += ind->nrecv[nzone+1];
8508         }
8509         if (!cd->bInPlace)
8510         {
8511             /* Store the atom block for easy copying of communication buffers */
8512             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8513         }
8514         nzone += nzone;
8515     }
8516     dd->index_gl = index_gl;
8517     dd->cgindex  = cgindex;
8518
8519     dd->ncg_tot          = zone_cg_range[zones->n];
8520     dd->nat_tot          = nat_tot;
8521     comm->nat[ddnatHOME] = dd->nat_home;
8522     for (i = ddnatZONE; i < ddnatNR; i++)
8523     {
8524         comm->nat[i] = dd->nat_tot;
8525     }
8526
8527     if (!bBondComm)
8528     {
8529         /* We don't need to update cginfo, since that was alrady done above.
8530          * So we pass NULL for the forcerec.
8531          */
8532         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8533                       NULL, comm->bLocalCG);
8534     }
8535
8536     if (debug)
8537     {
8538         fprintf(debug, "Finished setting up DD communication, zones:");
8539         for (c = 0; c < zones->n; c++)
8540         {
8541             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8542         }
8543         fprintf(debug, "\n");
8544     }
8545 }
8546
8547 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8548 {
8549     int c;
8550
8551     for (c = 0; c < zones->nizone; c++)
8552     {
8553         zones->izone[c].cg1  = zones->cg_range[c+1];
8554         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8555         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8556     }
8557 }
8558
8559 static void set_zones_size(gmx_domdec_t *dd,
8560                            matrix box, const gmx_ddbox_t *ddbox,
8561                            int zone_start, int zone_end)
8562 {
8563     gmx_domdec_comm_t  *comm;
8564     gmx_domdec_zones_t *zones;
8565     gmx_bool            bDistMB;
8566     int                 z, zi, zj0, zj1, d, dim;
8567     real                rcs, rcmbs;
8568     int                 i, j;
8569     real                size_j, add_tric;
8570     real                vol;
8571
8572     comm = dd->comm;
8573
8574     zones = &comm->zones;
8575
8576     /* Do we need to determine extra distances for multi-body bondeds? */
8577     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8578
8579     for (z = zone_start; z < zone_end; z++)
8580     {
8581         /* Copy cell limits to zone limits.
8582          * Valid for non-DD dims and non-shifted dims.
8583          */
8584         copy_rvec(comm->cell_x0, zones->size[z].x0);
8585         copy_rvec(comm->cell_x1, zones->size[z].x1);
8586     }
8587
8588     for (d = 0; d < dd->ndim; d++)
8589     {
8590         dim = dd->dim[d];
8591
8592         for (z = 0; z < zones->n; z++)
8593         {
8594             /* With a staggered grid we have different sizes
8595              * for non-shifted dimensions.
8596              */
8597             if (dd->bGridJump && zones->shift[z][dim] == 0)
8598             {
8599                 if (d == 1)
8600                 {
8601                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8602                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8603                 }
8604                 else if (d == 2)
8605                 {
8606                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8607                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8608                 }
8609             }
8610         }
8611
8612         rcs   = comm->cutoff;
8613         rcmbs = comm->cutoff_mbody;
8614         if (ddbox->tric_dir[dim])
8615         {
8616             rcs   /= ddbox->skew_fac[dim];
8617             rcmbs /= ddbox->skew_fac[dim];
8618         }
8619
8620         /* Set the lower limit for the shifted zone dimensions */
8621         for (z = zone_start; z < zone_end; z++)
8622         {
8623             if (zones->shift[z][dim] > 0)
8624             {
8625                 dim = dd->dim[d];
8626                 if (!dd->bGridJump || d == 0)
8627                 {
8628                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8629                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8630                 }
8631                 else
8632                 {
8633                     /* Here we take the lower limit of the zone from
8634                      * the lowest domain of the zone below.
8635                      */
8636                     if (z < 4)
8637                     {
8638                         zones->size[z].x0[dim] =
8639                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8640                     }
8641                     else
8642                     {
8643                         if (d == 1)
8644                         {
8645                             zones->size[z].x0[dim] =
8646                                 zones->size[zone_perm[2][z-4]].x0[dim];
8647                         }
8648                         else
8649                         {
8650                             zones->size[z].x0[dim] =
8651                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8652                         }
8653                     }
8654                     /* A temporary limit, is updated below */
8655                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8656
8657                     if (bDistMB)
8658                     {
8659                         for (zi = 0; zi < zones->nizone; zi++)
8660                         {
8661                             if (zones->shift[zi][dim] == 0)
8662                             {
8663                                 /* This takes the whole zone into account.
8664                                  * With multiple pulses this will lead
8665                                  * to a larger zone then strictly necessary.
8666                                  */
8667                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8668                                                              zones->size[zi].x1[dim]+rcmbs);
8669                             }
8670                         }
8671                     }
8672                 }
8673             }
8674         }
8675
8676         /* Loop over the i-zones to set the upper limit of each
8677          * j-zone they see.
8678          */
8679         for (zi = 0; zi < zones->nizone; zi++)
8680         {
8681             if (zones->shift[zi][dim] == 0)
8682             {
8683                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8684                 {
8685                     if (zones->shift[z][dim] > 0)
8686                     {
8687                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8688                                                      zones->size[zi].x1[dim]+rcs);
8689                     }
8690                 }
8691             }
8692         }
8693     }
8694
8695     for (z = zone_start; z < zone_end; z++)
8696     {
8697         /* Initialization only required to keep the compiler happy */
8698         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8699         int  nc, c;
8700
8701         /* To determine the bounding box for a zone we need to find
8702          * the extreme corners of 4, 2 or 1 corners.
8703          */
8704         nc = 1 << (ddbox->npbcdim - 1);
8705
8706         for (c = 0; c < nc; c++)
8707         {
8708             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8709             corner[XX] = 0;
8710             if ((c & 1) == 0)
8711             {
8712                 corner[YY] = zones->size[z].x0[YY];
8713             }
8714             else
8715             {
8716                 corner[YY] = zones->size[z].x1[YY];
8717             }
8718             if ((c & 2) == 0)
8719             {
8720                 corner[ZZ] = zones->size[z].x0[ZZ];
8721             }
8722             else
8723             {
8724                 corner[ZZ] = zones->size[z].x1[ZZ];
8725             }
8726             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8727             {
8728                 /* With 1D domain decomposition the cg's are not in
8729                  * the triclinic box, but triclinic x-y and rectangular y-z.
8730                  * Shift y back, so it will later end up at 0.
8731                  */
8732                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8733             }
8734             /* Apply the triclinic couplings */
8735             assert(ddbox->npbcdim <= DIM);
8736             for (i = YY; i < ddbox->npbcdim; i++)
8737             {
8738                 for (j = XX; j < i; j++)
8739                 {
8740                     corner[j] += corner[i]*box[i][j]/box[i][i];
8741                 }
8742             }
8743             if (c == 0)
8744             {
8745                 copy_rvec(corner, corner_min);
8746                 copy_rvec(corner, corner_max);
8747             }
8748             else
8749             {
8750                 for (i = 0; i < DIM; i++)
8751                 {
8752                     corner_min[i] = min(corner_min[i], corner[i]);
8753                     corner_max[i] = max(corner_max[i], corner[i]);
8754                 }
8755             }
8756         }
8757         /* Copy the extreme cornes without offset along x */
8758         for (i = 0; i < DIM; i++)
8759         {
8760             zones->size[z].bb_x0[i] = corner_min[i];
8761             zones->size[z].bb_x1[i] = corner_max[i];
8762         }
8763         /* Add the offset along x */
8764         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8765         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8766     }
8767
8768     if (zone_start == 0)
8769     {
8770         vol = 1;
8771         for (dim = 0; dim < DIM; dim++)
8772         {
8773             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8774         }
8775         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8776     }
8777
8778     if (debug)
8779     {
8780         for (z = zone_start; z < zone_end; z++)
8781         {
8782             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8783                     z,
8784                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8785                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8786                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8787             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8788                     z,
8789                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8790                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8791                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8792         }
8793     }
8794 }
8795
8796 static int comp_cgsort(const void *a, const void *b)
8797 {
8798     int           comp;
8799
8800     gmx_cgsort_t *cga, *cgb;
8801     cga = (gmx_cgsort_t *)a;
8802     cgb = (gmx_cgsort_t *)b;
8803
8804     comp = cga->nsc - cgb->nsc;
8805     if (comp == 0)
8806     {
8807         comp = cga->ind_gl - cgb->ind_gl;
8808     }
8809
8810     return comp;
8811 }
8812
8813 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8814                          int *a, int *buf)
8815 {
8816     int i;
8817
8818     /* Order the data */
8819     for (i = 0; i < n; i++)
8820     {
8821         buf[i] = a[sort[i].ind];
8822     }
8823
8824     /* Copy back to the original array */
8825     for (i = 0; i < n; i++)
8826     {
8827         a[i] = buf[i];
8828     }
8829 }
8830
8831 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8832                          rvec *v, rvec *buf)
8833 {
8834     int i;
8835
8836     /* Order the data */
8837     for (i = 0; i < n; i++)
8838     {
8839         copy_rvec(v[sort[i].ind], buf[i]);
8840     }
8841
8842     /* Copy back to the original array */
8843     for (i = 0; i < n; i++)
8844     {
8845         copy_rvec(buf[i], v[i]);
8846     }
8847 }
8848
8849 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8850                            rvec *v, rvec *buf)
8851 {
8852     int a, atot, cg, cg0, cg1, i;
8853
8854     if (cgindex == NULL)
8855     {
8856         /* Avoid the useless loop of the atoms within a cg */
8857         order_vec_cg(ncg, sort, v, buf);
8858
8859         return;
8860     }
8861
8862     /* Order the data */
8863     a = 0;
8864     for (cg = 0; cg < ncg; cg++)
8865     {
8866         cg0 = cgindex[sort[cg].ind];
8867         cg1 = cgindex[sort[cg].ind+1];
8868         for (i = cg0; i < cg1; i++)
8869         {
8870             copy_rvec(v[i], buf[a]);
8871             a++;
8872         }
8873     }
8874     atot = a;
8875
8876     /* Copy back to the original array */
8877     for (a = 0; a < atot; a++)
8878     {
8879         copy_rvec(buf[a], v[a]);
8880     }
8881 }
8882
8883 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8884                          int nsort_new, gmx_cgsort_t *sort_new,
8885                          gmx_cgsort_t *sort1)
8886 {
8887     int i1, i2, i_new;
8888
8889     /* The new indices are not very ordered, so we qsort them */
8890     gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8891
8892     /* sort2 is already ordered, so now we can merge the two arrays */
8893     i1    = 0;
8894     i2    = 0;
8895     i_new = 0;
8896     while (i2 < nsort2 || i_new < nsort_new)
8897     {
8898         if (i2 == nsort2)
8899         {
8900             sort1[i1++] = sort_new[i_new++];
8901         }
8902         else if (i_new == nsort_new)
8903         {
8904             sort1[i1++] = sort2[i2++];
8905         }
8906         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8907                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8908                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8909         {
8910             sort1[i1++] = sort2[i2++];
8911         }
8912         else
8913         {
8914             sort1[i1++] = sort_new[i_new++];
8915         }
8916     }
8917 }
8918
8919 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8920 {
8921     gmx_domdec_sort_t *sort;
8922     gmx_cgsort_t      *cgsort, *sort_i;
8923     int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
8924     int                sort_last, sort_skip;
8925
8926     sort = dd->comm->sort;
8927
8928     a = fr->ns.grid->cell_index;
8929
8930     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8931
8932     if (ncg_home_old >= 0)
8933     {
8934         /* The charge groups that remained in the same ns grid cell
8935          * are completely ordered. So we can sort efficiently by sorting
8936          * the charge groups that did move into the stationary list.
8937          */
8938         ncg_new   = 0;
8939         nsort2    = 0;
8940         nsort_new = 0;
8941         for (i = 0; i < dd->ncg_home; i++)
8942         {
8943             /* Check if this cg did not move to another node */
8944             if (a[i] < moved)
8945             {
8946                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8947                 {
8948                     /* This cg is new on this node or moved ns grid cell */
8949                     if (nsort_new >= sort->sort_new_nalloc)
8950                     {
8951                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8952                         srenew(sort->sort_new, sort->sort_new_nalloc);
8953                     }
8954                     sort_i = &(sort->sort_new[nsort_new++]);
8955                 }
8956                 else
8957                 {
8958                     /* This cg did not move */
8959                     sort_i = &(sort->sort2[nsort2++]);
8960                 }
8961                 /* Sort on the ns grid cell indices
8962                  * and the global topology index.
8963                  * index_gl is irrelevant with cell ns,
8964                  * but we set it here anyhow to avoid a conditional.
8965                  */
8966                 sort_i->nsc    = a[i];
8967                 sort_i->ind_gl = dd->index_gl[i];
8968                 sort_i->ind    = i;
8969                 ncg_new++;
8970             }
8971         }
8972         if (debug)
8973         {
8974             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8975                     nsort2, nsort_new);
8976         }
8977         /* Sort efficiently */
8978         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8979                      sort->sort);
8980     }
8981     else
8982     {
8983         cgsort  = sort->sort;
8984         ncg_new = 0;
8985         for (i = 0; i < dd->ncg_home; i++)
8986         {
8987             /* Sort on the ns grid cell indices
8988              * and the global topology index
8989              */
8990             cgsort[i].nsc    = a[i];
8991             cgsort[i].ind_gl = dd->index_gl[i];
8992             cgsort[i].ind    = i;
8993             if (cgsort[i].nsc < moved)
8994             {
8995                 ncg_new++;
8996             }
8997         }
8998         if (debug)
8999         {
9000             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
9001         }
9002         /* Determine the order of the charge groups using qsort */
9003         gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
9004     }
9005
9006     return ncg_new;
9007 }
9008
9009 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
9010 {
9011     gmx_cgsort_t *sort;
9012     int           ncg_new, i, *a, na;
9013
9014     sort = dd->comm->sort->sort;
9015
9016     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
9017
9018     ncg_new = 0;
9019     for (i = 0; i < na; i++)
9020     {
9021         if (a[i] >= 0)
9022         {
9023             sort[ncg_new].ind = a[i];
9024             ncg_new++;
9025         }
9026     }
9027
9028     return ncg_new;
9029 }
9030
9031 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
9032                           int ncg_home_old)
9033 {
9034     gmx_domdec_sort_t *sort;
9035     gmx_cgsort_t      *cgsort, *sort_i;
9036     int               *cgindex;
9037     int                ncg_new, i, *ibuf, cgsize;
9038     rvec              *vbuf;
9039
9040     sort = dd->comm->sort;
9041
9042     if (dd->ncg_home > sort->sort_nalloc)
9043     {
9044         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
9045         srenew(sort->sort, sort->sort_nalloc);
9046         srenew(sort->sort2, sort->sort_nalloc);
9047     }
9048     cgsort = sort->sort;
9049
9050     switch (fr->cutoff_scheme)
9051     {
9052         case ecutsGROUP:
9053             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
9054             break;
9055         case ecutsVERLET:
9056             ncg_new = dd_sort_order_nbnxn(dd, fr);
9057             break;
9058         default:
9059             gmx_incons("unimplemented");
9060             ncg_new = 0;
9061     }
9062
9063     /* We alloc with the old size, since cgindex is still old */
9064     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
9065     vbuf = dd->comm->vbuf.v;
9066
9067     if (dd->comm->bCGs)
9068     {
9069         cgindex = dd->cgindex;
9070     }
9071     else
9072     {
9073         cgindex = NULL;
9074     }
9075
9076     /* Remove the charge groups which are no longer at home here */
9077     dd->ncg_home = ncg_new;
9078     if (debug)
9079     {
9080         fprintf(debug, "Set the new home charge group count to %d\n",
9081                 dd->ncg_home);
9082     }
9083
9084     /* Reorder the state */
9085     for (i = 0; i < estNR; i++)
9086     {
9087         if (EST_DISTR(i) && (state->flags & (1<<i)))
9088         {
9089             switch (i)
9090             {
9091                 case estX:
9092                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
9093                     break;
9094                 case estV:
9095                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
9096                     break;
9097                 case estSDX:
9098                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
9099                     break;
9100                 case estCGP:
9101                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
9102                     break;
9103                 case estLD_RNG:
9104                 case estLD_RNGI:
9105                 case estDISRE_INITF:
9106                 case estDISRE_RM3TAV:
9107                 case estORIRE_INITF:
9108                 case estORIRE_DTAV:
9109                     /* No ordering required */
9110                     break;
9111                 default:
9112                     gmx_incons("Unknown state entry encountered in dd_sort_state");
9113                     break;
9114             }
9115         }
9116     }
9117     if (fr->cutoff_scheme == ecutsGROUP)
9118     {
9119         /* Reorder cgcm */
9120         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9121     }
9122
9123     if (dd->ncg_home+1 > sort->ibuf_nalloc)
9124     {
9125         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9126         srenew(sort->ibuf, sort->ibuf_nalloc);
9127     }
9128     ibuf = sort->ibuf;
9129     /* Reorder the global cg index */
9130     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9131     /* Reorder the cginfo */
9132     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9133     /* Rebuild the local cg index */
9134     if (dd->comm->bCGs)
9135     {
9136         ibuf[0] = 0;
9137         for (i = 0; i < dd->ncg_home; i++)
9138         {
9139             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9140             ibuf[i+1] = ibuf[i] + cgsize;
9141         }
9142         for (i = 0; i < dd->ncg_home+1; i++)
9143         {
9144             dd->cgindex[i] = ibuf[i];
9145         }
9146     }
9147     else
9148     {
9149         for (i = 0; i < dd->ncg_home+1; i++)
9150         {
9151             dd->cgindex[i] = i;
9152         }
9153     }
9154     /* Set the home atom number */
9155     dd->nat_home = dd->cgindex[dd->ncg_home];
9156
9157     if (fr->cutoff_scheme == ecutsVERLET)
9158     {
9159         /* The atoms are now exactly in grid order, update the grid order */
9160         nbnxn_set_atomorder(fr->nbv->nbs);
9161     }
9162     else
9163     {
9164         /* Copy the sorted ns cell indices back to the ns grid struct */
9165         for (i = 0; i < dd->ncg_home; i++)
9166         {
9167             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9168         }
9169         fr->ns.grid->nr = dd->ncg_home;
9170     }
9171 }
9172
9173 static void add_dd_statistics(gmx_domdec_t *dd)
9174 {
9175     gmx_domdec_comm_t *comm;
9176     int                ddnat;
9177
9178     comm = dd->comm;
9179
9180     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9181     {
9182         comm->sum_nat[ddnat-ddnatZONE] +=
9183             comm->nat[ddnat] - comm->nat[ddnat-1];
9184     }
9185     comm->ndecomp++;
9186 }
9187
9188 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9189 {
9190     gmx_domdec_comm_t *comm;
9191     int                ddnat;
9192
9193     comm = dd->comm;
9194
9195     /* Reset all the statistics and counters for total run counting */
9196     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9197     {
9198         comm->sum_nat[ddnat-ddnatZONE] = 0;
9199     }
9200     comm->ndecomp   = 0;
9201     comm->nload     = 0;
9202     comm->load_step = 0;
9203     comm->load_sum  = 0;
9204     comm->load_max  = 0;
9205     clear_ivec(comm->load_lim);
9206     comm->load_mdf = 0;
9207     comm->load_pme = 0;
9208 }
9209
9210 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9211 {
9212     gmx_domdec_comm_t *comm;
9213     int                ddnat;
9214     double             av;
9215
9216     comm = cr->dd->comm;
9217
9218     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9219
9220     if (fplog == NULL)
9221     {
9222         return;
9223     }
9224
9225     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9226
9227     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9228     {
9229         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9230         switch (ddnat)
9231         {
9232             case ddnatZONE:
9233                 fprintf(fplog,
9234                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9235                         2, av);
9236                 break;
9237             case ddnatVSITE:
9238                 if (cr->dd->vsite_comm)
9239                 {
9240                     fprintf(fplog,
9241                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9242                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9243                             av);
9244                 }
9245                 break;
9246             case ddnatCON:
9247                 if (cr->dd->constraint_comm)
9248                 {
9249                     fprintf(fplog,
9250                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9251                             1 + ir->nLincsIter, av);
9252                 }
9253                 break;
9254             default:
9255                 gmx_incons(" Unknown type for DD statistics");
9256         }
9257     }
9258     fprintf(fplog, "\n");
9259
9260     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9261     {
9262         print_dd_load_av(fplog, cr->dd);
9263     }
9264 }
9265
9266 void dd_partition_system(FILE                *fplog,
9267                          gmx_int64_t          step,
9268                          t_commrec           *cr,
9269                          gmx_bool             bMasterState,
9270                          int                  nstglobalcomm,
9271                          t_state             *state_global,
9272                          gmx_mtop_t          *top_global,
9273                          t_inputrec          *ir,
9274                          t_state             *state_local,
9275                          rvec               **f,
9276                          t_mdatoms           *mdatoms,
9277                          gmx_localtop_t      *top_local,
9278                          t_forcerec          *fr,
9279                          gmx_vsite_t         *vsite,
9280                          gmx_shellfc_t        shellfc,
9281                          gmx_constr_t         constr,
9282                          t_nrnb              *nrnb,
9283                          gmx_wallcycle_t      wcycle,
9284                          gmx_bool             bVerbose)
9285 {
9286     gmx_domdec_t      *dd;
9287     gmx_domdec_comm_t *comm;
9288     gmx_ddbox_t        ddbox = {0};
9289     t_block           *cgs_gl;
9290     gmx_int64_t        step_pcoupl;
9291     rvec               cell_ns_x0, cell_ns_x1;
9292     int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9293     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
9294     gmx_bool           bRedist, bSortCG, bResortAll;
9295     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9296     real               grid_density;
9297     char               sbuf[22];
9298
9299     dd   = cr->dd;
9300     comm = dd->comm;
9301
9302     bBoxChanged = (bMasterState || DEFORM(*ir));
9303     if (ir->epc != epcNO)
9304     {
9305         /* With nstpcouple > 1 pressure coupling happens.
9306          * one step after calculating the pressure.
9307          * Box scaling happens at the end of the MD step,
9308          * after the DD partitioning.
9309          * We therefore have to do DLB in the first partitioning
9310          * after an MD step where P-coupling occured.
9311          * We need to determine the last step in which p-coupling occurred.
9312          * MRS -- need to validate this for vv?
9313          */
9314         n = ir->nstpcouple;
9315         if (n == 1)
9316         {
9317             step_pcoupl = step - 1;
9318         }
9319         else
9320         {
9321             step_pcoupl = ((step - 1)/n)*n + 1;
9322         }
9323         if (step_pcoupl >= comm->partition_step)
9324         {
9325             bBoxChanged = TRUE;
9326         }
9327     }
9328
9329     bNStGlobalComm = (step % nstglobalcomm == 0);
9330
9331     if (!comm->bDynLoadBal)
9332     {
9333         bDoDLB = FALSE;
9334     }
9335     else
9336     {
9337         /* Should we do dynamic load balacing this step?
9338          * Since it requires (possibly expensive) global communication,
9339          * we might want to do DLB less frequently.
9340          */
9341         if (bBoxChanged || ir->epc != epcNO)
9342         {
9343             bDoDLB = bBoxChanged;
9344         }
9345         else
9346         {
9347             bDoDLB = bNStGlobalComm;
9348         }
9349     }
9350
9351     /* Check if we have recorded loads on the nodes */
9352     if (comm->bRecordLoad && dd_load_count(comm))
9353     {
9354         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9355         {
9356             /* Check if we should use DLB at the second partitioning
9357              * and every 100 partitionings,
9358              * so the extra communication cost is negligible.
9359              */
9360             n         = max(100, nstglobalcomm);
9361             bCheckDLB = (comm->n_load_collect == 0 ||
9362                          comm->n_load_have % n == n-1);
9363         }
9364         else
9365         {
9366             bCheckDLB = FALSE;
9367         }
9368
9369         /* Print load every nstlog, first and last step to the log file */
9370         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9371                     comm->n_load_collect == 0 ||
9372                     (ir->nsteps >= 0 &&
9373                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9374
9375         /* Avoid extra communication due to verbose screen output
9376          * when nstglobalcomm is set.
9377          */
9378         if (bDoDLB || bLogLoad || bCheckDLB ||
9379             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9380         {
9381             get_load_distribution(dd, wcycle);
9382             if (DDMASTER(dd))
9383             {
9384                 if (bLogLoad)
9385                 {
9386                     dd_print_load(fplog, dd, step-1);
9387                 }
9388                 if (bVerbose)
9389                 {
9390                     dd_print_load_verbose(dd);
9391                 }
9392             }
9393             comm->n_load_collect++;
9394
9395             if (bCheckDLB)
9396             {
9397                 /* Since the timings are node dependent, the master decides */
9398                 if (DDMASTER(dd))
9399                 {
9400                     bTurnOnDLB =
9401                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9402                     if (debug)
9403                     {
9404                         fprintf(debug, "step %s, imb loss %f\n",
9405                                 gmx_step_str(step, sbuf),
9406                                 dd_force_imb_perf_loss(dd));
9407                     }
9408                 }
9409                 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9410                 if (bTurnOnDLB)
9411                 {
9412                     turn_on_dlb(fplog, cr, step);
9413                     bDoDLB = TRUE;
9414                 }
9415             }
9416         }
9417         comm->n_load_have++;
9418     }
9419
9420     cgs_gl = &comm->cgs_gl;
9421
9422     bRedist = FALSE;
9423     if (bMasterState)
9424     {
9425         /* Clear the old state */
9426         clear_dd_indices(dd, 0, 0);
9427         ncgindex_set = 0;
9428
9429         set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9430                   TRUE, cgs_gl, state_global->x, &ddbox);
9431
9432         get_cg_distribution(fplog, step, dd, cgs_gl,
9433                             state_global->box, &ddbox, state_global->x);
9434
9435         dd_distribute_state(dd, cgs_gl,
9436                             state_global, state_local, f);
9437
9438         dd_make_local_cgs(dd, &top_local->cgs);
9439
9440         /* Ensure that we have space for the new distribution */
9441         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9442
9443         if (fr->cutoff_scheme == ecutsGROUP)
9444         {
9445             calc_cgcm(fplog, 0, dd->ncg_home,
9446                       &top_local->cgs, state_local->x, fr->cg_cm);
9447         }
9448
9449         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9450
9451         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9452     }
9453     else if (state_local->ddp_count != dd->ddp_count)
9454     {
9455         if (state_local->ddp_count > dd->ddp_count)
9456         {
9457             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9458         }
9459
9460         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9461         {
9462             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9463         }
9464
9465         /* Clear the old state */
9466         clear_dd_indices(dd, 0, 0);
9467
9468         /* Build the new indices */
9469         rebuild_cgindex(dd, cgs_gl->index, state_local);
9470         make_dd_indices(dd, cgs_gl->index, 0);
9471         ncgindex_set = dd->ncg_home;
9472
9473         if (fr->cutoff_scheme == ecutsGROUP)
9474         {
9475             /* Redetermine the cg COMs */
9476             calc_cgcm(fplog, 0, dd->ncg_home,
9477                       &top_local->cgs, state_local->x, fr->cg_cm);
9478         }
9479
9480         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9481
9482         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9483
9484         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9485                   TRUE, &top_local->cgs, state_local->x, &ddbox);
9486
9487         bRedist = comm->bDynLoadBal;
9488     }
9489     else
9490     {
9491         /* We have the full state, only redistribute the cgs */
9492
9493         /* Clear the non-home indices */
9494         clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9495         ncgindex_set = 0;
9496
9497         /* Avoid global communication for dim's without pbc and -gcom */
9498         if (!bNStGlobalComm)
9499         {
9500             copy_rvec(comm->box0, ddbox.box0    );
9501             copy_rvec(comm->box_size, ddbox.box_size);
9502         }
9503         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9504                   bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9505
9506         bBoxChanged = TRUE;
9507         bRedist     = TRUE;
9508     }
9509     /* For dim's without pbc and -gcom */
9510     copy_rvec(ddbox.box0, comm->box0    );
9511     copy_rvec(ddbox.box_size, comm->box_size);
9512
9513     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9514                       step, wcycle);
9515
9516     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9517     {
9518         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9519     }
9520
9521     /* Check if we should sort the charge groups */
9522     if (comm->nstSortCG > 0)
9523     {
9524         bSortCG = (bMasterState ||
9525                    (bRedist && (step % comm->nstSortCG == 0)));
9526     }
9527     else
9528     {
9529         bSortCG = FALSE;
9530     }
9531
9532     ncg_home_old = dd->ncg_home;
9533
9534     ncg_moved = 0;
9535     if (bRedist)
9536     {
9537         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9538
9539         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9540                            state_local, f, fr,
9541                            !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9542
9543         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9544     }
9545
9546     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9547                           dd, &ddbox,
9548                           &comm->cell_x0, &comm->cell_x1,
9549                           dd->ncg_home, fr->cg_cm,
9550                           cell_ns_x0, cell_ns_x1, &grid_density);
9551
9552     if (bBoxChanged)
9553     {
9554         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9555     }
9556
9557     switch (fr->cutoff_scheme)
9558     {
9559         case ecutsGROUP:
9560             copy_ivec(fr->ns.grid->n, ncells_old);
9561             grid_first(fplog, fr->ns.grid, dd, &ddbox,
9562                        state_local->box, cell_ns_x0, cell_ns_x1,
9563                        fr->rlistlong, grid_density);
9564             break;
9565         case ecutsVERLET:
9566             nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9567             break;
9568         default:
9569             gmx_incons("unimplemented");
9570     }
9571     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9572     copy_ivec(ddbox.tric_dir, comm->tric_dir);
9573
9574     if (bSortCG)
9575     {
9576         wallcycle_sub_start(wcycle, ewcsDD_GRID);
9577
9578         /* Sort the state on charge group position.
9579          * This enables exact restarts from this step.
9580          * It also improves performance by about 15% with larger numbers
9581          * of atoms per node.
9582          */
9583
9584         /* Fill the ns grid with the home cell,
9585          * so we can sort with the indices.
9586          */
9587         set_zones_ncg_home(dd);
9588
9589         switch (fr->cutoff_scheme)
9590         {
9591             case ecutsVERLET:
9592                 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9593
9594                 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9595                                   0,
9596                                   comm->zones.size[0].bb_x0,
9597                                   comm->zones.size[0].bb_x1,
9598                                   0, dd->ncg_home,
9599                                   comm->zones.dens_zone0,
9600                                   fr->cginfo,
9601                                   state_local->x,
9602                                   ncg_moved, bRedist ? comm->moved : NULL,
9603                                   fr->nbv->grp[eintLocal].kernel_type,
9604                                   fr->nbv->grp[eintLocal].nbat);
9605
9606                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9607                 break;
9608             case ecutsGROUP:
9609                 fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
9610                           0, dd->ncg_home, fr->cg_cm);
9611
9612                 copy_ivec(fr->ns.grid->n, ncells_new);
9613                 break;
9614             default:
9615                 gmx_incons("unimplemented");
9616         }
9617
9618         bResortAll = bMasterState;
9619
9620         /* Check if we can user the old order and ns grid cell indices
9621          * of the charge groups to sort the charge groups efficiently.
9622          */
9623         if (ncells_new[XX] != ncells_old[XX] ||
9624             ncells_new[YY] != ncells_old[YY] ||
9625             ncells_new[ZZ] != ncells_old[ZZ])
9626         {
9627             bResortAll = TRUE;
9628         }
9629
9630         if (debug)
9631         {
9632             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9633                     gmx_step_str(step, sbuf), dd->ncg_home);
9634         }
9635         dd_sort_state(dd, fr->cg_cm, fr, state_local,
9636                       bResortAll ? -1 : ncg_home_old);
9637         /* Rebuild all the indices */
9638         ga2la_clear(dd->ga2la);
9639         ncgindex_set = 0;
9640
9641         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9642     }
9643
9644     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9645
9646     /* Setup up the communication and communicate the coordinates */
9647     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9648
9649     /* Set the indices */
9650     make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9651
9652     /* Set the charge group boundaries for neighbor searching */
9653     set_cg_boundaries(&comm->zones);
9654
9655     if (fr->cutoff_scheme == ecutsVERLET)
9656     {
9657         set_zones_size(dd, state_local->box, &ddbox,
9658                        bSortCG ? 1 : 0, comm->zones.n);
9659     }
9660
9661     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9662
9663     /*
9664        write_dd_pdb("dd_home",step,"dump",top_global,cr,
9665                  -1,state_local->x,state_local->box);
9666      */
9667
9668     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9669
9670     /* Extract a local topology from the global topology */
9671     for (i = 0; i < dd->ndim; i++)
9672     {
9673         np[dd->dim[i]] = comm->cd[i].np;
9674     }
9675     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9676                       comm->cellsize_min, np,
9677                       fr,
9678                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9679                       vsite, top_global, top_local);
9680
9681     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9682
9683     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9684
9685     /* Set up the special atom communication */
9686     n = comm->nat[ddnatZONE];
9687     for (i = ddnatZONE+1; i < ddnatNR; i++)
9688     {
9689         switch (i)
9690         {
9691             case ddnatVSITE:
9692                 if (vsite && vsite->n_intercg_vsite)
9693                 {
9694                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
9695                 }
9696                 break;
9697             case ddnatCON:
9698                 if (dd->bInterCGcons || dd->bInterCGsettles)
9699                 {
9700                     /* Only for inter-cg constraints we need special code */
9701                     n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9702                                                   constr, ir->nProjOrder,
9703                                                   top_local->idef.il);
9704                 }
9705                 break;
9706             default:
9707                 gmx_incons("Unknown special atom type setup");
9708         }
9709         comm->nat[i] = n;
9710     }
9711
9712     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9713
9714     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9715
9716     /* Make space for the extra coordinates for virtual site
9717      * or constraint communication.
9718      */
9719     state_local->natoms = comm->nat[ddnatNR-1];
9720     if (state_local->natoms > state_local->nalloc)
9721     {
9722         dd_realloc_state(state_local, f, state_local->natoms);
9723     }
9724
9725     if (fr->bF_NoVirSum)
9726     {
9727         if (vsite && vsite->n_intercg_vsite)
9728         {
9729             nat_f_novirsum = comm->nat[ddnatVSITE];
9730         }
9731         else
9732         {
9733             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9734             {
9735                 nat_f_novirsum = dd->nat_tot;
9736             }
9737             else
9738             {
9739                 nat_f_novirsum = dd->nat_home;
9740             }
9741         }
9742     }
9743     else
9744     {
9745         nat_f_novirsum = 0;
9746     }
9747
9748     /* Set the number of atoms required for the force calculation.
9749      * Forces need to be constrained when using a twin-range setup
9750      * or with energy minimization. For simple simulations we could
9751      * avoid some allocation, zeroing and copying, but this is
9752      * probably not worth the complications ande checking.
9753      */
9754     forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9755                         dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9756
9757     /* We make the all mdatoms up to nat_tot_con.
9758      * We could save some work by only setting invmass
9759      * between nat_tot and nat_tot_con.
9760      */
9761     /* This call also sets the new number of home particles to dd->nat_home */
9762     atoms2md(top_global, ir,
9763              comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);
9764
9765     /* Now we have the charges we can sort the FE interactions */
9766     dd_sort_local_top(dd, mdatoms, top_local);
9767
9768     if (vsite != NULL)
9769     {
9770         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9771         split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
9772                                   mdatoms, FALSE, vsite);
9773     }
9774
9775     if (shellfc)
9776     {
9777         /* Make the local shell stuff, currently no communication is done */
9778         make_local_shells(cr, mdatoms, shellfc);
9779     }
9780
9781     if (ir->implicit_solvent)
9782     {
9783         make_local_gb(cr, fr->born, ir->gb_algorithm);
9784     }
9785
9786     setup_bonded_threading(fr, &top_local->idef);
9787
9788     if (!(cr->duty & DUTY_PME))
9789     {
9790         /* Send the charges and/or c6/sigmas to our PME only node */
9791         gmx_pme_send_parameters(cr,
9792                                 fr->ic,
9793                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9794                                 mdatoms->chargeA, mdatoms->chargeB,
9795                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
9796                                 mdatoms->sigmaA, mdatoms->sigmaB,
9797                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9798     }
9799
9800     if (constr)
9801     {
9802         set_constraints(constr, top_local, ir, mdatoms, cr);
9803     }
9804
9805     if (ir->ePull != epullNO)
9806     {
9807         /* Update the local pull groups */
9808         dd_make_local_pull_groups(dd, ir->pull, mdatoms);
9809     }
9810
9811     if (ir->bRot)
9812     {
9813         /* Update the local rotation groups */
9814         dd_make_local_rotation_groups(dd, ir->rot);
9815     }
9816
9817     if (ir->eSwapCoords != eswapNO)
9818     {
9819         /* Update the local groups needed for ion swapping */
9820         dd_make_local_swap_groups(dd, ir->swap);
9821     }
9822
9823     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
9824     dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
9825
9826     add_dd_statistics(dd);
9827
9828     /* Make sure we only count the cycles for this DD partitioning */
9829     clear_dd_cycle_counts(dd);
9830
9831     /* Because the order of the atoms might have changed since
9832      * the last vsite construction, we need to communicate the constructing
9833      * atom coordinates again (for spreading the forces this MD step).
9834      */
9835     dd_move_x_vsites(dd, state_local->box, state_local->x);
9836
9837     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9838
9839     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9840     {
9841         dd_move_x(dd, state_local->box, state_local->x);
9842         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9843                      -1, state_local->x, state_local->box);
9844     }
9845
9846     /* Store the partitioning step */
9847     comm->partition_step = step;
9848
9849     /* Increase the DD partitioning counter */
9850     dd->ddp_count++;
9851     /* The state currently matches this DD partitioning count, store it */
9852     state_local->ddp_count = dd->ddp_count;
9853     if (bMasterState)
9854     {
9855         /* The DD master node knows the complete cg distribution,
9856          * store the count so we can possibly skip the cg info communication.
9857          */
9858         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9859     }
9860
9861     if (comm->DD_debug > 0)
9862     {
9863         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9864         check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9865                                 "after partitioning");
9866     }
9867 }