src/gromacs/mdlib/domdec.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  * This file is part of Gromacs        Copyright (c) 1991-2008
   5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation; either version 2
  10  * of the License, or (at your option) any later version.
  11  *
  12  * To help us fund GROMACS development, we humbly ask that you cite
  13  * the research papers on the package. Check out http://www.gromacs.org
  14  *
  15  * And Hey:
  16  * Gnomes, ROck Monsters And Chili Sauce
  17  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 #include <config.h>
  21 #endif
  22
  23 #include <stdio.h>
  24 #include <time.h>
  25 #include <math.h>
  26 #include <string.h>
  27 #include <stdlib.h>
  28 #include "typedefs.h"
  29 #include "smalloc.h"
  30 #include "gmx_fatal.h"
  31 #include "gmx_fatal_collective.h"
  32 #include "vec.h"
  33 #include "domdec.h"
  34 #include "domdec_network.h"
  35 #include "nrnb.h"
  36 #include "pbc.h"
  37 #include "chargegroup.h"
  38 #include "constr.h"
  39 #include "mdatoms.h"
  40 #include "names.h"
  41 #include "pdbio.h"
  42 #include "futil.h"
  43 #include "force.h"
  44 #include "pme.h"
  45 #include "pull.h"
  46 #include "pull_rotation.h"
  47 #include "gmx_wallcycle.h"
  48 #include "mdrun.h"
  49 #include "nsgrid.h"
  50 #include "shellfc.h"
  51 #include "mtop_util.h"
  52 #include "gmxfio.h"
  53 #include "gmx_ga2la.h"
  54 #include "gmx_sort.h"
  55 #include "macros.h"
  56 #include "nbnxn_search.h"
  57 #include "bondf.h"
  58 #include "gmx_omp_nthreads.h"
  59
  60 #ifdef GMX_LIB_MPI
  61 #include <mpi.h>
  62 #endif
  63 #ifdef GMX_THREAD_MPI
  64 #include "tmpi.h"
  65 #endif
  66
  67 #define DDRANK(dd,rank)    (rank)
  68 #define DDMASTERRANK(dd)   (dd->masterrank)
  69
  70 typedef struct gmx_domdec_master
  71 {
  72     /* The cell boundaries */
  73     real **cell_x;
  74     /* The global charge group division */
  75     int  *ncg;     /* Number of home charge groups for each node */
  76     int  *index;   /* Index of nnodes+1 into cg */
  77     int  *cg;      /* Global charge group index */
  78     int  *nat;     /* Number of home atoms for each node. */
  79     int  *ibuf;    /* Buffer for communication */
  80     rvec *vbuf;    /* Buffer for state scattering and gathering */
  81 } gmx_domdec_master_t;
  82
  83 typedef struct
  84 {
  85     /* The numbers of charge groups to send and receive for each cell
  86      * that requires communication, the last entry contains the total
  87      * number of atoms that needs to be communicated.
  88      */
  89     int nsend[DD_MAXIZONE+2];
  90     int nrecv[DD_MAXIZONE+2];
  91     /* The charge groups to send */
  92     int *index;
  93     int nalloc;
  94     /* The atom range for non-in-place communication */
  95     int cell2at0[DD_MAXIZONE];
  96     int cell2at1[DD_MAXIZONE];
  97 } gmx_domdec_ind_t;
  98
  99 typedef struct
 100 {
 101     int  np;                   /* Number of grid pulses in this dimension */
 102     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
 103     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
 104     int  np_nalloc;
 105     gmx_bool bInPlace;             /* Can we communicate in place?            */
 106 } gmx_domdec_comm_dim_t;
 107
 108 typedef struct
 109 {
 110     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 111     real *cell_f;      /* State var.: cell boundaries, box relative      */
 112     real *old_cell_f;  /* Temp. var.: old cell size                      */
 113     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 114     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 115     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 116     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 117     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
 118     real *buf_ncd;     /* Temp. var.                                     */
 119 } gmx_domdec_root_t;
 120
 121 #define DD_NLOAD_MAX 9
 122
 123 /* Here floats are accurate enough, since these variables
 124  * only influence the load balancing, not the actual MD results.
 125  */
 126 typedef struct
 127 {
 128     int  nload;
 129     float *load;
 130     float sum;
 131     float max;
 132     float sum_m;
 133     float cvol_min;
 134     float mdf;
 135     float pme;
 136     int   flags;
 137 } gmx_domdec_load_t;
 138
 139 typedef struct
 140 {
 141     int  nsc;
 142     int  ind_gl;
 143     int  ind;
 144 } gmx_cgsort_t;
 145
 146 typedef struct
 147 {
 148     gmx_cgsort_t *sort;
 149     gmx_cgsort_t *sort2;
 150     int  sort_nalloc;
 151     gmx_cgsort_t *sort_new;
 152     int  sort_new_nalloc;
 153     int  *ibuf;
 154     int  ibuf_nalloc;
 155 } gmx_domdec_sort_t;
 156
 157 typedef struct
 158 {
 159     rvec *v;
 160     int  nalloc;
 161 } vec_rvec_t;
 162
 163 /* This enum determines the order of the coordinates.
 164  * ddnatHOME and ddnatZONE should be first and second,
 165  * the others can be ordered as wanted.
 166  */
 167 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
 168
 169 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
 170 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 171
 172 typedef struct
 173 {
 174     int  dim;      /* The dimension                                          */
 175     gmx_bool dim_match;/* Tells if DD and PME dims match                         */
 176     int  nslab;    /* The number of PME slabs in this dimension              */
 177     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 178     int  *pp_min;  /* The minimum pp node location, size nslab               */
 179     int  *pp_max;  /* The maximum pp node location,size nslab                */
 180     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
 181 } gmx_ddpme_t;
 182
 183 typedef struct
 184 {
 185     real min0;    /* The minimum bottom of this zone                        */
 186     real max1;    /* The maximum top of this zone                           */
 187     real min1;    /* The minimum top of this zone                           */
 188     real mch0;    /* The maximum bottom communicaton height for this zone   */
 189     real mch1;    /* The maximum top communicaton height for this zone      */
 190     real p1_0;    /* The bottom value of the first cell in this zone        */
 191     real p1_1;    /* The top value of the first cell in this zone           */
 192 } gmx_ddzone_t;
 193
 194 typedef struct
 195 {
 196     gmx_domdec_ind_t ind;
 197     int *ibuf;
 198     int ibuf_nalloc;
 199     vec_rvec_t vbuf;
 200     int nsend;
 201     int nat;
 202     int nsend_zone;
 203 } dd_comm_setup_work_t;
 204
 205 typedef struct gmx_domdec_comm
 206 {
 207     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 208      * unless stated otherwise.
 209      */
 210
 211     /* The number of decomposition dimensions for PME, 0: no PME */
 212     int  npmedecompdim;
 213     /* The number of nodes doing PME (PP/PME or only PME) */
 214     int  npmenodes;
 215     int  npmenodes_x;
 216     int  npmenodes_y;
 217     /* The communication setup including the PME only nodes */
 218     gmx_bool bCartesianPP_PME;
 219     ivec ntot;
 220     int  cartpmedim;
 221     int  *pmenodes;          /* size npmenodes                         */
 222     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 223                               * but with bCartesianPP_PME              */
 224     gmx_ddpme_t ddpme[2];
 225
 226     /* The DD particle-particle nodes only */
 227     gmx_bool bCartesianPP;
 228     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 229
 230     /* The global charge groups */
 231     t_block cgs_gl;
 232
 233     /* Should we sort the cgs */
 234     int  nstSortCG;
 235     gmx_domdec_sort_t *sort;
 236
 237     /* Are there charge groups? */
 238     gmx_bool bCGs;
 239
 240     /* Are there bonded and multi-body interactions between charge groups? */
 241     gmx_bool bInterCGBondeds;
 242     gmx_bool bInterCGMultiBody;
 243
 244     /* Data for the optional bonded interaction atom communication range */
 245     gmx_bool bBondComm;
 246     t_blocka *cglink;
 247     char *bLocalCG;
 248
 249     /* The DLB option */
 250     int  eDLB;
 251     /* Are we actually using DLB? */
 252     gmx_bool bDynLoadBal;
 253
 254     /* Cell sizes for static load balancing, first index cartesian */
 255     real **slb_frac;
 256
 257     /* The width of the communicated boundaries */
 258     real cutoff_mbody;
 259     real cutoff;
 260     /* The minimum cell size (including triclinic correction) */
 261     rvec cellsize_min;
 262     /* For dlb, for use with edlbAUTO */
 263     rvec cellsize_min_dlb;
 264     /* The lower limit for the DD cell size with DLB */
 265     real cellsize_limit;
 266     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 267     gmx_bool bVacDLBNoLimit;
 268
 269     /* With PME load balancing we set limits on DLB */
 270     gmx_bool bPMELoadBalDLBLimits;
 271     /* DLB needs to take into account that we want to allow this maximum
 272      * cut-off (for PME load balancing), this could limit cell boundaries.
 273      */
 274     real PMELoadBal_max_cutoff;
 275
 276     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 277     ivec tric_dir;
 278     /* box0 and box_size are required with dim's without pbc and -gcom */
 279     rvec box0;
 280     rvec box_size;
 281
 282     /* The cell boundaries */
 283     rvec cell_x0;
 284     rvec cell_x1;
 285
 286     /* The old location of the cell boundaries, to check cg displacements */
 287     rvec old_cell_x0;
 288     rvec old_cell_x1;
 289
 290     /* The communication setup and charge group boundaries for the zones */
 291     gmx_domdec_zones_t zones;
 292
 293     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 294      * cell boundaries of neighboring cells for dynamic load balancing.
 295      */
 296     gmx_ddzone_t zone_d1[2];
 297     gmx_ddzone_t zone_d2[2][2];
 298
 299     /* The coordinate/force communication setup and indices */
 300     gmx_domdec_comm_dim_t cd[DIM];
 301     /* The maximum number of cells to communicate with in one dimension */
 302     int  maxpulse;
 303
 304     /* Which cg distribution is stored on the master node */
 305     int master_cg_ddp_count;
 306
 307     /* The number of cg's received from the direct neighbors */
 308     int  zone_ncg1[DD_MAXZONE];
 309
 310     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 311     int  nat[ddnatNR];
 312
 313     /* Array for signalling if atoms have moved to another domain */
 314     int  *moved;
 315     int  moved_nalloc;
 316
 317     /* Communication buffer for general use */
 318     int  *buf_int;
 319     int  nalloc_int;
 320
 321     /* Communication buffer for general use */
 322     vec_rvec_t vbuf;
 323
 324     /* Temporary storage for thread parallel communication setup */
 325     int nth;
 326     dd_comm_setup_work_t *dth;
 327
 328     /* Communication buffers only used with multiple grid pulses */
 329     int  *buf_int2;
 330     int  nalloc_int2;
 331     vec_rvec_t vbuf2;
 332
 333     /* Communication buffers for local redistribution */
 334     int  **cggl_flag;
 335     int  cggl_flag_nalloc[DIM*2];
 336     rvec **cgcm_state;
 337     int  cgcm_state_nalloc[DIM*2];
 338
 339     /* Cell sizes for dynamic load balancing */
 340     gmx_domdec_root_t **root;
 341     real *cell_f_row;
 342     real cell_f0[DIM];
 343     real cell_f1[DIM];
 344     real cell_f_max0[DIM];
 345     real cell_f_min1[DIM];
 346
 347     /* Stuff for load communication */
 348     gmx_bool bRecordLoad;
 349     gmx_domdec_load_t *load;
 350 #ifdef GMX_MPI
 351     MPI_Comm *mpi_comm_load;
 352 #endif
 353
 354     /* Maximum DLB scaling per load balancing step in percent */
 355     int dlb_scale_lim;
 356
 357     /* Cycle counters */
 358     float cycl[ddCyclNr];
 359     int   cycl_n[ddCyclNr];
 360     float cycl_max[ddCyclNr];
 361     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 362     int eFlop;
 363     double flop;
 364     int    flop_n;
 365     /* Have often have did we have load measurements */
 366     int    n_load_have;
 367     /* Have often have we collected the load measurements */
 368     int    n_load_collect;
 369
 370     /* Statistics */
 371     double sum_nat[ddnatNR-ddnatZONE];
 372     int    ndecomp;
 373     int    nload;
 374     double load_step;
 375     double load_sum;
 376     double load_max;
 377     ivec   load_lim;
 378     double load_mdf;
 379     double load_pme;
 380
 381     /* The last partition step */
 382     gmx_large_int_t partition_step;
 383
 384     /* Debugging */
 385     int  nstDDDump;
 386     int  nstDDDumpGrid;
 387     int  DD_debug;
 388 } gmx_domdec_comm_t;
 389
 390 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 391 #define DD_CGIBS 2
 392
 393 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 394 #define DD_FLAG_NRCG  65535
 395 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 396 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 397
 398 /* Zone permutation required to obtain consecutive charge groups
 399  * for neighbor searching.
 400  */
 401 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
 402
 403 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 404  * components see only j zones with that component 0.
 405  */
 406
 407 /* The DD zone order */
 408 static const ivec dd_zo[DD_MAXZONE] =
 409   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
 410
 411 /* The 3D setup */
 412 #define dd_z3n  8
 413 #define dd_zp3n 4
 414 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
 415
 416 /* The 2D setup */
 417 #define dd_z2n  4
 418 #define dd_zp2n 2
 419 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
 420
 421 /* The 1D setup */
 422 #define dd_z1n  2
 423 #define dd_zp1n 1
 424 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
 425
 426 /* Factors used to avoid problems due to rounding issues */
 427 #define DD_CELL_MARGIN       1.0001
 428 #define DD_CELL_MARGIN2      1.00005
 429 /* Factor to account for pressure scaling during nstlist steps */
 430 #define DD_PRES_SCALE_MARGIN 1.02
 431
 432 /* Allowed performance loss before we DLB or warn */
 433 #define DD_PERF_LOSS 0.05
 434
 435 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 436
 437 /* Use separate MPI send and receive commands
 438  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 439  * This saves memory (and some copying for small nnodes).
 440  * For high parallelization scatter and gather calls are used.
 441  */
 442 #define GMX_DD_NNODES_SENDRECV 4
 443
 444
 445 /*
 446 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 447
 448 static void index2xyz(ivec nc,int ind,ivec xyz)
 449 {
 450   xyz[XX] = ind % nc[XX];
 451   xyz[YY] = (ind / nc[XX]) % nc[YY];
 452   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 453 }
 454 */
 455
 456 /* This order is required to minimize the coordinate communication in PME
 457  * which uses decomposition in the x direction.
 458  */
 459 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 460
 461 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
 462 {
 463     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 464     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 465     xyz[ZZ] = ind % nc[ZZ];
 466 }
 467
 468 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
 469 {
 470     int ddindex;
 471     int ddnodeid=-1;
 472
 473     ddindex = dd_index(dd->nc,c);
 474     if (dd->comm->bCartesianPP_PME)
 475     {
 476         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 477     }
 478     else if (dd->comm->bCartesianPP)
 479     {
 480 #ifdef GMX_MPI
 481         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
 482 #endif
 483     }
 484     else
 485     {
 486         ddnodeid = ddindex;
 487     }
 488
 489     return ddnodeid;
 490 }
 491
 492 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
 493 {
 494     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 495 }
 496
 497 int ddglatnr(gmx_domdec_t *dd,int i)
 498 {
 499     int atnr;
 500
 501     if (dd == NULL)
 502     {
 503         atnr = i + 1;
 504     }
 505     else
 506     {
 507         if (i >= dd->comm->nat[ddnatNR-1])
 508         {
 509             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
 510         }
 511         atnr = dd->gatindex[i] + 1;
 512     }
 513
 514     return atnr;
 515 }
 516
 517 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 518 {
 519     return &dd->comm->cgs_gl;
 520 }
 521
 522 static void vec_rvec_init(vec_rvec_t *v)
 523 {
 524     v->nalloc = 0;
 525     v->v      = NULL;
 526 }
 527
 528 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
 529 {
 530     if (n > v->nalloc)
 531     {
 532         v->nalloc = over_alloc_dd(n);
 533         srenew(v->v,v->nalloc);
 534     }
 535 }
 536
 537 void dd_store_state(gmx_domdec_t *dd,t_state *state)
 538 {
 539     int i;
 540
 541     if (state->ddp_count != dd->ddp_count)
 542     {
 543         gmx_incons("The state does not the domain decomposition state");
 544     }
 545
 546     state->ncg_gl = dd->ncg_home;
 547     if (state->ncg_gl > state->cg_gl_nalloc)
 548     {
 549         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 550         srenew(state->cg_gl,state->cg_gl_nalloc);
 551     }
 552     for(i=0; i<state->ncg_gl; i++)
 553     {
 554         state->cg_gl[i] = dd->index_gl[i];
 555     }
 556
 557     state->ddp_count_cg_gl = dd->ddp_count;
 558 }
 559
 560 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 561 {
 562     return &dd->comm->zones;
 563 }
 564
 565 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
 566                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
 567 {
 568     gmx_domdec_zones_t *zones;
 569     int izone,d,dim;
 570
 571     zones = &dd->comm->zones;
 572
 573     izone = 0;
 574     while (icg >= zones->izone[izone].cg1)
 575     {
 576         izone++;
 577     }
 578
 579     if (izone == 0)
 580     {
 581         *jcg0 = icg;
 582     }
 583     else if (izone < zones->nizone)
 584     {
 585         *jcg0 = zones->izone[izone].jcg0;
 586     }
 587     else
 588     {
 589         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
 590                   icg,izone,zones->nizone);
 591     }
 592
 593     *jcg1 = zones->izone[izone].jcg1;
 594
 595     for(d=0; d<dd->ndim; d++)
 596     {
 597         dim = dd->dim[d];
 598         shift0[dim] = zones->izone[izone].shift0[dim];
 599         shift1[dim] = zones->izone[izone].shift1[dim];
 600         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 601         {
 602             /* A conservative approach, this can be optimized */
 603             shift0[dim] -= 1;
 604             shift1[dim] += 1;
 605         }
 606     }
 607 }
 608
 609 int dd_natoms_vsite(gmx_domdec_t *dd)
 610 {
 611     return dd->comm->nat[ddnatVSITE];
 612 }
 613
 614 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
 615 {
 616     *at_start = dd->comm->nat[ddnatCON-1];
 617     *at_end   = dd->comm->nat[ddnatCON];
 618 }
 619
 620 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
 621 {
 622     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 623     int  *index,*cgindex;
 624     gmx_domdec_comm_t *comm;
 625     gmx_domdec_comm_dim_t *cd;
 626     gmx_domdec_ind_t *ind;
 627     rvec shift={0,0,0},*buf,*rbuf;
 628     gmx_bool bPBC,bScrew;
 629
 630     comm = dd->comm;
 631
 632     cgindex = dd->cgindex;
 633
 634     buf = comm->vbuf.v;
 635
 636     nzone = 1;
 637     nat_tot = dd->nat_home;
 638     for(d=0; d<dd->ndim; d++)
 639     {
 640         bPBC   = (dd->ci[dd->dim[d]] == 0);
 641         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 642         if (bPBC)
 643         {
 644             copy_rvec(box[dd->dim[d]],shift);
 645         }
 646         cd = &comm->cd[d];
 647         for(p=0; p<cd->np; p++)
 648         {
 649             ind = &cd->ind[p];
 650             index = ind->index;
 651             n = 0;
 652             if (!bPBC)
 653             {
 654                 for(i=0; i<ind->nsend[nzone]; i++)
 655                 {
 656                     at0 = cgindex[index[i]];
 657                     at1 = cgindex[index[i]+1];
 658                     for(j=at0; j<at1; j++)
 659                     {
 660                         copy_rvec(x[j],buf[n]);
 661                         n++;
 662                     }
 663                 }
 664             }
 665             else if (!bScrew)
 666             {
 667                 for(i=0; i<ind->nsend[nzone]; i++)
 668                 {
 669                     at0 = cgindex[index[i]];
 670                     at1 = cgindex[index[i]+1];
 671                     for(j=at0; j<at1; j++)
 672                     {
 673                         /* We need to shift the coordinates */
 674                         rvec_add(x[j],shift,buf[n]);
 675                         n++;
 676                     }
 677                 }
 678             }
 679             else
 680             {
 681                 for(i=0; i<ind->nsend[nzone]; i++)
 682                 {
 683                     at0 = cgindex[index[i]];
 684                     at1 = cgindex[index[i]+1];
 685                     for(j=at0; j<at1; j++)
 686                     {
 687                         /* Shift x */
 688                         buf[n][XX] = x[j][XX] + shift[XX];
 689                         /* Rotate y and z.
 690                          * This operation requires a special shift force
 691                          * treatment, which is performed in calc_vir.
 692                          */
 693                         buf[n][YY] = box[YY][YY] - x[j][YY];
 694                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 695                         n++;
 696                     }
 697                 }
 698             }
 699
 700             if (cd->bInPlace)
 701             {
 702                 rbuf = x + nat_tot;
 703             }
 704             else
 705             {
 706                 rbuf = comm->vbuf2.v;
 707             }
 708             /* Send and receive the coordinates */
 709             dd_sendrecv_rvec(dd, d, dddirBackward,
 710                              buf,  ind->nsend[nzone+1],
 711                              rbuf, ind->nrecv[nzone+1]);
 712             if (!cd->bInPlace)
 713             {
 714                 j = 0;
 715                 for(zone=0; zone<nzone; zone++)
 716                 {
 717                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 718                     {
 719                         copy_rvec(rbuf[j],x[i]);
 720                         j++;
 721                     }
 722                 }
 723             }
 724             nat_tot += ind->nrecv[nzone+1];
 725         }
 726         nzone += nzone;
 727     }
 728 }
 729
 730 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
 731 {
 732     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 733     int  *index,*cgindex;
 734     gmx_domdec_comm_t *comm;
 735     gmx_domdec_comm_dim_t *cd;
 736     gmx_domdec_ind_t *ind;
 737     rvec *buf,*sbuf;
 738     ivec vis;
 739     int  is;
 740     gmx_bool bPBC,bScrew;
 741
 742     comm = dd->comm;
 743
 744     cgindex = dd->cgindex;
 745
 746     buf = comm->vbuf.v;
 747
 748     n = 0;
 749     nzone = comm->zones.n/2;
 750     nat_tot = dd->nat_tot;
 751     for(d=dd->ndim-1; d>=0; d--)
 752     {
 753         bPBC   = (dd->ci[dd->dim[d]] == 0);
 754         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 755         if (fshift == NULL && !bScrew)
 756         {
 757             bPBC = FALSE;
 758         }
 759         /* Determine which shift vector we need */
 760         clear_ivec(vis);
 761         vis[dd->dim[d]] = 1;
 762         is = IVEC2IS(vis);
 763
 764         cd = &comm->cd[d];
 765         for(p=cd->np-1; p>=0; p--) {
 766             ind = &cd->ind[p];
 767             nat_tot -= ind->nrecv[nzone+1];
 768             if (cd->bInPlace)
 769             {
 770                 sbuf = f + nat_tot;
 771             }
 772             else
 773             {
 774                 sbuf = comm->vbuf2.v;
 775                 j = 0;
 776                 for(zone=0; zone<nzone; zone++)
 777                 {
 778                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 779                     {
 780                         copy_rvec(f[i],sbuf[j]);
 781                         j++;
 782                     }
 783                 }
 784             }
 785             /* Communicate the forces */
 786             dd_sendrecv_rvec(dd, d, dddirForward,
 787                              sbuf, ind->nrecv[nzone+1],
 788                              buf,  ind->nsend[nzone+1]);
 789             index = ind->index;
 790             /* Add the received forces */
 791             n = 0;
 792             if (!bPBC)
 793             {
 794                 for(i=0; i<ind->nsend[nzone]; i++)
 795                 {
 796                     at0 = cgindex[index[i]];
 797                     at1 = cgindex[index[i]+1];
 798                     for(j=at0; j<at1; j++)
 799                     {
 800                         rvec_inc(f[j],buf[n]);
 801                         n++;
 802                     }
 803                 }
 804             }
 805             else if (!bScrew)
 806             {
 807                 for(i=0; i<ind->nsend[nzone]; i++)
 808                 {
 809                     at0 = cgindex[index[i]];
 810                     at1 = cgindex[index[i]+1];
 811                     for(j=at0; j<at1; j++)
 812                     {
 813                         rvec_inc(f[j],buf[n]);
 814                         /* Add this force to the shift force */
 815                         rvec_inc(fshift[is],buf[n]);
 816                         n++;
 817                     }
 818                 }
 819             }
 820             else
 821             {
 822                 for(i=0; i<ind->nsend[nzone]; i++)
 823                 {
 824                     at0 = cgindex[index[i]];
 825                     at1 = cgindex[index[i]+1];
 826                     for(j=at0; j<at1; j++)
 827                     {
 828                         /* Rotate the force */
 829                         f[j][XX] += buf[n][XX];
 830                         f[j][YY] -= buf[n][YY];
 831                         f[j][ZZ] -= buf[n][ZZ];
 832                         if (fshift)
 833                         {
 834                             /* Add this force to the shift force */
 835                             rvec_inc(fshift[is],buf[n]);
 836                         }
 837                         n++;
 838                     }
 839                 }
 840             }
 841         }
 842         nzone /= 2;
 843     }
 844 }
 845
 846 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
 847 {
 848     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 849     int  *index,*cgindex;
 850     gmx_domdec_comm_t *comm;
 851     gmx_domdec_comm_dim_t *cd;
 852     gmx_domdec_ind_t *ind;
 853     real *buf,*rbuf;
 854
 855     comm = dd->comm;
 856
 857     cgindex = dd->cgindex;
 858
 859     buf = &comm->vbuf.v[0][0];
 860
 861     nzone = 1;
 862     nat_tot = dd->nat_home;
 863     for(d=0; d<dd->ndim; d++)
 864     {
 865         cd = &comm->cd[d];
 866         for(p=0; p<cd->np; p++)
 867         {
 868             ind = &cd->ind[p];
 869             index = ind->index;
 870             n = 0;
 871             for(i=0; i<ind->nsend[nzone]; i++)
 872             {
 873                 at0 = cgindex[index[i]];
 874                 at1 = cgindex[index[i]+1];
 875                 for(j=at0; j<at1; j++)
 876                 {
 877                     buf[n] = v[j];
 878                     n++;
 879                 }
 880             }
 881
 882             if (cd->bInPlace)
 883             {
 884                 rbuf = v + nat_tot;
 885             }
 886             else
 887             {
 888                 rbuf = &comm->vbuf2.v[0][0];
 889             }
 890             /* Send and receive the coordinates */
 891             dd_sendrecv_real(dd, d, dddirBackward,
 892                              buf,  ind->nsend[nzone+1],
 893                              rbuf, ind->nrecv[nzone+1]);
 894             if (!cd->bInPlace)
 895             {
 896                 j = 0;
 897                 for(zone=0; zone<nzone; zone++)
 898                 {
 899                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 900                     {
 901                         v[i] = rbuf[j];
 902                         j++;
 903                     }
 904                 }
 905             }
 906             nat_tot += ind->nrecv[nzone+1];
 907         }
 908         nzone += nzone;
 909     }
 910 }
 911
 912 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
 913 {
 914     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 915     int  *index,*cgindex;
 916     gmx_domdec_comm_t *comm;
 917     gmx_domdec_comm_dim_t *cd;
 918     gmx_domdec_ind_t *ind;
 919     real *buf,*sbuf;
 920
 921     comm = dd->comm;
 922
 923     cgindex = dd->cgindex;
 924
 925     buf = &comm->vbuf.v[0][0];
 926
 927     n = 0;
 928     nzone = comm->zones.n/2;
 929     nat_tot = dd->nat_tot;
 930     for(d=dd->ndim-1; d>=0; d--)
 931     {
 932         cd = &comm->cd[d];
 933         for(p=cd->np-1; p>=0; p--) {
 934             ind = &cd->ind[p];
 935             nat_tot -= ind->nrecv[nzone+1];
 936             if (cd->bInPlace)
 937             {
 938                 sbuf = v + nat_tot;
 939             }
 940             else
 941             {
 942                 sbuf = &comm->vbuf2.v[0][0];
 943                 j = 0;
 944                 for(zone=0; zone<nzone; zone++)
 945                 {
 946                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 947                     {
 948                         sbuf[j] = v[i];
 949                         j++;
 950                     }
 951                 }
 952             }
 953             /* Communicate the forces */
 954             dd_sendrecv_real(dd, d, dddirForward,
 955                              sbuf, ind->nrecv[nzone+1],
 956                              buf,  ind->nsend[nzone+1]);
 957             index = ind->index;
 958             /* Add the received forces */
 959             n = 0;
 960             for(i=0; i<ind->nsend[nzone]; i++)
 961             {
 962                 at0 = cgindex[index[i]];
 963                 at1 = cgindex[index[i]+1];
 964                 for(j=at0; j<at1; j++)
 965                 {
 966                     v[j] += buf[n];
 967                     n++;
 968                 }
 969             }
 970         }
 971         nzone /= 2;
 972     }
 973 }
 974
 975 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
 976 {
 977     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 978             d,i,j,
 979             zone->min0,zone->max1,
 980             zone->mch0,zone->mch0,
 981             zone->p1_0,zone->p1_1);
 982 }
 983
 984
 985 #define DDZONECOMM_MAXZONE  5
 986 #define DDZONECOMM_BUFSIZE  3
 987
 988 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 989                                int ddimind,int direction,
 990                                gmx_ddzone_t *buf_s,int n_s,
 991                                gmx_ddzone_t *buf_r,int n_r)
 992 {
 993 #define ZBS  DDZONECOMM_BUFSIZE
 994     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 995     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 996     int i;
 997
 998     for(i=0; i<n_s; i++)
 999     {
1000         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1001         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1002         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1003         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1004         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1005         vbuf_s[i*ZBS+1][2] = 0;
1006         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1007         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1008         vbuf_s[i*ZBS+2][2] = 0;
1009     }
1010
1011     dd_sendrecv_rvec(dd, ddimind, direction,
1012                      vbuf_s, n_s*ZBS,
1013                      vbuf_r, n_r*ZBS);
1014
1015     for(i=0; i<n_r; i++)
1016     {
1017         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1018         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1019         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1020         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1021         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1022         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1023         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1024     }
1025
1026 #undef ZBS
1027 }
1028
1029 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
1030                           rvec cell_ns_x0,rvec cell_ns_x1)
1031 {
1032     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
1033     gmx_ddzone_t *zp;
1034     gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
1035     gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
1036     gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
1037     rvec extr_s[2],extr_r[2];
1038     rvec dh;
1039     real dist_d,c=0,det;
1040     gmx_domdec_comm_t *comm;
1041     gmx_bool bPBC,bUse;
1042
1043     comm = dd->comm;
1044
1045     for(d=1; d<dd->ndim; d++)
1046     {
1047         dim = dd->dim[d];
1048         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1049         zp->min0 = cell_ns_x0[dim];
1050         zp->max1 = cell_ns_x1[dim];
1051         zp->min1 = cell_ns_x1[dim];
1052         zp->mch0 = cell_ns_x0[dim];
1053         zp->mch1 = cell_ns_x1[dim];
1054         zp->p1_0 = cell_ns_x0[dim];
1055         zp->p1_1 = cell_ns_x1[dim];
1056     }
1057
1058     for(d=dd->ndim-2; d>=0; d--)
1059     {
1060         dim  = dd->dim[d];
1061         bPBC = (dim < ddbox->npbcdim);
1062
1063         /* Use an rvec to store two reals */
1064         extr_s[d][0] = comm->cell_f0[d+1];
1065         extr_s[d][1] = comm->cell_f1[d+1];
1066         extr_s[d][2] = comm->cell_f1[d+1];
1067
1068         pos = 0;
1069         /* Store the extremes in the backward sending buffer,
1070          * so the get updated separately from the forward communication.
1071          */
1072         for(d1=d; d1<dd->ndim-1; d1++)
1073         {
1074             /* We invert the order to be able to use the same loop for buf_e */
1075             buf_s[pos].min0 = extr_s[d1][1];
1076             buf_s[pos].max1 = extr_s[d1][0];
1077             buf_s[pos].min1 = extr_s[d1][2];
1078             buf_s[pos].mch0 = 0;
1079             buf_s[pos].mch1 = 0;
1080             /* Store the cell corner of the dimension we communicate along */
1081             buf_s[pos].p1_0 = comm->cell_x0[dim];
1082             buf_s[pos].p1_1 = 0;
1083             pos++;
1084         }
1085
1086         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1087         pos++;
1088
1089         if (dd->ndim == 3 && d == 0)
1090         {
1091             buf_s[pos] = comm->zone_d2[0][1];
1092             pos++;
1093             buf_s[pos] = comm->zone_d1[0];
1094             pos++;
1095         }
1096
1097         /* We only need to communicate the extremes
1098          * in the forward direction
1099          */
1100         npulse = comm->cd[d].np;
1101         if (bPBC)
1102         {
1103             /* Take the minimum to avoid double communication */
1104             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1105         }
1106         else
1107         {
1108             /* Without PBC we should really not communicate over
1109              * the boundaries, but implementing that complicates
1110              * the communication setup and therefore we simply
1111              * do all communication, but ignore some data.
1112              */
1113             npulse_min = npulse;
1114         }
1115         for(p=0; p<npulse_min; p++)
1116         {
1117             /* Communicate the extremes forward */
1118             bUse = (bPBC || dd->ci[dim] > 0);
1119
1120             dd_sendrecv_rvec(dd, d, dddirForward,
1121                              extr_s+d, dd->ndim-d-1,
1122                              extr_r+d, dd->ndim-d-1);
1123
1124             if (bUse)
1125             {
1126                 for(d1=d; d1<dd->ndim-1; d1++)
1127                 {
1128                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1129                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1130                     extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
1131                 }
1132             }
1133         }
1134
1135         buf_size = pos;
1136         for(p=0; p<npulse; p++)
1137         {
1138             /* Communicate all the zone information backward */
1139             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1140
1141             dd_sendrecv_ddzone(dd, d, dddirBackward,
1142                                buf_s, buf_size,
1143                                buf_r, buf_size);
1144
1145             clear_rvec(dh);
1146             if (p > 0)
1147             {
1148                 for(d1=d+1; d1<dd->ndim; d1++)
1149                 {
1150                     /* Determine the decrease of maximum required
1151                      * communication height along d1 due to the distance along d,
1152                      * this avoids a lot of useless atom communication.
1153                      */
1154                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1155
1156                     if (ddbox->tric_dir[dim])
1157                     {
1158                         /* c is the off-diagonal coupling between the cell planes
1159                          * along directions d and d1.
1160                          */
1161                         c = ddbox->v[dim][dd->dim[d1]][dim];
1162                     }
1163                     else
1164                     {
1165                         c = 0;
1166                     }
1167                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1168                     if (det > 0)
1169                     {
1170                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1171                     }
1172                     else
1173                     {
1174                         /* A negative value signals out of range */
1175                         dh[d1] = -1;
1176                     }
1177                 }
1178             }
1179
1180             /* Accumulate the extremes over all pulses */
1181             for(i=0; i<buf_size; i++)
1182             {
1183                 if (p == 0)
1184                 {
1185                     buf_e[i] = buf_r[i];
1186                 }
1187                 else
1188                 {
1189                     if (bUse)
1190                     {
1191                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1192                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1193                         buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
1194                     }
1195
1196                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1197                     {
1198                         d1 = 1;
1199                     }
1200                     else
1201                     {
1202                         d1 = d + 1;
1203                     }
1204                     if (bUse && dh[d1] >= 0)
1205                     {
1206                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1207                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1208                     }
1209                 }
1210                 /* Copy the received buffer to the send buffer,
1211                  * to pass the data through with the next pulse.
1212                  */
1213                 buf_s[i] = buf_r[i];
1214             }
1215             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1216                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1217             {
1218                 /* Store the extremes */
1219                 pos = 0;
1220
1221                 for(d1=d; d1<dd->ndim-1; d1++)
1222                 {
1223                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1224                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1225                     extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
1226                     pos++;
1227                 }
1228
1229                 if (d == 1 || (d == 0 && dd->ndim == 3))
1230                 {
1231                     for(i=d; i<2; i++)
1232                     {
1233                         comm->zone_d2[1-d][i] = buf_e[pos];
1234                         pos++;
1235                     }
1236                 }
1237                 if (d == 0)
1238                 {
1239                     comm->zone_d1[1] = buf_e[pos];
1240                     pos++;
1241                 }
1242             }
1243         }
1244     }
1245
1246     if (dd->ndim >= 2)
1247     {
1248         dim = dd->dim[1];
1249         for(i=0; i<2; i++)
1250         {
1251             if (debug)
1252             {
1253                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1254             }
1255             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1256             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1257         }
1258     }
1259     if (dd->ndim >= 3)
1260     {
1261         dim = dd->dim[2];
1262         for(i=0; i<2; i++)
1263         {
1264             for(j=0; j<2; j++)
1265             {
1266                 if (debug)
1267                 {
1268                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1269                 }
1270                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1271                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1272             }
1273         }
1274     }
1275     for(d=1; d<dd->ndim; d++)
1276     {
1277         comm->cell_f_max0[d] = extr_s[d-1][0];
1278         comm->cell_f_min1[d] = extr_s[d-1][1];
1279         if (debug)
1280         {
1281             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1282                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1283         }
1284     }
1285 }
1286
1287 static void dd_collect_cg(gmx_domdec_t *dd,
1288                           t_state *state_local)
1289 {
1290     gmx_domdec_master_t *ma=NULL;
1291     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1292     t_block *cgs_gl;
1293
1294     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1295     {
1296         /* The master has the correct distribution */
1297         return;
1298     }
1299
1300     if (state_local->ddp_count == dd->ddp_count)
1301     {
1302         ncg_home = dd->ncg_home;
1303         cg       = dd->index_gl;
1304         nat_home = dd->nat_home;
1305     }
1306     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1307     {
1308         cgs_gl = &dd->comm->cgs_gl;
1309
1310         ncg_home = state_local->ncg_gl;
1311         cg       = state_local->cg_gl;
1312         nat_home = 0;
1313         for(i=0; i<ncg_home; i++)
1314         {
1315             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1316         }
1317     }
1318     else
1319     {
1320         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1321     }
1322
1323     buf2[0] = dd->ncg_home;
1324     buf2[1] = dd->nat_home;
1325     if (DDMASTER(dd))
1326     {
1327         ma = dd->ma;
1328         ibuf = ma->ibuf;
1329     }
1330     else
1331     {
1332         ibuf = NULL;
1333     }
1334     /* Collect the charge group and atom counts on the master */
1335     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1336
1337     if (DDMASTER(dd))
1338     {
1339         ma->index[0] = 0;
1340         for(i=0; i<dd->nnodes; i++)
1341         {
1342             ma->ncg[i] = ma->ibuf[2*i];
1343             ma->nat[i] = ma->ibuf[2*i+1];
1344             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1345
1346         }
1347         /* Make byte counts and indices */
1348         for(i=0; i<dd->nnodes; i++)
1349         {
1350             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1351             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1352         }
1353         if (debug)
1354         {
1355             fprintf(debug,"Initial charge group distribution: ");
1356             for(i=0; i<dd->nnodes; i++)
1357                 fprintf(debug," %d",ma->ncg[i]);
1358             fprintf(debug,"\n");
1359         }
1360     }
1361
1362     /* Collect the charge group indices on the master */
1363     dd_gatherv(dd,
1364                dd->ncg_home*sizeof(int),dd->index_gl,
1365                DDMASTER(dd) ? ma->ibuf : NULL,
1366                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1367                DDMASTER(dd) ? ma->cg : NULL);
1368
1369     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1370 }
1371
1372 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1373                                     rvec *lv,rvec *v)
1374 {
1375     gmx_domdec_master_t *ma;
1376     int  n,i,c,a,nalloc=0;
1377     rvec *buf=NULL;
1378     t_block *cgs_gl;
1379
1380     ma = dd->ma;
1381
1382     if (!DDMASTER(dd))
1383     {
1384 #ifdef GMX_MPI
1385         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1386                  dd->rank,dd->mpi_comm_all);
1387 #endif
1388     } else {
1389         /* Copy the master coordinates to the global array */
1390         cgs_gl = &dd->comm->cgs_gl;
1391
1392         n = DDMASTERRANK(dd);
1393         a = 0;
1394         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1395         {
1396             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1397             {
1398                 copy_rvec(lv[a++],v[c]);
1399             }
1400         }
1401
1402         for(n=0; n<dd->nnodes; n++)
1403         {
1404             if (n != dd->rank)
1405             {
1406                 if (ma->nat[n] > nalloc)
1407                 {
1408                     nalloc = over_alloc_dd(ma->nat[n]);
1409                     srenew(buf,nalloc);
1410                 }
1411 #ifdef GMX_MPI
1412                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1413                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1414 #endif
1415                 a = 0;
1416                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1417                 {
1418                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1419                     {
1420                         copy_rvec(buf[a++],v[c]);
1421                     }
1422                 }
1423             }
1424         }
1425         sfree(buf);
1426     }
1427 }
1428
1429 static void get_commbuffer_counts(gmx_domdec_t *dd,
1430                                   int **counts,int **disps)
1431 {
1432     gmx_domdec_master_t *ma;
1433     int n;
1434
1435     ma = dd->ma;
1436
1437     /* Make the rvec count and displacment arrays */
1438     *counts  = ma->ibuf;
1439     *disps   = ma->ibuf + dd->nnodes;
1440     for(n=0; n<dd->nnodes; n++)
1441     {
1442         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1443         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1444     }
1445 }
1446
1447 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1448                                    rvec *lv,rvec *v)
1449 {
1450     gmx_domdec_master_t *ma;
1451     int  *rcounts=NULL,*disps=NULL;
1452     int  n,i,c,a;
1453     rvec *buf=NULL;
1454     t_block *cgs_gl;
1455
1456     ma = dd->ma;
1457
1458     if (DDMASTER(dd))
1459     {
1460         get_commbuffer_counts(dd,&rcounts,&disps);
1461
1462         buf = ma->vbuf;
1463     }
1464
1465     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1466
1467     if (DDMASTER(dd))
1468     {
1469         cgs_gl = &dd->comm->cgs_gl;
1470
1471         a = 0;
1472         for(n=0; n<dd->nnodes; n++)
1473         {
1474             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1475             {
1476                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1477                 {
1478                     copy_rvec(buf[a++],v[c]);
1479                 }
1480             }
1481         }
1482     }
1483 }
1484
1485 void dd_collect_vec(gmx_domdec_t *dd,
1486                     t_state *state_local,rvec *lv,rvec *v)
1487 {
1488     gmx_domdec_master_t *ma;
1489     int  n,i,c,a,nalloc=0;
1490     rvec *buf=NULL;
1491
1492     dd_collect_cg(dd,state_local);
1493
1494     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1495     {
1496         dd_collect_vec_sendrecv(dd,lv,v);
1497     }
1498     else
1499     {
1500         dd_collect_vec_gatherv(dd,lv,v);
1501     }
1502 }
1503
1504
1505 void dd_collect_state(gmx_domdec_t *dd,
1506                       t_state *state_local,t_state *state)
1507 {
1508     int est,i,j,nh;
1509
1510     nh = state->nhchainlength;
1511
1512     if (DDMASTER(dd))
1513     {
1514         for (i=0;i<efptNR;i++) {
1515             state->lambda[i] = state_local->lambda[i];
1516         }
1517         state->fep_state = state_local->fep_state;
1518         state->veta = state_local->veta;
1519         state->vol0 = state_local->vol0;
1520         copy_mat(state_local->box,state->box);
1521         copy_mat(state_local->boxv,state->boxv);
1522         copy_mat(state_local->svir_prev,state->svir_prev);
1523         copy_mat(state_local->fvir_prev,state->fvir_prev);
1524         copy_mat(state_local->pres_prev,state->pres_prev);
1525
1526
1527         for(i=0; i<state_local->ngtc; i++)
1528         {
1529             for(j=0; j<nh; j++) {
1530                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1531                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1532             }
1533             state->therm_integral[i] = state_local->therm_integral[i];
1534         }
1535         for(i=0; i<state_local->nnhpres; i++)
1536         {
1537             for(j=0; j<nh; j++) {
1538                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1539                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1540             }
1541         }
1542     }
1543     for(est=0; est<estNR; est++)
1544     {
1545         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1546         {
1547             switch (est) {
1548             case estX:
1549                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1550                 break;
1551             case estV:
1552                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1553                 break;
1554             case estSDX:
1555                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1556                 break;
1557             case estCGP:
1558                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1559                 break;
1560             case estLD_RNG:
1561                 if (state->nrngi == 1)
1562                 {
1563                     if (DDMASTER(dd))
1564                     {
1565                         for(i=0; i<state_local->nrng; i++)
1566                         {
1567                             state->ld_rng[i] = state_local->ld_rng[i];
1568                         }
1569                     }
1570                 }
1571                 else
1572                 {
1573                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1574                               state_local->ld_rng,state->ld_rng);
1575                 }
1576                 break;
1577             case estLD_RNGI:
1578                 if (state->nrngi == 1)
1579                 {
1580                    if (DDMASTER(dd))
1581                     {
1582                         state->ld_rngi[0] = state_local->ld_rngi[0];
1583                     }
1584                 }
1585                 else
1586                 {
1587                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1588                               state_local->ld_rngi,state->ld_rngi);
1589                 }
1590                 break;
1591             case estDISRE_INITF:
1592             case estDISRE_RM3TAV:
1593             case estORIRE_INITF:
1594             case estORIRE_DTAV:
1595                 break;
1596             default:
1597                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1598             }
1599         }
1600     }
1601 }
1602
1603 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1604 {
1605     int est;
1606
1607     if (debug)
1608     {
1609         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1610     }
1611
1612     state->nalloc = over_alloc_dd(nalloc);
1613
1614     for(est=0; est<estNR; est++)
1615     {
1616         if (EST_DISTR(est) && (state->flags & (1<<est)))
1617         {
1618             switch(est) {
1619             case estX:
1620                 srenew(state->x,state->nalloc);
1621                 break;
1622             case estV:
1623                 srenew(state->v,state->nalloc);
1624                 break;
1625             case estSDX:
1626                 srenew(state->sd_X,state->nalloc);
1627                 break;
1628             case estCGP:
1629                 srenew(state->cg_p,state->nalloc);
1630                 break;
1631             case estLD_RNG:
1632             case estLD_RNGI:
1633             case estDISRE_INITF:
1634             case estDISRE_RM3TAV:
1635             case estORIRE_INITF:
1636             case estORIRE_DTAV:
1637                 /* No reallocation required */
1638                 break;
1639             default:
1640                 gmx_incons("Unknown state entry encountered in dd_realloc_state");
1641             }
1642         }
1643     }
1644
1645     if (f != NULL)
1646     {
1647         srenew(*f,state->nalloc);
1648     }
1649 }
1650
1651 static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
1652                                int nalloc)
1653 {
1654     if (nalloc > fr->cg_nalloc)
1655     {
1656         if (debug)
1657         {
1658             fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1659         }
1660         fr->cg_nalloc = over_alloc_dd(nalloc);
1661         srenew(fr->cginfo,fr->cg_nalloc);
1662         if (fr->cutoff_scheme == ecutsGROUP)
1663         {
1664             srenew(fr->cg_cm,fr->cg_nalloc);
1665         }
1666     }
1667     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1668     {
1669         /* We don't use charge groups, we use x in state to set up
1670          * the atom communication.
1671          */
1672         dd_realloc_state(state,f,nalloc);
1673     }
1674 }
1675
1676 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1677                                        rvec *v,rvec *lv)
1678 {
1679     gmx_domdec_master_t *ma;
1680     int  n,i,c,a,nalloc=0;
1681     rvec *buf=NULL;
1682
1683     if (DDMASTER(dd))
1684     {
1685         ma  = dd->ma;
1686
1687         for(n=0; n<dd->nnodes; n++)
1688         {
1689             if (n != dd->rank)
1690             {
1691                 if (ma->nat[n] > nalloc)
1692                 {
1693                     nalloc = over_alloc_dd(ma->nat[n]);
1694                     srenew(buf,nalloc);
1695                 }
1696                 /* Use lv as a temporary buffer */
1697                 a = 0;
1698                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1699                 {
1700                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1701                     {
1702                         copy_rvec(v[c],buf[a++]);
1703                     }
1704                 }
1705                 if (a != ma->nat[n])
1706                 {
1707                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1708                               a,ma->nat[n]);
1709                 }
1710
1711 #ifdef GMX_MPI
1712                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1713                          DDRANK(dd,n),n,dd->mpi_comm_all);
1714 #endif
1715             }
1716         }
1717         sfree(buf);
1718         n = DDMASTERRANK(dd);
1719         a = 0;
1720         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1721         {
1722             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1723             {
1724                 copy_rvec(v[c],lv[a++]);
1725             }
1726         }
1727     }
1728     else
1729     {
1730 #ifdef GMX_MPI
1731         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1732                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1733 #endif
1734     }
1735 }
1736
1737 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1738                                        rvec *v,rvec *lv)
1739 {
1740     gmx_domdec_master_t *ma;
1741     int  *scounts=NULL,*disps=NULL;
1742     int  n,i,c,a,nalloc=0;
1743     rvec *buf=NULL;
1744
1745     if (DDMASTER(dd))
1746     {
1747         ma  = dd->ma;
1748
1749         get_commbuffer_counts(dd,&scounts,&disps);
1750
1751         buf = ma->vbuf;
1752         a = 0;
1753         for(n=0; n<dd->nnodes; n++)
1754         {
1755             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1756             {
1757                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1758                 {
1759                     copy_rvec(v[c],buf[a++]);
1760                 }
1761             }
1762         }
1763     }
1764
1765     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1766 }
1767
1768 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1769 {
1770     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1771     {
1772         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1773     }
1774     else
1775     {
1776         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1777     }
1778 }
1779
1780 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1781                                 t_state *state,t_state *state_local,
1782                                 rvec **f)
1783 {
1784     int  i,j,nh;
1785
1786     nh = state->nhchainlength;
1787
1788     if (DDMASTER(dd))
1789     {
1790         for(i=0;i<efptNR;i++)
1791         {
1792             state_local->lambda[i] = state->lambda[i];
1793         }
1794         state_local->fep_state = state->fep_state;
1795         state_local->veta   = state->veta;
1796         state_local->vol0   = state->vol0;
1797         copy_mat(state->box,state_local->box);
1798         copy_mat(state->box_rel,state_local->box_rel);
1799         copy_mat(state->boxv,state_local->boxv);
1800         copy_mat(state->svir_prev,state_local->svir_prev);
1801         copy_mat(state->fvir_prev,state_local->fvir_prev);
1802         for(i=0; i<state_local->ngtc; i++)
1803         {
1804             for(j=0; j<nh; j++) {
1805                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1806                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1807             }
1808             state_local->therm_integral[i] = state->therm_integral[i];
1809         }
1810         for(i=0; i<state_local->nnhpres; i++)
1811         {
1812             for(j=0; j<nh; j++) {
1813                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1814                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1815             }
1816         }
1817     }
1818     dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
1819     dd_bcast(dd,sizeof(int),&state_local->fep_state);
1820     dd_bcast(dd,sizeof(real),&state_local->veta);
1821     dd_bcast(dd,sizeof(real),&state_local->vol0);
1822     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1823     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1824     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1825     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1826     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1827     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1828     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1829     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1830     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1831     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1832
1833     if (dd->nat_home > state_local->nalloc)
1834     {
1835         dd_realloc_state(state_local,f,dd->nat_home);
1836     }
1837     for(i=0; i<estNR; i++)
1838     {
1839         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1840         {
1841             switch (i) {
1842             case estX:
1843                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1844                 break;
1845             case estV:
1846                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1847                 break;
1848             case estSDX:
1849                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1850                 break;
1851             case estCGP:
1852                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1853                 break;
1854             case estLD_RNG:
1855                 if (state->nrngi == 1)
1856                 {
1857                     dd_bcastc(dd,
1858                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1859                               state->ld_rng,state_local->ld_rng);
1860                 }
1861                 else
1862                 {
1863                     dd_scatter(dd,
1864                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1865                                state->ld_rng,state_local->ld_rng);
1866                 }
1867                 break;
1868             case estLD_RNGI:
1869                 if (state->nrngi == 1)
1870                 {
1871                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1872                               state->ld_rngi,state_local->ld_rngi);
1873                 }
1874                 else
1875                 {
1876                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1877                                state->ld_rngi,state_local->ld_rngi);
1878                 }
1879                 break;
1880             case estDISRE_INITF:
1881             case estDISRE_RM3TAV:
1882             case estORIRE_INITF:
1883             case estORIRE_DTAV:
1884                 /* Not implemented yet */
1885                 break;
1886             default:
1887                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1888             }
1889         }
1890     }
1891 }
1892
1893 static char dim2char(int dim)
1894 {
1895     char c='?';
1896
1897     switch (dim)
1898     {
1899     case XX: c = 'X'; break;
1900     case YY: c = 'Y'; break;
1901     case ZZ: c = 'Z'; break;
1902     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1903     }
1904
1905     return c;
1906 }
1907
1908 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1909                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1910 {
1911     rvec grid_s[2],*grid_r=NULL,cx,r;
1912     char fname[STRLEN],format[STRLEN],buf[22];
1913     FILE *out;
1914     int  a,i,d,z,y,x;
1915     matrix tric;
1916     real vol;
1917
1918     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1919     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1920
1921     if (DDMASTER(dd))
1922     {
1923         snew(grid_r,2*dd->nnodes);
1924     }
1925
1926     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1927
1928     if (DDMASTER(dd))
1929     {
1930         for(d=0; d<DIM; d++)
1931         {
1932             for(i=0; i<DIM; i++)
1933             {
1934                 if (d == i)
1935                 {
1936                     tric[d][i] = 1;
1937                 }
1938                 else
1939                 {
1940                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1941                     {
1942                         tric[d][i] = box[i][d]/box[i][i];
1943                     }
1944                     else
1945                     {
1946                         tric[d][i] = 0;
1947                     }
1948                 }
1949             }
1950         }
1951         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1952         sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
1953         out = gmx_fio_fopen(fname,"w");
1954         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1955         a = 1;
1956         for(i=0; i<dd->nnodes; i++)
1957         {
1958             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1959             for(d=0; d<DIM; d++)
1960             {
1961                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1962             }
1963             for(z=0; z<2; z++)
1964             {
1965                 for(y=0; y<2; y++)
1966                 {
1967                     for(x=0; x<2; x++)
1968                     {
1969                         cx[XX] = grid_r[i*2+x][XX];
1970                         cx[YY] = grid_r[i*2+y][YY];
1971                         cx[ZZ] = grid_r[i*2+z][ZZ];
1972                         mvmul(tric,cx,r);
1973                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1974                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1975                     }
1976                 }
1977             }
1978             for(d=0; d<DIM; d++)
1979             {
1980                 for(x=0; x<4; x++)
1981                 {
1982                     switch(d)
1983                     {
1984                     case 0: y = 1 + i*8 + 2*x; break;
1985                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1986                     case 2: y = 1 + i*8 + x; break;
1987                     }
1988                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1989                 }
1990             }
1991         }
1992         gmx_fio_fclose(out);
1993         sfree(grid_r);
1994     }
1995 }
1996
1997 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1998                   gmx_mtop_t *mtop,t_commrec *cr,
1999                   int natoms,rvec x[],matrix box)
2000 {
2001     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
2002     FILE *out;
2003     int  i,ii,resnr,c;
2004     char *atomname,*resname;
2005     real b;
2006     gmx_domdec_t *dd;
2007
2008     dd = cr->dd;
2009     if (natoms == -1)
2010     {
2011         natoms = dd->comm->nat[ddnatVSITE];
2012     }
2013
2014     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
2015
2016     sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
2017     sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
2018
2019     out = gmx_fio_fopen(fname,"w");
2020
2021     fprintf(out,"TITLE     %s\n",title);
2022     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
2023     for(i=0; i<natoms; i++)
2024     {
2025         ii = dd->gatindex[i];
2026         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
2027         if (i < dd->comm->nat[ddnatZONE])
2028         {
2029             c = 0;
2030             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2031             {
2032                 c++;
2033             }
2034             b = c;
2035         }
2036         else if (i < dd->comm->nat[ddnatVSITE])
2037         {
2038             b = dd->comm->zones.n;
2039         }
2040         else
2041         {
2042             b = dd->comm->zones.n + 1;
2043         }
2044         fprintf(out,strlen(atomname)<4 ? format : format4,
2045                 "ATOM",(ii+1)%100000,
2046                 atomname,resname,' ',resnr%10000,' ',
2047                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
2048     }
2049     fprintf(out,"TER\n");
2050
2051     gmx_fio_fclose(out);
2052 }
2053
2054 real dd_cutoff_mbody(gmx_domdec_t *dd)
2055 {
2056     gmx_domdec_comm_t *comm;
2057     int  di;
2058     real r;
2059
2060     comm = dd->comm;
2061
2062     r = -1;
2063     if (comm->bInterCGBondeds)
2064     {
2065         if (comm->cutoff_mbody > 0)
2066         {
2067             r = comm->cutoff_mbody;
2068         }
2069         else
2070         {
2071             /* cutoff_mbody=0 means we do not have DLB */
2072             r = comm->cellsize_min[dd->dim[0]];
2073             for(di=1; di<dd->ndim; di++)
2074             {
2075                 r = min(r,comm->cellsize_min[dd->dim[di]]);
2076             }
2077             if (comm->bBondComm)
2078             {
2079                 r = max(r,comm->cutoff_mbody);
2080             }
2081             else
2082             {
2083                 r = min(r,comm->cutoff);
2084             }
2085         }
2086     }
2087
2088     return r;
2089 }
2090
2091 real dd_cutoff_twobody(gmx_domdec_t *dd)
2092 {
2093     real r_mb;
2094
2095     r_mb = dd_cutoff_mbody(dd);
2096
2097     return max(dd->comm->cutoff,r_mb);
2098 }
2099
2100
2101 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2102 {
2103     int nc,ntot;
2104
2105     nc   = dd->nc[dd->comm->cartpmedim];
2106     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2107     copy_ivec(coord,coord_pme);
2108     coord_pme[dd->comm->cartpmedim] =
2109         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2110 }
2111
2112 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2113 {
2114     /* Here we assign a PME node to communicate with this DD node
2115      * by assuming that the major index of both is x.
2116      * We add cr->npmenodes/2 to obtain an even distribution.
2117      */
2118     return (ddindex*npme + npme/2)/ndd;
2119 }
2120
2121 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2122 {
2123     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2124 }
2125
2126 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2127 {
2128     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2129 }
2130
2131 static int *dd_pmenodes(t_commrec *cr)
2132 {
2133     int *pmenodes;
2134     int n,i,p0,p1;
2135
2136     snew(pmenodes,cr->npmenodes);
2137     n = 0;
2138     for(i=0; i<cr->dd->nnodes; i++) {
2139         p0 = cr_ddindex2pmeindex(cr,i);
2140         p1 = cr_ddindex2pmeindex(cr,i+1);
2141         if (i+1 == cr->dd->nnodes || p1 > p0) {
2142             if (debug)
2143                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2144             pmenodes[n] = i + 1 + n;
2145             n++;
2146         }
2147     }
2148
2149     return pmenodes;
2150 }
2151
2152 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2153 {
2154     gmx_domdec_t *dd;
2155     ivec coords,coords_pme,nc;
2156     int  slab;
2157
2158     dd = cr->dd;
2159     /*
2160       if (dd->comm->bCartesian) {
2161       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2162       dd_coords2pmecoords(dd,coords,coords_pme);
2163       copy_ivec(dd->ntot,nc);
2164       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2165       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2166
2167       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2168       } else {
2169       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2170       }
2171     */
2172     coords[XX] = x;
2173     coords[YY] = y;
2174     coords[ZZ] = z;
2175     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2176
2177     return slab;
2178 }
2179
2180 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2181 {
2182     gmx_domdec_comm_t *comm;
2183     ivec coords;
2184     int  ddindex,nodeid=-1;
2185
2186     comm = cr->dd->comm;
2187
2188     coords[XX] = x;
2189     coords[YY] = y;
2190     coords[ZZ] = z;
2191     if (comm->bCartesianPP_PME)
2192     {
2193 #ifdef GMX_MPI
2194         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2195 #endif
2196     }
2197     else
2198     {
2199         ddindex = dd_index(cr->dd->nc,coords);
2200         if (comm->bCartesianPP)
2201         {
2202             nodeid = comm->ddindex2simnodeid[ddindex];
2203         }
2204         else
2205         {
2206             if (comm->pmenodes)
2207             {
2208                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2209             }
2210             else
2211             {
2212                 nodeid = ddindex;
2213             }
2214         }
2215     }
2216
2217     return nodeid;
2218 }
2219
2220 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2221 {
2222     gmx_domdec_t *dd;
2223     gmx_domdec_comm_t *comm;
2224     ivec coord,coord_pme;
2225     int  i;
2226     int  pmenode=-1;
2227
2228     dd = cr->dd;
2229     comm = dd->comm;
2230
2231     /* This assumes a uniform x domain decomposition grid cell size */
2232     if (comm->bCartesianPP_PME)
2233     {
2234 #ifdef GMX_MPI
2235         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2236         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2237         {
2238             /* This is a PP node */
2239             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2240             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2241         }
2242 #endif
2243     }
2244     else if (comm->bCartesianPP)
2245     {
2246         if (sim_nodeid < dd->nnodes)
2247         {
2248             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2249         }
2250     }
2251     else
2252     {
2253         /* This assumes DD cells with identical x coordinates
2254          * are numbered sequentially.
2255          */
2256         if (dd->comm->pmenodes == NULL)
2257         {
2258             if (sim_nodeid < dd->nnodes)
2259             {
2260                 /* The DD index equals the nodeid */
2261                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2262             }
2263         }
2264         else
2265         {
2266             i = 0;
2267             while (sim_nodeid > dd->comm->pmenodes[i])
2268             {
2269                 i++;
2270             }
2271             if (sim_nodeid < dd->comm->pmenodes[i])
2272             {
2273                 pmenode = dd->comm->pmenodes[i];
2274             }
2275         }
2276     }
2277
2278     return pmenode;
2279 }
2280
2281 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2282 {
2283     gmx_bool bPMEOnlyNode;
2284
2285     if (DOMAINDECOMP(cr))
2286     {
2287         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2288     }
2289     else
2290     {
2291         bPMEOnlyNode = FALSE;
2292     }
2293
2294     return bPMEOnlyNode;
2295 }
2296
2297 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2298                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2299 {
2300     gmx_domdec_t *dd;
2301     int x,y,z;
2302     ivec coord,coord_pme;
2303
2304     dd = cr->dd;
2305
2306     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2307
2308     *nmy_ddnodes = 0;
2309     for(x=0; x<dd->nc[XX]; x++)
2310     {
2311         for(y=0; y<dd->nc[YY]; y++)
2312         {
2313             for(z=0; z<dd->nc[ZZ]; z++)
2314             {
2315                 if (dd->comm->bCartesianPP_PME)
2316                 {
2317                     coord[XX] = x;
2318                     coord[YY] = y;
2319                     coord[ZZ] = z;
2320                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2321                     if (dd->ci[XX] == coord_pme[XX] &&
2322                         dd->ci[YY] == coord_pme[YY] &&
2323                         dd->ci[ZZ] == coord_pme[ZZ])
2324                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2325                 }
2326                 else
2327                 {
2328                     /* The slab corresponds to the nodeid in the PME group */
2329                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2330                     {
2331                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2332                     }
2333                 }
2334             }
2335         }
2336     }
2337
2338     /* The last PP-only node is the peer node */
2339     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2340
2341     if (debug)
2342     {
2343         fprintf(debug,"Receive coordinates from PP nodes:");
2344         for(x=0; x<*nmy_ddnodes; x++)
2345         {
2346             fprintf(debug," %d",(*my_ddnodes)[x]);
2347         }
2348         fprintf(debug,"\n");
2349     }
2350 }
2351
2352 static gmx_bool receive_vir_ener(t_commrec *cr)
2353 {
2354     gmx_domdec_comm_t *comm;
2355     int  pmenode,coords[DIM],rank;
2356     gmx_bool bReceive;
2357
2358     bReceive = TRUE;
2359     if (cr->npmenodes < cr->dd->nnodes)
2360     {
2361         comm = cr->dd->comm;
2362         if (comm->bCartesianPP_PME)
2363         {
2364             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2365 #ifdef GMX_MPI
2366             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2367             coords[comm->cartpmedim]++;
2368             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2369             {
2370                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2371                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2372                 {
2373                     /* This is not the last PP node for pmenode */
2374                     bReceive = FALSE;
2375                 }
2376             }
2377 #endif
2378         }
2379         else
2380         {
2381             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2382             if (cr->sim_nodeid+1 < cr->nnodes &&
2383                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2384             {
2385                 /* This is not the last PP node for pmenode */
2386                 bReceive = FALSE;
2387             }
2388         }
2389     }
2390
2391     return bReceive;
2392 }
2393
2394 static void set_zones_ncg_home(gmx_domdec_t *dd)
2395 {
2396     gmx_domdec_zones_t *zones;
2397     int i;
2398
2399     zones = &dd->comm->zones;
2400
2401     zones->cg_range[0] = 0;
2402     for(i=1; i<zones->n+1; i++)
2403     {
2404         zones->cg_range[i] = dd->ncg_home;
2405     }
2406 }
2407
2408 static void rebuild_cgindex(gmx_domdec_t *dd,
2409                             const int *gcgs_index,t_state *state)
2410 {
2411     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2412
2413     ind = state->cg_gl;
2414     dd_cg_gl = dd->index_gl;
2415     cgindex  = dd->cgindex;
2416     nat = 0;
2417     cgindex[0] = nat;
2418     for(i=0; i<state->ncg_gl; i++)
2419     {
2420         cgindex[i] = nat;
2421         cg_gl = ind[i];
2422         dd_cg_gl[i] = cg_gl;
2423         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2424     }
2425     cgindex[i] = nat;
2426
2427     dd->ncg_home = state->ncg_gl;
2428     dd->nat_home = nat;
2429
2430     set_zones_ncg_home(dd);
2431 }
2432
2433 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2434 {
2435     while (cg >= cginfo_mb->cg_end)
2436     {
2437         cginfo_mb++;
2438     }
2439
2440     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2441 }
2442
2443 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2444                           t_forcerec *fr,char *bLocalCG)
2445 {
2446     cginfo_mb_t *cginfo_mb;
2447     int *cginfo;
2448     int cg;
2449
2450     if (fr != NULL)
2451     {
2452         cginfo_mb = fr->cginfo_mb;
2453         cginfo    = fr->cginfo;
2454
2455         for(cg=cg0; cg<cg1; cg++)
2456         {
2457             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2458         }
2459     }
2460
2461     if (bLocalCG != NULL)
2462     {
2463         for(cg=cg0; cg<cg1; cg++)
2464         {
2465             bLocalCG[index_gl[cg]] = TRUE;
2466         }
2467     }
2468 }
2469
2470 static void make_dd_indices(gmx_domdec_t *dd,
2471                             const int *gcgs_index,int cg_start)
2472 {
2473     int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
2474     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2475     gmx_ga2la_t *ga2la;
2476     char *bLocalCG;
2477     gmx_bool bCGs;
2478
2479     bLocalCG = dd->comm->bLocalCG;
2480
2481     if (dd->nat_tot > dd->gatindex_nalloc)
2482     {
2483         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2484         srenew(dd->gatindex,dd->gatindex_nalloc);
2485     }
2486
2487     nzone      = dd->comm->zones.n;
2488     zone2cg    = dd->comm->zones.cg_range;
2489     zone_ncg1  = dd->comm->zone_ncg1;
2490     index_gl   = dd->index_gl;
2491     gatindex   = dd->gatindex;
2492     bCGs       = dd->comm->bCGs;
2493
2494     if (zone2cg[1] != dd->ncg_home)
2495     {
2496         gmx_incons("dd->ncg_zone is not up to date");
2497     }
2498
2499     /* Make the local to global and global to local atom index */
2500     a = dd->cgindex[cg_start];
2501     for(zone=0; zone<nzone; zone++)
2502     {
2503         if (zone == 0)
2504         {
2505             cg0 = cg_start;
2506         }
2507         else
2508         {
2509             cg0 = zone2cg[zone];
2510         }
2511         cg1    = zone2cg[zone+1];
2512         cg1_p1 = cg0 + zone_ncg1[zone];
2513
2514         for(cg=cg0; cg<cg1; cg++)
2515         {
2516             zone1 = zone;
2517             if (cg >= cg1_p1)
2518             {
2519                 /* Signal that this cg is from more than one pulse away */
2520                 zone1 += nzone;
2521             }
2522             cg_gl = index_gl[cg];
2523             if (bCGs)
2524             {
2525                 for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2526                 {
2527                     gatindex[a] = a_gl;
2528                     ga2la_set(dd->ga2la,a_gl,a,zone1);
2529                     a++;
2530                 }
2531             }
2532             else
2533             {
2534                 gatindex[a] = cg_gl;
2535                 ga2la_set(dd->ga2la,cg_gl,a,zone1);
2536                 a++;
2537             }
2538         }
2539     }
2540 }
2541
2542 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2543                           const char *where)
2544 {
2545     int ncg,i,ngl,nerr;
2546
2547     nerr = 0;
2548     if (bLocalCG == NULL)
2549     {
2550         return nerr;
2551     }
2552     for(i=0; i<dd->ncg_tot; i++)
2553     {
2554         if (!bLocalCG[dd->index_gl[i]])
2555         {
2556             fprintf(stderr,
2557                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2558             nerr++;
2559         }
2560     }
2561     ngl = 0;
2562     for(i=0; i<ncg_sys; i++)
2563     {
2564         if (bLocalCG[i])
2565         {
2566             ngl++;
2567         }
2568     }
2569     if (ngl != dd->ncg_tot)
2570     {
2571         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2572         nerr++;
2573     }
2574
2575     return nerr;
2576 }
2577
2578 static void check_index_consistency(gmx_domdec_t *dd,
2579                                     int natoms_sys,int ncg_sys,
2580                                     const char *where)
2581 {
2582     int  nerr,ngl,i,a,cell;
2583     int  *have;
2584
2585     nerr = 0;
2586
2587     if (dd->comm->DD_debug > 1)
2588     {
2589         snew(have,natoms_sys);
2590         for(a=0; a<dd->nat_tot; a++)
2591         {
2592             if (have[dd->gatindex[a]] > 0)
2593             {
2594                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2595             }
2596             else
2597             {
2598                 have[dd->gatindex[a]] = a + 1;
2599             }
2600         }
2601         sfree(have);
2602     }
2603
2604     snew(have,dd->nat_tot);
2605
2606     ngl  = 0;
2607     for(i=0; i<natoms_sys; i++)
2608     {
2609         if (ga2la_get(dd->ga2la,i,&a,&cell))
2610         {
2611             if (a >= dd->nat_tot)
2612             {
2613                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2614                 nerr++;
2615             }
2616             else
2617             {
2618                 have[a] = 1;
2619                 if (dd->gatindex[a] != i)
2620                 {
2621                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2622                     nerr++;
2623                 }
2624             }
2625             ngl++;
2626         }
2627     }
2628     if (ngl != dd->nat_tot)
2629     {
2630         fprintf(stderr,
2631                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2632                 dd->rank,where,ngl,dd->nat_tot);
2633     }
2634     for(a=0; a<dd->nat_tot; a++)
2635     {
2636         if (have[a] == 0)
2637         {
2638             fprintf(stderr,
2639                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2640                     dd->rank,where,a+1,dd->gatindex[a]+1);
2641         }
2642     }
2643     sfree(have);
2644
2645     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2646
2647     if (nerr > 0) {
2648         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2649                   dd->rank,where,nerr);
2650     }
2651 }
2652
2653 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2654 {
2655     int  i;
2656     char *bLocalCG;
2657
2658     if (a_start == 0)
2659     {
2660         /* Clear the whole list without searching */
2661         ga2la_clear(dd->ga2la);
2662     }
2663     else
2664     {
2665         for(i=a_start; i<dd->nat_tot; i++)
2666         {
2667             ga2la_del(dd->ga2la,dd->gatindex[i]);
2668         }
2669     }
2670
2671     bLocalCG = dd->comm->bLocalCG;
2672     if (bLocalCG)
2673     {
2674         for(i=cg_start; i<dd->ncg_tot; i++)
2675         {
2676             bLocalCG[dd->index_gl[i]] = FALSE;
2677         }
2678     }
2679
2680     dd_clear_local_vsite_indices(dd);
2681
2682     if (dd->constraints)
2683     {
2684         dd_clear_local_constraint_indices(dd);
2685     }
2686 }
2687
2688 /* This function should be used for moving the domain boudaries during DLB,
2689  * for obtaining the minimum cell size. It checks the initially set limit
2690  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2691  * and, possibly, a longer cut-off limit set for PME load balancing.
2692  */
2693 static real cellsize_min_dlb(gmx_domdec_comm_t *comm,int dim_ind,int dim)
2694 {
2695     real cellsize_min;
2696
2697     cellsize_min = comm->cellsize_min[dim];
2698
2699     if (!comm->bVacDLBNoLimit && comm->bPMELoadBalDLBLimits)
2700     {
2701         cellsize_min = max(cellsize_min,
2702                            comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2703     }
2704
2705     return cellsize_min;
2706 }
2707
2708 static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
2709                             int dim_ind)
2710 {
2711     real grid_jump_limit;
2712
2713     /* The distance between the boundaries of cells at distance
2714      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2715      * and by the fact that cells should not be shifted by more than
2716      * half their size, such that cg's only shift by one cell
2717      * at redecomposition.
2718      */
2719     grid_jump_limit = comm->cellsize_limit;
2720     if (!comm->bVacDLBNoLimit)
2721     {
2722         if (comm->bPMELoadBalDLBLimits)
2723         {
2724             cutoff = max(cutoff,comm->PMELoadBal_max_cutoff);
2725         }
2726         grid_jump_limit = max(grid_jump_limit,
2727                               cutoff/comm->cd[dim_ind].np);
2728     }
2729
2730     return grid_jump_limit;
2731 }
2732
2733 static gmx_bool check_grid_jump(gmx_large_int_t step,
2734                                 gmx_domdec_t *dd,
2735                                 real cutoff,
2736                                 gmx_ddbox_t *ddbox,
2737                                 gmx_bool bFatal)
2738 {
2739     gmx_domdec_comm_t *comm;
2740     int  d,dim;
2741     real limit,bfac;
2742     gmx_bool bInvalid;
2743
2744     bInvalid = FALSE;
2745
2746     comm = dd->comm;
2747
2748     for(d=1; d<dd->ndim; d++)
2749     {
2750         dim = dd->dim[d];
2751         limit = grid_jump_limit(comm,cutoff,d);
2752         bfac = ddbox->box_size[dim];
2753         if (ddbox->tric_dir[dim])
2754         {
2755             bfac *= ddbox->skew_fac[dim];
2756         }
2757         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2758             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2759         {
2760             bInvalid = TRUE;
2761
2762             if (bFatal)
2763             {
2764                 char buf[22];
2765
2766                 /* This error should never be triggered under normal
2767                  * circumstances, but you never know ...
2768                  */
2769                 gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2770                           gmx_step_str(step,buf),
2771                           dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2772             }
2773         }
2774     }
2775
2776     return bInvalid;
2777 }
2778
2779 static int dd_load_count(gmx_domdec_comm_t *comm)
2780 {
2781     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2782 }
2783
2784 static float dd_force_load(gmx_domdec_comm_t *comm)
2785 {
2786     float load;
2787
2788     if (comm->eFlop)
2789     {
2790         load = comm->flop;
2791         if (comm->eFlop > 1)
2792         {
2793             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2794         }
2795     }
2796     else
2797     {
2798         load = comm->cycl[ddCyclF];
2799         if (comm->cycl_n[ddCyclF] > 1)
2800         {
2801             /* Subtract the maximum of the last n cycle counts
2802              * to get rid of possible high counts due to other soures,
2803              * for instance system activity, that would otherwise
2804              * affect the dynamic load balancing.
2805              */
2806             load -= comm->cycl_max[ddCyclF];
2807         }
2808     }
2809
2810     return load;
2811 }
2812
2813 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2814 {
2815     gmx_domdec_comm_t *comm;
2816     int i;
2817
2818     comm = dd->comm;
2819
2820     snew(*dim_f,dd->nc[dim]+1);
2821     (*dim_f)[0] = 0;
2822     for(i=1; i<dd->nc[dim]; i++)
2823     {
2824         if (comm->slb_frac[dim])
2825         {
2826             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2827         }
2828         else
2829         {
2830             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2831         }
2832     }
2833     (*dim_f)[dd->nc[dim]] = 1;
2834 }
2835
2836 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2837 {
2838     int  pmeindex,slab,nso,i;
2839     ivec xyz;
2840
2841     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2842     {
2843         ddpme->dim = YY;
2844     }
2845     else
2846     {
2847         ddpme->dim = dimind;
2848     }
2849     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2850
2851     ddpme->nslab = (ddpme->dim == 0 ?
2852                     dd->comm->npmenodes_x :
2853                     dd->comm->npmenodes_y);
2854
2855     if (ddpme->nslab <= 1)
2856     {
2857         return;
2858     }
2859
2860     nso = dd->comm->npmenodes/ddpme->nslab;
2861     /* Determine for each PME slab the PP location range for dimension dim */
2862     snew(ddpme->pp_min,ddpme->nslab);
2863     snew(ddpme->pp_max,ddpme->nslab);
2864     for(slab=0; slab<ddpme->nslab; slab++) {
2865         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2866         ddpme->pp_max[slab] = 0;
2867     }
2868     for(i=0; i<dd->nnodes; i++) {
2869         ddindex2xyz(dd->nc,i,xyz);
2870         /* For y only use our y/z slab.
2871          * This assumes that the PME x grid size matches the DD grid size.
2872          */
2873         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2874             pmeindex = ddindex2pmeindex(dd,i);
2875             if (dimind == 0) {
2876                 slab = pmeindex/nso;
2877             } else {
2878                 slab = pmeindex % ddpme->nslab;
2879             }
2880             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2881             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2882         }
2883     }
2884
2885     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2886 }
2887
2888 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2889 {
2890     if (dd->comm->ddpme[0].dim == XX)
2891     {
2892         return dd->comm->ddpme[0].maxshift;
2893     }
2894     else
2895     {
2896         return 0;
2897     }
2898 }
2899
2900 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2901 {
2902     if (dd->comm->ddpme[0].dim == YY)
2903     {
2904         return dd->comm->ddpme[0].maxshift;
2905     }
2906     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2907     {
2908         return dd->comm->ddpme[1].maxshift;
2909     }
2910     else
2911     {
2912         return 0;
2913     }
2914 }
2915
2916 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2917                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2918 {
2919     gmx_domdec_comm_t *comm;
2920     int  nc,ns,s;
2921     int  *xmin,*xmax;
2922     real range,pme_boundary;
2923     int  sh;
2924
2925     comm = dd->comm;
2926     nc  = dd->nc[ddpme->dim];
2927     ns  = ddpme->nslab;
2928
2929     if (!ddpme->dim_match)
2930     {
2931         /* PP decomposition is not along dim: the worst situation */
2932         sh = ns/2;
2933     }
2934     else if (ns <= 3 || (bUniform && ns == nc))
2935     {
2936         /* The optimal situation */
2937         sh = 1;
2938     }
2939     else
2940     {
2941         /* We need to check for all pme nodes which nodes they
2942          * could possibly need to communicate with.
2943          */
2944         xmin = ddpme->pp_min;
2945         xmax = ddpme->pp_max;
2946         /* Allow for atoms to be maximally 2/3 times the cut-off
2947          * out of their DD cell. This is a reasonable balance between
2948          * between performance and support for most charge-group/cut-off
2949          * combinations.
2950          */
2951         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2952         /* Avoid extra communication when we are exactly at a boundary */
2953         range *= 0.999;
2954
2955         sh = 1;
2956         for(s=0; s<ns; s++)
2957         {
2958             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2959             pme_boundary = (real)s/ns;
2960             while (sh+1 < ns &&
2961                    ((s-(sh+1) >= 0 &&
2962                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2963                     (s-(sh+1) <  0 &&
2964                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2965             {
2966                 sh++;
2967             }
2968             pme_boundary = (real)(s+1)/ns;
2969             while (sh+1 < ns &&
2970                    ((s+(sh+1) <  ns &&
2971                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2972                     (s+(sh+1) >= ns &&
2973                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2974             {
2975                 sh++;
2976             }
2977         }
2978     }
2979
2980     ddpme->maxshift = sh;
2981
2982     if (debug)
2983     {
2984         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2985                 ddpme->dim,ddpme->maxshift);
2986     }
2987 }
2988
2989 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2990 {
2991     int d,dim;
2992
2993     for(d=0; d<dd->ndim; d++)
2994     {
2995         dim = dd->dim[d];
2996         if (dim < ddbox->nboundeddim &&
2997             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2998             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2999         {
3000             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3001                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3002                       dd->nc[dim],dd->comm->cellsize_limit);
3003         }
3004     }
3005 }
3006
3007 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
3008                                   gmx_bool bMaster,ivec npulse)
3009 {
3010     gmx_domdec_comm_t *comm;
3011     int  d,j;
3012     rvec cellsize_min;
3013     real *cell_x,cell_dx,cellsize;
3014
3015     comm = dd->comm;
3016
3017     for(d=0; d<DIM; d++)
3018     {
3019         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3020         npulse[d] = 1;
3021         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3022         {
3023             /* Uniform grid */
3024             cell_dx = ddbox->box_size[d]/dd->nc[d];
3025             if (bMaster)
3026             {
3027                 for(j=0; j<dd->nc[d]+1; j++)
3028                 {
3029                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3030                 }
3031             }
3032             else
3033             {
3034                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3035                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3036             }
3037             cellsize = cell_dx*ddbox->skew_fac[d];
3038             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3039             {
3040                 npulse[d]++;
3041             }
3042             cellsize_min[d] = cellsize;
3043         }
3044         else
3045         {
3046             /* Statically load balanced grid */
3047             /* Also when we are not doing a master distribution we determine
3048              * all cell borders in a loop to obtain identical values
3049              * to the master distribution case and to determine npulse.
3050              */
3051             if (bMaster)
3052             {
3053                 cell_x = dd->ma->cell_x[d];
3054             }
3055             else
3056             {
3057                 snew(cell_x,dd->nc[d]+1);
3058             }
3059             cell_x[0] = ddbox->box0[d];
3060             for(j=0; j<dd->nc[d]; j++)
3061             {
3062                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
3063                 cell_x[j+1] = cell_x[j] + cell_dx;
3064                 cellsize = cell_dx*ddbox->skew_fac[d];
3065                 while (cellsize*npulse[d] < comm->cutoff &&
3066                        npulse[d] < dd->nc[d]-1)
3067                 {
3068                     npulse[d]++;
3069                 }
3070                 cellsize_min[d] = min(cellsize_min[d],cellsize);
3071             }
3072             if (!bMaster)
3073             {
3074                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3075                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3076                 sfree(cell_x);
3077             }
3078         }
3079         /* The following limitation is to avoid that a cell would receive
3080          * some of its own home charge groups back over the periodic boundary.
3081          * Double charge groups cause trouble with the global indices.
3082          */
3083         if (d < ddbox->npbcdim &&
3084             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3085         {
3086             gmx_fatal_collective(FARGS,NULL,dd,
3087                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3088                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
3089                                  comm->cutoff,
3090                                  dd->nc[d],dd->nc[d],
3091                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3092         }
3093     }
3094
3095     if (!comm->bDynLoadBal)
3096     {
3097         copy_rvec(cellsize_min,comm->cellsize_min);
3098     }
3099
3100     for(d=0; d<comm->npmedecompdim; d++)
3101     {
3102         set_pme_maxshift(dd,&comm->ddpme[d],
3103                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
3104                          comm->ddpme[d].slb_dim_f);
3105     }
3106 }
3107
3108
3109 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3110                                        int d,int dim,gmx_domdec_root_t *root,
3111                                        gmx_ddbox_t *ddbox,
3112                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
3113 {
3114     gmx_domdec_comm_t *comm;
3115     int  ncd,i,j,nmin,nmin_old;
3116     gmx_bool bLimLo,bLimHi;
3117     real *cell_size;
3118     real fac,halfway,cellsize_limit_f_i,region_size;
3119     gmx_bool bPBC,bLastHi=FALSE;
3120     int nrange[]={range[0],range[1]};
3121
3122     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];
3123
3124     comm = dd->comm;
3125
3126     ncd = dd->nc[dim];
3127
3128     bPBC = (dim < ddbox->npbcdim);
3129
3130     cell_size = root->buf_ncd;
3131
3132     if (debug)
3133     {
3134         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3135     }
3136
3137     /* First we need to check if the scaling does not make cells
3138      * smaller than the smallest allowed size.
3139      * We need to do this iteratively, since if a cell is too small,
3140      * it needs to be enlarged, which makes all the other cells smaller,
3141      * which could in turn make another cell smaller than allowed.
3142      */
3143     for(i=range[0]; i<range[1]; i++)
3144     {
3145         root->bCellMin[i] = FALSE;
3146     }
3147     nmin = 0;
3148     do
3149     {
3150         nmin_old = nmin;
3151         /* We need the total for normalization */
3152         fac = 0;
3153         for(i=range[0]; i<range[1]; i++)
3154         {
3155             if (root->bCellMin[i] == FALSE)
3156             {
3157                 fac += cell_size[i];
3158             }
3159         }
3160         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3161         /* Determine the cell boundaries */
3162         for(i=range[0]; i<range[1]; i++)
3163         {
3164             if (root->bCellMin[i] == FALSE)
3165             {
3166                 cell_size[i] *= fac;
3167                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3168                 {
3169                     cellsize_limit_f_i = 0;
3170                 }
3171                 else
3172                 {
3173                     cellsize_limit_f_i = cellsize_limit_f;
3174                 }
3175                 if (cell_size[i] < cellsize_limit_f_i)
3176                 {
3177                     root->bCellMin[i] = TRUE;
3178                     cell_size[i] = cellsize_limit_f_i;
3179                     nmin++;
3180                 }
3181             }
3182             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3183         }
3184     }
3185     while (nmin > nmin_old);
3186
3187     i=range[1]-1;
3188     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3189     /* For this check we should not use DD_CELL_MARGIN,
3190      * but a slightly smaller factor,
3191      * since rounding could get use below the limit.
3192      */
3193     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3194     {
3195         char buf[22];
3196         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3197                   gmx_step_str(step,buf),
3198                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3199                   ncd,comm->cellsize_min[dim]);
3200     }
3201
3202     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3203
3204     if (!bUniform)
3205     {
3206         /* Check if the boundary did not displace more than halfway
3207          * each of the cells it bounds, as this could cause problems,
3208          * especially when the differences between cell sizes are large.
3209          * If changes are applied, they will not make cells smaller
3210          * than the cut-off, as we check all the boundaries which
3211          * might be affected by a change and if the old state was ok,
3212          * the cells will at most be shrunk back to their old size.
3213          */
3214         for(i=range[0]+1; i<range[1]; i++)
3215         {
3216             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3217             if (root->cell_f[i] < halfway)
3218             {
3219                 root->cell_f[i] = halfway;
3220                 /* Check if the change also causes shifts of the next boundaries */
3221                 for(j=i+1; j<range[1]; j++)
3222                 {
3223                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3224                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3225                 }
3226             }
3227             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3228             if (root->cell_f[i] > halfway)
3229             {
3230                 root->cell_f[i] = halfway;
3231                 /* Check if the change also causes shifts of the next boundaries */
3232                 for(j=i-1; j>=range[0]+1; j--)
3233                 {
3234                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3235                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3236                 }
3237             }
3238         }
3239     }
3240
3241     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3242     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3243      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3244      * for a and b nrange is used */
3245     if (d > 0)
3246     {
3247         /* Take care of the staggering of the cell boundaries */
3248         if (bUniform)
3249         {
3250             for(i=range[0]; i<range[1]; i++)
3251             {
3252                 root->cell_f_max0[i] = root->cell_f[i];
3253                 root->cell_f_min1[i] = root->cell_f[i+1];
3254             }
3255         }
3256         else
3257         {
3258             for(i=range[0]+1; i<range[1]; i++)
3259             {
3260                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3261                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3262                 if (bLimLo && bLimHi)
3263                 {
3264                     /* Both limits violated, try the best we can */
3265                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3266                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3267                     nrange[0]=range[0];
3268                     nrange[1]=i;
3269                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3270
3271                     nrange[0]=i;
3272                     nrange[1]=range[1];
3273                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3274
3275                     return;
3276                 }
3277                 else if (bLimLo)
3278                 {
3279                     /* root->cell_f[i] = root->bound_min[i]; */
3280                     nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3281                     bLastHi=FALSE;
3282                 }
3283                 else if (bLimHi && !bLastHi)
3284                 {
3285                     bLastHi=TRUE;
3286                     if (nrange[1] < range[1])   /* found a LimLo before */
3287                     {
3288                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3289                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3290                         nrange[0]=nrange[1];
3291                     }
3292                     root->cell_f[i] = root->bound_max[i];
3293                     nrange[1]=i;
3294                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3295                     nrange[0]=i;
3296                     nrange[1]=range[1];
3297                 }
3298             }
3299             if (nrange[1] < range[1])   /* found last a LimLo */
3300             {
3301                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3302                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3303                 nrange[0]=nrange[1];
3304                 nrange[1]=range[1];
3305                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3306             }
3307             else if (nrange[0] > range[0]) /* found at least one LimHi */
3308             {
3309                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3310             }
3311         }
3312     }
3313 }
3314
3315
3316 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3317                                        int d,int dim,gmx_domdec_root_t *root,
3318                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3319                                        gmx_bool bUniform,gmx_large_int_t step)
3320 {
3321     gmx_domdec_comm_t *comm;
3322     int  ncd,d1,i,j,pos;
3323     real *cell_size;
3324     real load_aver,load_i,imbalance,change,change_max,sc;
3325     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3326     real change_limit;
3327     real relax = 0.5;
3328     gmx_bool bPBC;
3329     int range[] = { 0, 0 };
3330
3331     comm = dd->comm;
3332
3333     /* Convert the maximum change from the input percentage to a fraction */
3334     change_limit = comm->dlb_scale_lim*0.01;
3335
3336     ncd = dd->nc[dim];
3337
3338     bPBC = (dim < ddbox->npbcdim);
3339
3340     cell_size = root->buf_ncd;
3341
3342     /* Store the original boundaries */
3343     for(i=0; i<ncd+1; i++)
3344     {
3345         root->old_cell_f[i] = root->cell_f[i];
3346     }
3347     if (bUniform) {
3348         for(i=0; i<ncd; i++)
3349         {
3350             cell_size[i] = 1.0/ncd;
3351         }
3352     }
3353     else if (dd_load_count(comm))
3354     {
3355         load_aver = comm->load[d].sum_m/ncd;
3356         change_max = 0;
3357         for(i=0; i<ncd; i++)
3358         {
3359             /* Determine the relative imbalance of cell i */
3360             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3361             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3362             /* Determine the change of the cell size using underrelaxation */
3363             change = -relax*imbalance;
3364             change_max = max(change_max,max(change,-change));
3365         }
3366         /* Limit the amount of scaling.
3367          * We need to use the same rescaling for all cells in one row,
3368          * otherwise the load balancing might not converge.
3369          */
3370         sc = relax;
3371         if (change_max > change_limit)
3372         {
3373             sc *= change_limit/change_max;
3374         }
3375         for(i=0; i<ncd; i++)
3376         {
3377             /* Determine the relative imbalance of cell i */
3378             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3379             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3380             /* Determine the change of the cell size using underrelaxation */
3381             change = -sc*imbalance;
3382             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3383         }
3384     }
3385
3386     cellsize_limit_f  = cellsize_min_dlb(comm,d,dim)/ddbox->box_size[dim];
3387     cellsize_limit_f *= DD_CELL_MARGIN;
3388     dist_min_f_hard   = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
3389     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3390     if (ddbox->tric_dir[dim])
3391     {
3392         cellsize_limit_f /= ddbox->skew_fac[dim];
3393         dist_min_f       /= ddbox->skew_fac[dim];
3394     }
3395     if (bDynamicBox && d > 0)
3396     {
3397         dist_min_f *= DD_PRES_SCALE_MARGIN;
3398     }
3399     if (d > 0 && !bUniform)
3400     {
3401         /* Make sure that the grid is not shifted too much */
3402         for(i=1; i<ncd; i++) {
3403             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3404             {
3405                 gmx_incons("Inconsistent DD boundary staggering limits!");
3406             }
3407             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3408             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3409             if (space > 0) {
3410                 root->bound_min[i] += 0.5*space;
3411             }
3412             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3413             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3414             if (space < 0) {
3415                 root->bound_max[i] += 0.5*space;
3416             }
3417             if (debug)
3418             {
3419                 fprintf(debug,
3420                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3421                         d,i,
3422                         root->cell_f_max0[i-1] + dist_min_f,
3423                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3424                         root->cell_f_min1[i] - dist_min_f);
3425             }
3426         }
3427     }
3428     range[1]=ncd;
3429     root->cell_f[0] = 0;
3430     root->cell_f[ncd] = 1;
3431     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3432
3433
3434     /* After the checks above, the cells should obey the cut-off
3435      * restrictions, but it does not hurt to check.
3436      */
3437     for(i=0; i<ncd; i++)
3438     {
3439         if (debug)
3440         {
3441             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3442                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3443         }
3444
3445         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3446             root->cell_f[i+1] - root->cell_f[i] <
3447             cellsize_limit_f/DD_CELL_MARGIN)
3448         {
3449             char buf[22];
3450             fprintf(stderr,
3451                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3452                     gmx_step_str(step,buf),dim2char(dim),i,
3453                     (root->cell_f[i+1] - root->cell_f[i])
3454                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3455         }
3456     }
3457
3458     pos = ncd + 1;
3459     /* Store the cell boundaries of the lower dimensions at the end */
3460     for(d1=0; d1<d; d1++)
3461     {
3462         root->cell_f[pos++] = comm->cell_f0[d1];
3463         root->cell_f[pos++] = comm->cell_f1[d1];
3464     }
3465
3466     if (d < comm->npmedecompdim)
3467     {
3468         /* The master determines the maximum shift for
3469          * the coordinate communication between separate PME nodes.
3470          */
3471         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3472     }
3473     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3474     if (d >= 1)
3475     {
3476         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3477     }
3478 }
3479
3480 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3481                                              gmx_ddbox_t *ddbox,int dimind)
3482 {
3483     gmx_domdec_comm_t *comm;
3484     int dim;
3485
3486     comm = dd->comm;
3487
3488     /* Set the cell dimensions */
3489     dim = dd->dim[dimind];
3490     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3491     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3492     if (dim >= ddbox->nboundeddim)
3493     {
3494         comm->cell_x0[dim] += ddbox->box0[dim];
3495         comm->cell_x1[dim] += ddbox->box0[dim];
3496     }
3497 }
3498
3499 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3500                                          int d,int dim,real *cell_f_row,
3501                                          gmx_ddbox_t *ddbox)
3502 {
3503     gmx_domdec_comm_t *comm;
3504     int d1,dim1,pos;
3505
3506     comm = dd->comm;
3507
3508 #ifdef GMX_MPI
3509     /* Each node would only need to know two fractions,
3510      * but it is probably cheaper to broadcast the whole array.
3511      */
3512     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3513               0,comm->mpi_comm_load[d]);
3514 #endif
3515     /* Copy the fractions for this dimension from the buffer */
3516     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3517     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3518     /* The whole array was communicated, so set the buffer position */
3519     pos = dd->nc[dim] + 1;
3520     for(d1=0; d1<=d; d1++)
3521     {
3522         if (d1 < d)
3523         {
3524             /* Copy the cell fractions of the lower dimensions */
3525             comm->cell_f0[d1] = cell_f_row[pos++];
3526             comm->cell_f1[d1] = cell_f_row[pos++];
3527         }
3528         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3529     }
3530     /* Convert the communicated shift from float to int */
3531     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3532     if (d >= 1)
3533     {
3534         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3535     }
3536 }
3537
3538 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3539                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3540                                          gmx_bool bUniform,gmx_large_int_t step)
3541 {
3542     gmx_domdec_comm_t *comm;
3543     int d,dim,d1;
3544     gmx_bool bRowMember,bRowRoot;
3545     real *cell_f_row;
3546
3547     comm = dd->comm;
3548
3549     for(d=0; d<dd->ndim; d++)
3550     {
3551         dim = dd->dim[d];
3552         bRowMember = TRUE;
3553         bRowRoot = TRUE;
3554         for(d1=d; d1<dd->ndim; d1++)
3555         {
3556             if (dd->ci[dd->dim[d1]] > 0)
3557             {
3558                 if (d1 > d)
3559                 {
3560                     bRowMember = FALSE;
3561                 }
3562                 bRowRoot = FALSE;
3563             }
3564         }
3565         if (bRowMember)
3566         {
3567             if (bRowRoot)
3568             {
3569                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3570                                            ddbox,bDynamicBox,bUniform,step);
3571                 cell_f_row = comm->root[d]->cell_f;
3572             }
3573             else
3574             {
3575                 cell_f_row = comm->cell_f_row;
3576             }
3577             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3578         }
3579     }
3580 }
3581
3582 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3583 {
3584     int d;
3585
3586     /* This function assumes the box is static and should therefore
3587      * not be called when the box has changed since the last
3588      * call to dd_partition_system.
3589      */
3590     for(d=0; d<dd->ndim; d++)
3591     {
3592         relative_to_absolute_cell_bounds(dd,ddbox,d);
3593     }
3594 }
3595
3596
3597
3598 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3599                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3600                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3601                                   gmx_wallcycle_t wcycle)
3602 {
3603     gmx_domdec_comm_t *comm;
3604     int dim;
3605
3606     comm = dd->comm;
3607
3608     if (bDoDLB)
3609     {
3610         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3611         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3612         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3613     }
3614     else if (bDynamicBox)
3615     {
3616         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3617     }
3618
3619     /* Set the dimensions for which no DD is used */
3620     for(dim=0; dim<DIM; dim++) {
3621         if (dd->nc[dim] == 1) {
3622             comm->cell_x0[dim] = 0;
3623             comm->cell_x1[dim] = ddbox->box_size[dim];
3624             if (dim >= ddbox->nboundeddim)
3625             {
3626                 comm->cell_x0[dim] += ddbox->box0[dim];
3627                 comm->cell_x1[dim] += ddbox->box0[dim];
3628             }
3629         }
3630     }
3631 }
3632
3633 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3634 {
3635     int d,np,i;
3636     gmx_domdec_comm_dim_t *cd;
3637
3638     for(d=0; d<dd->ndim; d++)
3639     {
3640         cd = &dd->comm->cd[d];
3641         np = npulse[dd->dim[d]];
3642         if (np > cd->np_nalloc)
3643         {
3644             if (debug)
3645             {
3646                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3647                         dim2char(dd->dim[d]),np);
3648             }
3649             if (DDMASTER(dd) && cd->np_nalloc > 0)
3650             {
3651                 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3652             }
3653             srenew(cd->ind,np);
3654             for(i=cd->np_nalloc; i<np; i++)
3655             {
3656                 cd->ind[i].index  = NULL;
3657                 cd->ind[i].nalloc = 0;
3658             }
3659             cd->np_nalloc = np;
3660         }
3661         cd->np = np;
3662     }
3663 }
3664
3665
3666 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3667                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3668                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3669                               gmx_wallcycle_t wcycle)
3670 {
3671     gmx_domdec_comm_t *comm;
3672     int  d;
3673     ivec npulse;
3674
3675     comm = dd->comm;
3676
3677     /* Copy the old cell boundaries for the cg displacement check */
3678     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3679     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3680
3681     if (comm->bDynLoadBal)
3682     {
3683         if (DDMASTER(dd))
3684         {
3685             check_box_size(dd,ddbox);
3686         }
3687         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3688     }
3689     else
3690     {
3691         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3692         realloc_comm_ind(dd,npulse);
3693     }
3694
3695     if (debug)
3696     {
3697         for(d=0; d<DIM; d++)
3698         {
3699             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3700                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3701         }
3702     }
3703 }
3704
3705 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3706                                   gmx_ddbox_t *ddbox,
3707                                   rvec cell_ns_x0,rvec cell_ns_x1,
3708                                   gmx_large_int_t step)
3709 {
3710     gmx_domdec_comm_t *comm;
3711     int dim_ind,dim;
3712
3713     comm = dd->comm;
3714
3715     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3716     {
3717         dim = dd->dim[dim_ind];
3718
3719         /* Without PBC we don't have restrictions on the outer cells */
3720         if (!(dim >= ddbox->npbcdim &&
3721               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3722             comm->bDynLoadBal &&
3723             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3724             comm->cellsize_min[dim])
3725         {
3726             char buf[22];
3727             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3728                       gmx_step_str(step,buf),dim2char(dim),
3729                       comm->cell_x1[dim] - comm->cell_x0[dim],
3730                       ddbox->skew_fac[dim],
3731                       dd->comm->cellsize_min[dim],
3732                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3733         }
3734     }
3735
3736     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3737     {
3738         /* Communicate the boundaries and update cell_ns_x0/1 */
3739         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3740         if (dd->bGridJump && dd->ndim > 1)
3741         {
3742             check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
3743         }
3744     }
3745 }
3746
3747 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3748 {
3749     if (YY < npbcdim)
3750     {
3751         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3752     }
3753     else
3754     {
3755         tcm[YY][XX] = 0;
3756     }
3757     if (ZZ < npbcdim)
3758     {
3759         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3760         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3761     }
3762     else
3763     {
3764         tcm[ZZ][XX] = 0;
3765         tcm[ZZ][YY] = 0;
3766     }
3767 }
3768
3769 static void check_screw_box(matrix box)
3770 {
3771     /* Mathematical limitation */
3772     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3773     {
3774         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3775     }
3776
3777     /* Limitation due to the asymmetry of the eighth shell method */
3778     if (box[ZZ][YY] != 0)
3779     {
3780         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3781     }
3782 }
3783
3784 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3785                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3786                           gmx_domdec_t *dd)
3787 {
3788     gmx_domdec_master_t *ma;
3789     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3790     int  i,icg,j,k,k0,k1,d,npbcdim;
3791     matrix tcm;
3792     rvec box_size,cg_cm;
3793     ivec ind;
3794     real nrcg,inv_ncg,pos_d;
3795     atom_id *cgindex;
3796     gmx_bool bUnbounded,bScrew;
3797
3798     ma = dd->ma;
3799
3800     if (tmp_ind == NULL)
3801     {
3802         snew(tmp_nalloc,dd->nnodes);
3803         snew(tmp_ind,dd->nnodes);
3804         for(i=0; i<dd->nnodes; i++)
3805         {
3806             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3807             snew(tmp_ind[i],tmp_nalloc[i]);
3808         }
3809     }
3810
3811     /* Clear the count */
3812     for(i=0; i<dd->nnodes; i++)
3813     {
3814         ma->ncg[i] = 0;
3815         ma->nat[i] = 0;
3816     }
3817
3818     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3819
3820     cgindex = cgs->index;
3821
3822     /* Compute the center of geometry for all charge groups */
3823     for(icg=0; icg<cgs->nr; icg++)
3824     {
3825         k0      = cgindex[icg];
3826         k1      = cgindex[icg+1];
3827         nrcg    = k1 - k0;
3828         if (nrcg == 1)
3829         {
3830             copy_rvec(pos[k0],cg_cm);
3831         }
3832         else
3833         {
3834             inv_ncg = 1.0/nrcg;
3835
3836             clear_rvec(cg_cm);
3837             for(k=k0; (k<k1); k++)
3838             {
3839                 rvec_inc(cg_cm,pos[k]);
3840             }
3841             for(d=0; (d<DIM); d++)
3842             {
3843                 cg_cm[d] *= inv_ncg;
3844             }
3845         }
3846         /* Put the charge group in the box and determine the cell index */
3847         for(d=DIM-1; d>=0; d--) {
3848             pos_d = cg_cm[d];
3849             if (d < dd->npbcdim)
3850             {
3851                 bScrew = (dd->bScrewPBC && d == XX);
3852                 if (tric_dir[d] && dd->nc[d] > 1)
3853                 {
3854                     /* Use triclinic coordintates for this dimension */
3855                     for(j=d+1; j<DIM; j++)
3856                     {
3857                         pos_d += cg_cm[j]*tcm[j][d];
3858                     }
3859                 }
3860                 while(pos_d >= box[d][d])
3861                 {
3862                     pos_d -= box[d][d];
3863                     rvec_dec(cg_cm,box[d]);
3864                     if (bScrew)
3865                     {
3866                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3867                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3868                     }
3869                     for(k=k0; (k<k1); k++)
3870                     {
3871                         rvec_dec(pos[k],box[d]);
3872                         if (bScrew)
3873                         {
3874                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3875                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3876                         }
3877                     }
3878                 }
3879                 while(pos_d < 0)
3880                 {
3881                     pos_d += box[d][d];
3882                     rvec_inc(cg_cm,box[d]);
3883                     if (bScrew)
3884                     {
3885                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3886                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3887                     }
3888                     for(k=k0; (k<k1); k++)
3889                     {
3890                         rvec_inc(pos[k],box[d]);
3891                         if (bScrew) {
3892                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3893                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3894                         }
3895                     }
3896                 }
3897             }
3898             /* This could be done more efficiently */
3899             ind[d] = 0;
3900             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3901             {
3902                 ind[d]++;
3903             }
3904         }
3905         i = dd_index(dd->nc,ind);
3906         if (ma->ncg[i] == tmp_nalloc[i])
3907         {
3908             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3909             srenew(tmp_ind[i],tmp_nalloc[i]);
3910         }
3911         tmp_ind[i][ma->ncg[i]] = icg;
3912         ma->ncg[i]++;
3913         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3914     }
3915
3916     k1 = 0;
3917     for(i=0; i<dd->nnodes; i++)
3918     {
3919         ma->index[i] = k1;
3920         for(k=0; k<ma->ncg[i]; k++)
3921         {
3922             ma->cg[k1++] = tmp_ind[i][k];
3923         }
3924     }
3925     ma->index[dd->nnodes] = k1;
3926
3927     for(i=0; i<dd->nnodes; i++)
3928     {
3929         sfree(tmp_ind[i]);
3930     }
3931     sfree(tmp_ind);
3932     sfree(tmp_nalloc);
3933
3934     if (fplog)
3935     {
3936         char buf[22];
3937         fprintf(fplog,"Charge group distribution at step %s:",
3938                 gmx_step_str(step,buf));
3939         for(i=0; i<dd->nnodes; i++)
3940         {
3941             fprintf(fplog," %d",ma->ncg[i]);
3942         }
3943         fprintf(fplog,"\n");
3944     }
3945 }
3946
3947 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3948                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3949                                 rvec pos[])
3950 {
3951     gmx_domdec_master_t *ma=NULL;
3952     ivec npulse;
3953     int  i,cg_gl;
3954     int  *ibuf,buf2[2] = { 0, 0 };
3955     gmx_bool bMaster = DDMASTER(dd);
3956     if (bMaster)
3957     {
3958         ma = dd->ma;
3959
3960         if (dd->bScrewPBC)
3961         {
3962             check_screw_box(box);
3963         }
3964
3965         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3966
3967         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3968         for(i=0; i<dd->nnodes; i++)
3969         {
3970             ma->ibuf[2*i]   = ma->ncg[i];
3971             ma->ibuf[2*i+1] = ma->nat[i];
3972         }
3973         ibuf = ma->ibuf;
3974     }
3975     else
3976     {
3977         ibuf = NULL;
3978     }
3979     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3980
3981     dd->ncg_home = buf2[0];
3982     dd->nat_home = buf2[1];
3983     dd->ncg_tot  = dd->ncg_home;
3984     dd->nat_tot  = dd->nat_home;
3985     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3986     {
3987         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3988         srenew(dd->index_gl,dd->cg_nalloc);
3989         srenew(dd->cgindex,dd->cg_nalloc+1);
3990     }
3991     if (bMaster)
3992     {
3993         for(i=0; i<dd->nnodes; i++)
3994         {
3995             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3996             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3997         }
3998     }
3999
4000     dd_scatterv(dd,
4001                 DDMASTER(dd) ? ma->ibuf : NULL,
4002                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4003                 DDMASTER(dd) ? ma->cg : NULL,
4004                 dd->ncg_home*sizeof(int),dd->index_gl);
4005
4006     /* Determine the home charge group sizes */
4007     dd->cgindex[0] = 0;
4008     for(i=0; i<dd->ncg_home; i++)
4009     {
4010         cg_gl = dd->index_gl[i];
4011         dd->cgindex[i+1] =
4012             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4013     }
4014
4015     if (debug)
4016     {
4017         fprintf(debug,"Home charge groups:\n");
4018         for(i=0; i<dd->ncg_home; i++)
4019         {
4020             fprintf(debug," %d",dd->index_gl[i]);
4021             if (i % 10 == 9)
4022                 fprintf(debug,"\n");
4023         }
4024         fprintf(debug,"\n");
4025     }
4026 }
4027
4028 static int compact_and_copy_vec_at(int ncg,int *move,
4029                                    int *cgindex,
4030                                    int nvec,int vec,
4031                                    rvec *src,gmx_domdec_comm_t *comm,
4032                                    gmx_bool bCompact)
4033 {
4034     int m,icg,i,i0,i1,nrcg;
4035     int home_pos;
4036     int pos_vec[DIM*2];
4037
4038     home_pos = 0;
4039
4040     for(m=0; m<DIM*2; m++)
4041     {
4042         pos_vec[m] = 0;
4043     }
4044
4045     i0 = 0;
4046     for(icg=0; icg<ncg; icg++)
4047     {
4048         i1 = cgindex[icg+1];
4049         m = move[icg];
4050         if (m == -1)
4051         {
4052             if (bCompact)
4053             {
4054                 /* Compact the home array in place */
4055                 for(i=i0; i<i1; i++)
4056                 {
4057                     copy_rvec(src[i],src[home_pos++]);
4058                 }
4059             }
4060         }
4061         else
4062         {
4063             /* Copy to the communication buffer */
4064             nrcg = i1 - i0;
4065             pos_vec[m] += 1 + vec*nrcg;
4066             for(i=i0; i<i1; i++)
4067             {
4068                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
4069             }
4070             pos_vec[m] += (nvec - vec - 1)*nrcg;
4071         }
4072         if (!bCompact)
4073         {
4074             home_pos += i1 - i0;
4075         }
4076         i0 = i1;
4077     }
4078
4079     return home_pos;
4080 }
4081
4082 static int compact_and_copy_vec_cg(int ncg,int *move,
4083                                    int *cgindex,
4084                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
4085                                    gmx_bool bCompact)
4086 {
4087     int m,icg,i0,i1,nrcg;
4088     int home_pos;
4089     int pos_vec[DIM*2];
4090
4091     home_pos = 0;
4092
4093     for(m=0; m<DIM*2; m++)
4094     {
4095         pos_vec[m] = 0;
4096     }
4097
4098     i0 = 0;
4099     for(icg=0; icg<ncg; icg++)
4100     {
4101         i1 = cgindex[icg+1];
4102         m = move[icg];
4103         if (m == -1)
4104         {
4105             if (bCompact)
4106             {
4107                 /* Compact the home array in place */
4108                 copy_rvec(src[icg],src[home_pos++]);
4109             }
4110         }
4111         else
4112         {
4113             nrcg = i1 - i0;
4114             /* Copy to the communication buffer */
4115             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
4116             pos_vec[m] += 1 + nrcg*nvec;
4117         }
4118         i0 = i1;
4119     }
4120     if (!bCompact)
4121     {
4122         home_pos = ncg;
4123     }
4124
4125     return home_pos;
4126 }
4127
4128 static int compact_ind(int ncg,int *move,
4129                        int *index_gl,int *cgindex,
4130                        int *gatindex,
4131                        gmx_ga2la_t ga2la,char *bLocalCG,
4132                        int *cginfo)
4133 {
4134     int cg,nat,a0,a1,a,a_gl;
4135     int home_pos;
4136
4137     home_pos = 0;
4138     nat = 0;
4139     for(cg=0; cg<ncg; cg++)
4140     {
4141         a0 = cgindex[cg];
4142         a1 = cgindex[cg+1];
4143         if (move[cg] == -1)
4144         {
4145             /* Compact the home arrays in place.
4146              * Anything that can be done here avoids access to global arrays.
4147              */
4148             cgindex[home_pos] = nat;
4149             for(a=a0; a<a1; a++)
4150             {
4151                 a_gl = gatindex[a];
4152                 gatindex[nat] = a_gl;
4153                 /* The cell number stays 0, so we don't need to set it */
4154                 ga2la_change_la(ga2la,a_gl,nat);
4155                 nat++;
4156             }
4157             index_gl[home_pos] = index_gl[cg];
4158             cginfo[home_pos]   = cginfo[cg];
4159             /* The charge group remains local, so bLocalCG does not change */
4160             home_pos++;
4161         }
4162         else
4163         {
4164             /* Clear the global indices */
4165             for(a=a0; a<a1; a++)
4166             {
4167                 ga2la_del(ga2la,gatindex[a]);
4168             }
4169             if (bLocalCG)
4170             {
4171                 bLocalCG[index_gl[cg]] = FALSE;
4172             }
4173         }
4174     }
4175     cgindex[home_pos] = nat;
4176
4177     return home_pos;
4178 }
4179
4180 static void clear_and_mark_ind(int ncg,int *move,
4181                                int *index_gl,int *cgindex,int *gatindex,
4182                                gmx_ga2la_t ga2la,char *bLocalCG,
4183                                int *cell_index)
4184 {
4185     int cg,a0,a1,a;
4186
4187     for(cg=0; cg<ncg; cg++)
4188     {
4189         if (move[cg] >= 0)
4190         {
4191             a0 = cgindex[cg];
4192             a1 = cgindex[cg+1];
4193             /* Clear the global indices */
4194             for(a=a0; a<a1; a++)
4195             {
4196                 ga2la_del(ga2la,gatindex[a]);
4197             }
4198             if (bLocalCG)
4199             {
4200                 bLocalCG[index_gl[cg]] = FALSE;
4201             }
4202             /* Signal that this cg has moved using the ns cell index.
4203              * Here we set it to -1. fill_grid will change it
4204              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4205              */
4206             cell_index[cg] = -1;
4207         }
4208     }
4209 }
4210
4211 static void print_cg_move(FILE *fplog,
4212                           gmx_domdec_t *dd,
4213                           gmx_large_int_t step,int cg,int dim,int dir,
4214                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4215                           rvec cm_old,rvec cm_new,real pos_d)
4216 {
4217     gmx_domdec_comm_t *comm;
4218     char buf[22];
4219
4220     comm = dd->comm;
4221
4222     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4223     if (bHaveLimitdAndCMOld)
4224     {
4225         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4226                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4227     }
4228     else
4229     {
4230         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4231                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4232     }
4233     fprintf(fplog,"distance out of cell %f\n",
4234             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4235     if (bHaveLimitdAndCMOld)
4236     {
4237         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4238                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4239     }
4240     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4241             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4242     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4243             dim2char(dim),
4244             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4245     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4246             dim2char(dim),
4247             comm->cell_x0[dim],comm->cell_x1[dim]);
4248 }
4249
4250 static void cg_move_error(FILE *fplog,
4251                           gmx_domdec_t *dd,
4252                           gmx_large_int_t step,int cg,int dim,int dir,
4253                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4254                           rvec cm_old,rvec cm_new,real pos_d)
4255 {
4256     if (fplog)
4257     {
4258         print_cg_move(fplog, dd,step,cg,dim,dir,
4259                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4260     }
4261     print_cg_move(stderr,dd,step,cg,dim,dir,
4262                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4263     gmx_fatal(FARGS,
4264               "A charge group moved too far between two domain decomposition steps\n"
4265               "This usually means that your system is not well equilibrated");
4266 }
4267
4268 static void rotate_state_atom(t_state *state,int a)
4269 {
4270     int est;
4271
4272     for(est=0; est<estNR; est++)
4273     {
4274         if (EST_DISTR(est) && (state->flags & (1<<est))) {
4275             switch (est) {
4276             case estX:
4277                 /* Rotate the complete state; for a rectangular box only */
4278                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4279                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4280                 break;
4281             case estV:
4282                 state->v[a][YY] = -state->v[a][YY];
4283                 state->v[a][ZZ] = -state->v[a][ZZ];
4284                 break;
4285             case estSDX:
4286                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4287                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4288                 break;
4289             case estCGP:
4290                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4291                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4292                 break;
4293             case estDISRE_INITF:
4294             case estDISRE_RM3TAV:
4295             case estORIRE_INITF:
4296             case estORIRE_DTAV:
4297                 /* These are distances, so not affected by rotation */
4298                 break;
4299             default:
4300                 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4301             }
4302         }
4303     }
4304 }
4305
4306 static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
4307 {
4308     if (natoms > comm->moved_nalloc)
4309     {
4310         /* Contents should be preserved here */
4311         comm->moved_nalloc = over_alloc_dd(natoms);
4312         srenew(comm->moved,comm->moved_nalloc);
4313     }
4314
4315     return comm->moved;
4316 }
4317
4318 static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
4319                          gmx_domdec_t *dd,
4320                          t_state *state,
4321                          ivec tric_dir,matrix tcm,
4322                          rvec cell_x0,rvec cell_x1,
4323                          rvec limitd,rvec limit0,rvec limit1,
4324                          const int *cgindex,
4325                          int cg_start,int cg_end,
4326                          rvec *cg_cm,
4327                          int *move)
4328 {
4329     int  npbcdim;
4330     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4331     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4332     int  flag;
4333     gmx_bool bScrew;
4334     ivec dev;
4335     real inv_ncg,pos_d;
4336     rvec cm_new;
4337
4338     npbcdim = dd->npbcdim;
4339
4340     for(cg=cg_start; cg<cg_end; cg++)
4341     {
4342         k0   = cgindex[cg];
4343         k1   = cgindex[cg+1];
4344         nrcg = k1 - k0;
4345         if (nrcg == 1)
4346         {
4347             copy_rvec(state->x[k0],cm_new);
4348         }
4349         else
4350         {
4351             inv_ncg = 1.0/nrcg;
4352
4353             clear_rvec(cm_new);
4354             for(k=k0; (k<k1); k++)
4355             {
4356                 rvec_inc(cm_new,state->x[k]);
4357             }
4358             for(d=0; (d<DIM); d++)
4359             {
4360                 cm_new[d] = inv_ncg*cm_new[d];
4361             }
4362         }
4363
4364         clear_ivec(dev);
4365         /* Do pbc and check DD cell boundary crossings */
4366         for(d=DIM-1; d>=0; d--)
4367         {
4368             if (dd->nc[d] > 1)
4369             {
4370                 bScrew = (dd->bScrewPBC && d == XX);
4371                 /* Determine the location of this cg in lattice coordinates */
4372                 pos_d = cm_new[d];
4373                 if (tric_dir[d])
4374                 {
4375                     for(d2=d+1; d2<DIM; d2++)
4376                     {
4377                         pos_d += cm_new[d2]*tcm[d2][d];
4378                     }
4379                 }
4380                 /* Put the charge group in the triclinic unit-cell */
4381                 if (pos_d >= cell_x1[d])
4382                 {
4383                     if (pos_d >= limit1[d])
4384                     {
4385                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4386                                       cg_cm[cg],cm_new,pos_d);
4387                     }
4388                     dev[d] = 1;
4389                     if (dd->ci[d] == dd->nc[d] - 1)
4390                     {
4391                         rvec_dec(cm_new,state->box[d]);
4392                         if (bScrew)
4393                         {
4394                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4395                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4396                         }
4397                         for(k=k0; (k<k1); k++)
4398                         {
4399                             rvec_dec(state->x[k],state->box[d]);
4400                             if (bScrew)
4401                             {
4402                                 rotate_state_atom(state,k);
4403                             }
4404                         }
4405                     }
4406                 }
4407                 else if (pos_d < cell_x0[d])
4408                 {
4409                     if (pos_d < limit0[d])
4410                     {
4411                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4412                                       cg_cm[cg],cm_new,pos_d);
4413                     }
4414                     dev[d] = -1;
4415                     if (dd->ci[d] == 0)
4416                     {
4417                         rvec_inc(cm_new,state->box[d]);
4418                         if (bScrew)
4419                         {
4420                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4421                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4422                         }
4423                         for(k=k0; (k<k1); k++)
4424                         {
4425                             rvec_inc(state->x[k],state->box[d]);
4426                             if (bScrew)
4427                             {
4428                                 rotate_state_atom(state,k);
4429                             }
4430                         }
4431                     }
4432                 }
4433             }
4434             else if (d < npbcdim)
4435             {
4436                 /* Put the charge group in the rectangular unit-cell */
4437                 while (cm_new[d] >= state->box[d][d])
4438                 {
4439                     rvec_dec(cm_new,state->box[d]);
4440                     for(k=k0; (k<k1); k++)
4441                     {
4442                         rvec_dec(state->x[k],state->box[d]);
4443                     }
4444                 }
4445                 while (cm_new[d] < 0)
4446                 {
4447                     rvec_inc(cm_new,state->box[d]);
4448                     for(k=k0; (k<k1); k++)
4449                     {
4450                         rvec_inc(state->x[k],state->box[d]);
4451                     }
4452                 }
4453             }
4454         }
4455
4456         copy_rvec(cm_new,cg_cm[cg]);
4457
4458         /* Determine where this cg should go */
4459         flag = 0;
4460         mc = -1;
4461         for(d=0; d<dd->ndim; d++)
4462         {
4463             dim = dd->dim[d];
4464             if (dev[dim] == 1)
4465             {
4466                 flag |= DD_FLAG_FW(d);
4467                 if (mc == -1)
4468                 {
4469                     mc = d*2;
4470                 }
4471             }
4472             else if (dev[dim] == -1)
4473             {
4474                 flag |= DD_FLAG_BW(d);
4475                 if (mc == -1) {
4476                     if (dd->nc[dim] > 2)
4477                     {
4478                         mc = d*2 + 1;
4479                     }
4480                     else
4481                     {
4482                         mc = d*2;
4483                     }
4484                 }
4485             }
4486         }
4487         /* Temporarily store the flag in move */
4488         move[cg] = mc + flag;
4489     }
4490 }
4491
4492 static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4493                                gmx_domdec_t *dd,ivec tric_dir,
4494                                t_state *state,rvec **f,
4495                                t_forcerec *fr,t_mdatoms *md,
4496                                gmx_bool bCompact,
4497                                t_nrnb *nrnb,
4498                                int *ncg_stay_home,
4499                                int *ncg_moved)
4500 {
4501     int  *move;
4502     int  npbcdim;
4503     int  ncg[DIM*2],nat[DIM*2];
4504     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4505     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4506     int  sbuf[2],rbuf[2];
4507     int  home_pos_cg,home_pos_at,buf_pos;
4508     int  flag;
4509     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4510     gmx_bool bScrew;
4511     ivec dev;
4512     real inv_ncg,pos_d;
4513     matrix tcm;
4514     rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4515     atom_id *cgindex;
4516     cginfo_mb_t *cginfo_mb;
4517     gmx_domdec_comm_t *comm;
4518     int  *moved;
4519     int  nthread,thread;
4520
4521     if (dd->bScrewPBC)
4522     {
4523         check_screw_box(state->box);
4524     }
4525
4526     comm  = dd->comm;
4527     if (fr->cutoff_scheme == ecutsGROUP)
4528     {
4529         cg_cm = fr->cg_cm;
4530     }
4531
4532     for(i=0; i<estNR; i++)
4533     {
4534         if (EST_DISTR(i))
4535         {
4536             switch (i)
4537             {
4538             case estX:   /* Always present */            break;
4539             case estV:   bV   = (state->flags & (1<<i)); break;
4540             case estSDX: bSDX = (state->flags & (1<<i)); break;
4541             case estCGP: bCGP = (state->flags & (1<<i)); break;
4542             case estLD_RNG:
4543             case estLD_RNGI:
4544             case estDISRE_INITF:
4545             case estDISRE_RM3TAV:
4546             case estORIRE_INITF:
4547             case estORIRE_DTAV:
4548                 /* No processing required */
4549                 break;
4550             default:
4551             gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4552             }
4553         }
4554     }
4555
4556     if (dd->ncg_tot > comm->nalloc_int)
4557     {
4558         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4559         srenew(comm->buf_int,comm->nalloc_int);
4560     }
4561     move = comm->buf_int;
4562
4563     /* Clear the count */
4564     for(c=0; c<dd->ndim*2; c++)
4565     {
4566         ncg[c] = 0;
4567         nat[c] = 0;
4568     }
4569
4570     npbcdim = dd->npbcdim;
4571
4572     for(d=0; (d<DIM); d++)
4573     {
4574         limitd[d] = dd->comm->cellsize_min[d];
4575         if (d >= npbcdim && dd->ci[d] == 0)
4576         {
4577             cell_x0[d] = -GMX_FLOAT_MAX;
4578         }
4579         else
4580         {
4581             cell_x0[d] = comm->cell_x0[d];
4582         }
4583         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4584         {
4585             cell_x1[d] = GMX_FLOAT_MAX;
4586         }
4587         else
4588         {
4589             cell_x1[d] = comm->cell_x1[d];
4590         }
4591         if (d < npbcdim)
4592         {
4593             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4594             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4595         }
4596         else
4597         {
4598             /* We check after communication if a charge group moved
4599              * more than one cell. Set the pre-comm check limit to float_max.
4600              */
4601             limit0[d] = -GMX_FLOAT_MAX;
4602             limit1[d] =  GMX_FLOAT_MAX;
4603         }
4604     }
4605
4606     make_tric_corr_matrix(npbcdim,state->box,tcm);
4607
4608     cgindex = dd->cgindex;
4609
4610     nthread = gmx_omp_nthreads_get(emntDomdec);
4611
4612     /* Compute the center of geometry for all home charge groups
4613      * and put them in the box and determine where they should go.
4614      */
4615 #pragma omp parallel for num_threads(nthread) schedule(static)
4616     for(thread=0; thread<nthread; thread++)
4617     {
4618         calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
4619                      cell_x0,cell_x1,limitd,limit0,limit1,
4620                      cgindex,
4621                      ( thread   *dd->ncg_home)/nthread,
4622                      ((thread+1)*dd->ncg_home)/nthread,
4623                      fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
4624                      move);
4625     }
4626
4627     for(cg=0; cg<dd->ncg_home; cg++)
4628     {
4629         if (move[cg] >= 0)
4630         {
4631             mc = move[cg];
4632             flag     = mc & ~DD_FLAG_NRCG;
4633             mc       = mc & DD_FLAG_NRCG;
4634             move[cg] = mc;
4635
4636             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4637             {
4638                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4639                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4640             }
4641             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4642             /* We store the cg size in the lower 16 bits
4643              * and the place where the charge group should go
4644              * in the next 6 bits. This saves some communication volume.
4645              */
4646             nrcg = cgindex[cg+1] - cgindex[cg];
4647             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4648             ncg[mc] += 1;
4649             nat[mc] += nrcg;
4650         }
4651     }
4652
4653     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4654     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4655
4656     *ncg_moved = 0;
4657     for(i=0; i<dd->ndim*2; i++)
4658     {
4659         *ncg_moved += ncg[i];
4660     }
4661
4662     nvec = 1;
4663     if (bV)
4664     {
4665         nvec++;
4666     }
4667     if (bSDX)
4668     {
4669         nvec++;
4670     }
4671     if (bCGP)
4672     {
4673         nvec++;
4674     }
4675
4676     /* Make sure the communication buffers are large enough */
4677     for(mc=0; mc<dd->ndim*2; mc++)
4678     {
4679         nvr = ncg[mc] + nat[mc]*nvec;
4680         if (nvr > comm->cgcm_state_nalloc[mc])
4681         {
4682             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4683             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4684         }
4685     }
4686
4687     switch (fr->cutoff_scheme)
4688     {
4689     case ecutsGROUP:
4690         /* Recalculating cg_cm might be cheaper than communicating,
4691          * but that could give rise to rounding issues.
4692          */
4693         home_pos_cg =
4694             compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4695                                     nvec,cg_cm,comm,bCompact);
4696     break;
4697     case ecutsVERLET:
4698         /* Without charge groups we send the moved atom coordinates
4699          * over twice. This is so the code below can be used without
4700          * many conditionals for both for with and without charge groups.
4701          */
4702         home_pos_cg =
4703             compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4704                                     nvec,state->x,comm,FALSE);
4705         if (bCompact)
4706         {
4707             home_pos_cg -= *ncg_moved;
4708         }
4709         break;
4710     default:
4711         gmx_incons("unimplemented");
4712         home_pos_cg = 0;
4713     }
4714
4715     vec = 0;
4716     home_pos_at =
4717         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4718                                 nvec,vec++,state->x,comm,bCompact);
4719     if (bV)
4720     {
4721         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4722                                 nvec,vec++,state->v,comm,bCompact);
4723     }
4724     if (bSDX)
4725     {
4726         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4727                                 nvec,vec++,state->sd_X,comm,bCompact);
4728     }
4729     if (bCGP)
4730     {
4731         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4732                                 nvec,vec++,state->cg_p,comm,bCompact);
4733     }
4734
4735     if (bCompact)
4736     {
4737         compact_ind(dd->ncg_home,move,
4738                     dd->index_gl,dd->cgindex,dd->gatindex,
4739                     dd->ga2la,comm->bLocalCG,
4740                     fr->cginfo);
4741     }
4742     else
4743     {
4744         if (fr->cutoff_scheme == ecutsVERLET)
4745         {
4746             moved = get_moved(comm,dd->ncg_home);
4747
4748             for(k=0; k<dd->ncg_home; k++)
4749             {
4750                 moved[k] = 0;
4751             }
4752         }
4753         else
4754         {
4755             moved = fr->ns.grid->cell_index;
4756         }
4757
4758         clear_and_mark_ind(dd->ncg_home,move,
4759                            dd->index_gl,dd->cgindex,dd->gatindex,
4760                            dd->ga2la,comm->bLocalCG,
4761                            moved);
4762     }
4763
4764     cginfo_mb = fr->cginfo_mb;
4765
4766     *ncg_stay_home = home_pos_cg;
4767     for(d=0; d<dd->ndim; d++)
4768     {
4769         dim = dd->dim[d];
4770         ncg_recv = 0;
4771         nat_recv = 0;
4772         nvr      = 0;
4773         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4774         {
4775             cdd = d*2 + dir;
4776             /* Communicate the cg and atom counts */
4777             sbuf[0] = ncg[cdd];
4778             sbuf[1] = nat[cdd];
4779             if (debug)
4780             {
4781                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4782                         d,dir,sbuf[0],sbuf[1]);
4783             }
4784             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4785
4786             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4787             {
4788                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4789                 srenew(comm->buf_int,comm->nalloc_int);
4790             }
4791
4792             /* Communicate the charge group indices, sizes and flags */
4793             dd_sendrecv_int(dd, d, dir,
4794                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4795                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4796
4797             nvs = ncg[cdd] + nat[cdd]*nvec;
4798             i   = rbuf[0]  + rbuf[1] *nvec;
4799             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4800
4801             /* Communicate cgcm and state */
4802             dd_sendrecv_rvec(dd, d, dir,
4803                              comm->cgcm_state[cdd], nvs,
4804                              comm->vbuf.v+nvr, i);
4805             ncg_recv += rbuf[0];
4806             nat_recv += rbuf[1];
4807             nvr      += i;
4808         }
4809
4810         /* Process the received charge groups */
4811         buf_pos = 0;
4812         for(cg=0; cg<ncg_recv; cg++)
4813         {
4814             flag = comm->buf_int[cg*DD_CGIBS+1];
4815
4816             if (dim >= npbcdim && dd->nc[dim] > 2)
4817             {
4818                 /* No pbc in this dim and more than one domain boundary.
4819                  * We do a separate check if a charge group didn't move too far.
4820                  */
4821                 if (((flag & DD_FLAG_FW(d)) &&
4822                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4823                     ((flag & DD_FLAG_BW(d)) &&
4824                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4825                 {
4826                     cg_move_error(fplog,dd,step,cg,dim,
4827                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4828                                    FALSE,0,
4829                                    comm->vbuf.v[buf_pos],
4830                                    comm->vbuf.v[buf_pos],
4831                                    comm->vbuf.v[buf_pos][dim]);
4832                 }
4833             }
4834
4835             mc = -1;
4836             if (d < dd->ndim-1)
4837             {
4838                 /* Check which direction this cg should go */
4839                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4840                 {
4841                     if (dd->bGridJump)
4842                     {
4843                         /* The cell boundaries for dimension d2 are not equal
4844                          * for each cell row of the lower dimension(s),
4845                          * therefore we might need to redetermine where
4846                          * this cg should go.
4847                          */
4848                         dim2 = dd->dim[d2];
4849                         /* If this cg crosses the box boundary in dimension d2
4850                          * we can use the communicated flag, so we do not
4851                          * have to worry about pbc.
4852                          */
4853                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4854                                (flag & DD_FLAG_FW(d2))) ||
4855                               (dd->ci[dim2] == 0 &&
4856                                (flag & DD_FLAG_BW(d2)))))
4857                         {
4858                             /* Clear the two flags for this dimension */
4859                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4860                             /* Determine the location of this cg
4861                              * in lattice coordinates
4862                              */
4863                             pos_d = comm->vbuf.v[buf_pos][dim2];
4864                             if (tric_dir[dim2])
4865                             {
4866                                 for(d3=dim2+1; d3<DIM; d3++)
4867                                 {
4868                                     pos_d +=
4869                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4870                                 }
4871                             }
4872                             /* Check of we are not at the box edge.
4873                              * pbc is only handled in the first step above,
4874                              * but this check could move over pbc while
4875                              * the first step did not due to different rounding.
4876                              */
4877                             if (pos_d >= cell_x1[dim2] &&
4878                                 dd->ci[dim2] != dd->nc[dim2]-1)
4879                             {
4880                                 flag |= DD_FLAG_FW(d2);
4881                             }
4882                             else if (pos_d < cell_x0[dim2] &&
4883                                      dd->ci[dim2] != 0)
4884                             {
4885                                 flag |= DD_FLAG_BW(d2);
4886                             }
4887                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4888                         }
4889                     }
4890                     /* Set to which neighboring cell this cg should go */
4891                     if (flag & DD_FLAG_FW(d2))
4892                     {
4893                         mc = d2*2;
4894                     }
4895                     else if (flag & DD_FLAG_BW(d2))
4896                     {
4897                         if (dd->nc[dd->dim[d2]] > 2)
4898                         {
4899                             mc = d2*2+1;
4900                         }
4901                         else
4902                         {
4903                             mc = d2*2;
4904                         }
4905                     }
4906                 }
4907             }
4908
4909             nrcg = flag & DD_FLAG_NRCG;
4910             if (mc == -1)
4911             {
4912                 if (home_pos_cg+1 > dd->cg_nalloc)
4913                 {
4914                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4915                     srenew(dd->index_gl,dd->cg_nalloc);
4916                     srenew(dd->cgindex,dd->cg_nalloc+1);
4917                 }
4918                 /* Set the global charge group index and size */
4919                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4920                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4921                 /* Copy the state from the buffer */
4922                 dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
4923                 if (fr->cutoff_scheme == ecutsGROUP)
4924                 {
4925                     cg_cm = fr->cg_cm;
4926                     copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
4927                 }
4928                 buf_pos++;
4929
4930                 /* Set the cginfo */
4931                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4932                                                    dd->index_gl[home_pos_cg]);
4933                 if (comm->bLocalCG)
4934                 {
4935                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4936                 }
4937
4938                 if (home_pos_at+nrcg > state->nalloc)
4939                 {
4940                     dd_realloc_state(state,f,home_pos_at+nrcg);
4941                 }
4942                 for(i=0; i<nrcg; i++)
4943                 {
4944                     copy_rvec(comm->vbuf.v[buf_pos++],
4945                               state->x[home_pos_at+i]);
4946                 }
4947                 if (bV)
4948                 {
4949                     for(i=0; i<nrcg; i++)
4950                     {
4951                         copy_rvec(comm->vbuf.v[buf_pos++],
4952                                   state->v[home_pos_at+i]);
4953                     }
4954                 }
4955                 if (bSDX)
4956                 {
4957                     for(i=0; i<nrcg; i++)
4958                     {
4959                         copy_rvec(comm->vbuf.v[buf_pos++],
4960                                   state->sd_X[home_pos_at+i]);
4961                     }
4962                 }
4963                 if (bCGP)
4964                 {
4965                     for(i=0; i<nrcg; i++)
4966                     {
4967                         copy_rvec(comm->vbuf.v[buf_pos++],
4968                                   state->cg_p[home_pos_at+i]);
4969                     }
4970                 }
4971                 home_pos_cg += 1;
4972                 home_pos_at += nrcg;
4973             }
4974             else
4975             {
4976                 /* Reallocate the buffers if necessary  */
4977                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4978                 {
4979                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4980                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4981                 }
4982                 nvr = ncg[mc] + nat[mc]*nvec;
4983                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4984                 {
4985                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4986                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4987                 }
4988                 /* Copy from the receive to the send buffers */
4989                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4990                        comm->buf_int + cg*DD_CGIBS,
4991                        DD_CGIBS*sizeof(int));
4992                 memcpy(comm->cgcm_state[mc][nvr],
4993                        comm->vbuf.v[buf_pos],
4994                        (1+nrcg*nvec)*sizeof(rvec));
4995                 buf_pos += 1 + nrcg*nvec;
4996                 ncg[mc] += 1;
4997                 nat[mc] += nrcg;
4998             }
4999         }
5000     }
5001
5002     /* With sorting (!bCompact) the indices are now only partially up to date
5003      * and ncg_home and nat_home are not the real count, since there are
5004      * "holes" in the arrays for the charge groups that moved to neighbors.
5005      */
5006     if (fr->cutoff_scheme == ecutsVERLET)
5007     {
5008         moved = get_moved(comm,home_pos_cg);
5009
5010         for(i=dd->ncg_home; i<home_pos_cg; i++)
5011         {
5012             moved[i] = 0;
5013         }
5014     }
5015     dd->ncg_home = home_pos_cg;
5016     dd->nat_home = home_pos_at;
5017
5018     if (debug)
5019     {
5020         fprintf(debug,
5021                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5022                 *ncg_moved,dd->ncg_home-*ncg_moved);
5023
5024     }
5025 }
5026
5027 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
5028 {
5029     dd->comm->cycl[ddCycl] += cycles;
5030     dd->comm->cycl_n[ddCycl]++;
5031     if (cycles > dd->comm->cycl_max[ddCycl])
5032     {
5033         dd->comm->cycl_max[ddCycl] = cycles;
5034     }
5035 }
5036
5037 static double force_flop_count(t_nrnb *nrnb)
5038 {
5039     int i;
5040     double sum;
5041     const char *name;
5042
5043     sum = 0;
5044     for(i=0; i<eNR_NBKERNEL_FREE_ENERGY; i++)
5045     {
5046         /* To get closer to the real timings, we half the count
5047          * for the normal loops and again half it for water loops.
5048          */
5049         name = nrnb_str(i);
5050         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5051         {
5052             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5053         }
5054         else
5055         {
5056             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5057         }
5058     }
5059     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
5060     {
5061         name = nrnb_str(i);
5062         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5063         sum += nrnb->n[i]*cost_nrnb(i);
5064     }
5065     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
5066     {
5067         sum += nrnb->n[i]*cost_nrnb(i);
5068     }
5069
5070     return sum;
5071 }
5072
5073 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
5074 {
5075     if (dd->comm->eFlop)
5076     {
5077         dd->comm->flop -= force_flop_count(nrnb);
5078     }
5079 }
5080 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
5081 {
5082     if (dd->comm->eFlop)
5083     {
5084         dd->comm->flop += force_flop_count(nrnb);
5085         dd->comm->flop_n++;
5086     }
5087 }
5088
5089 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5090 {
5091     int i;
5092
5093     for(i=0; i<ddCyclNr; i++)
5094     {
5095         dd->comm->cycl[i] = 0;
5096         dd->comm->cycl_n[i] = 0;
5097         dd->comm->cycl_max[i] = 0;
5098     }
5099     dd->comm->flop = 0;
5100     dd->comm->flop_n = 0;
5101 }
5102
5103 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
5104 {
5105     gmx_domdec_comm_t *comm;
5106     gmx_domdec_load_t *load;
5107     gmx_domdec_root_t *root=NULL;
5108     int  d,dim,cid,i,pos;
5109     float cell_frac=0,sbuf[DD_NLOAD_MAX];
5110     gmx_bool bSepPME;
5111
5112     if (debug)
5113     {
5114         fprintf(debug,"get_load_distribution start\n");
5115     }
5116
5117     wallcycle_start(wcycle,ewcDDCOMMLOAD);
5118
5119     comm = dd->comm;
5120
5121     bSepPME = (dd->pme_nodeid >= 0);
5122
5123     for(d=dd->ndim-1; d>=0; d--)
5124     {
5125         dim = dd->dim[d];
5126         /* Check if we participate in the communication in this dimension */
5127         if (d == dd->ndim-1 ||
5128             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
5129         {
5130             load = &comm->load[d];
5131             if (dd->bGridJump)
5132             {
5133                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5134             }
5135             pos = 0;
5136             if (d == dd->ndim-1)
5137             {
5138                 sbuf[pos++] = dd_force_load(comm);
5139                 sbuf[pos++] = sbuf[0];
5140                 if (dd->bGridJump)
5141                 {
5142                     sbuf[pos++] = sbuf[0];
5143                     sbuf[pos++] = cell_frac;
5144                     if (d > 0)
5145                     {
5146                         sbuf[pos++] = comm->cell_f_max0[d];
5147                         sbuf[pos++] = comm->cell_f_min1[d];
5148                     }
5149                 }
5150                 if (bSepPME)
5151                 {
5152                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5153                     sbuf[pos++] = comm->cycl[ddCyclPME];
5154                 }
5155             }
5156             else
5157             {
5158                 sbuf[pos++] = comm->load[d+1].sum;
5159                 sbuf[pos++] = comm->load[d+1].max;
5160                 if (dd->bGridJump)
5161                 {
5162                     sbuf[pos++] = comm->load[d+1].sum_m;
5163                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5164                     sbuf[pos++] = comm->load[d+1].flags;
5165                     if (d > 0)
5166                     {
5167                         sbuf[pos++] = comm->cell_f_max0[d];
5168                         sbuf[pos++] = comm->cell_f_min1[d];
5169                     }
5170                 }
5171                 if (bSepPME)
5172                 {
5173                     sbuf[pos++] = comm->load[d+1].mdf;
5174                     sbuf[pos++] = comm->load[d+1].pme;
5175                 }
5176             }
5177             load->nload = pos;
5178             /* Communicate a row in DD direction d.
5179              * The communicators are setup such that the root always has rank 0.
5180              */
5181 #ifdef GMX_MPI
5182             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
5183                        load->load,load->nload*sizeof(float),MPI_BYTE,
5184                        0,comm->mpi_comm_load[d]);
5185 #endif
5186             if (dd->ci[dim] == dd->master_ci[dim])
5187             {
5188                 /* We are the root, process this row */
5189                 if (comm->bDynLoadBal)
5190                 {
5191                     root = comm->root[d];
5192                 }
5193                 load->sum = 0;
5194                 load->max = 0;
5195                 load->sum_m = 0;
5196                 load->cvol_min = 1;
5197                 load->flags = 0;
5198                 load->mdf = 0;
5199                 load->pme = 0;
5200                 pos = 0;
5201                 for(i=0; i<dd->nc[dim]; i++)
5202                 {
5203                     load->sum += load->load[pos++];
5204                     load->max = max(load->max,load->load[pos]);
5205                     pos++;
5206                     if (dd->bGridJump)
5207                     {
5208                         if (root->bLimited)
5209                         {
5210                             /* This direction could not be load balanced properly,
5211                              * therefore we need to use the maximum iso the average load.
5212                              */
5213                             load->sum_m = max(load->sum_m,load->load[pos]);
5214                         }
5215                         else
5216                         {
5217                             load->sum_m += load->load[pos];
5218                         }
5219                         pos++;
5220                         load->cvol_min = min(load->cvol_min,load->load[pos]);
5221                         pos++;
5222                         if (d < dd->ndim-1)
5223                         {
5224                             load->flags = (int)(load->load[pos++] + 0.5);
5225                         }
5226                         if (d > 0)
5227                         {
5228                             root->cell_f_max0[i] = load->load[pos++];
5229                             root->cell_f_min1[i] = load->load[pos++];
5230                         }
5231                     }
5232                     if (bSepPME)
5233                     {
5234                         load->mdf = max(load->mdf,load->load[pos]);
5235                         pos++;
5236                         load->pme = max(load->pme,load->load[pos]);
5237                         pos++;
5238                     }
5239                 }
5240                 if (comm->bDynLoadBal && root->bLimited)
5241                 {
5242                     load->sum_m *= dd->nc[dim];
5243                     load->flags |= (1<<d);
5244                 }
5245             }
5246         }
5247     }
5248
5249     if (DDMASTER(dd))
5250     {
5251         comm->nload      += dd_load_count(comm);
5252         comm->load_step  += comm->cycl[ddCyclStep];
5253         comm->load_sum   += comm->load[0].sum;
5254         comm->load_max   += comm->load[0].max;
5255         if (comm->bDynLoadBal)
5256         {
5257             for(d=0; d<dd->ndim; d++)
5258             {
5259                 if (comm->load[0].flags & (1<<d))
5260                 {
5261                     comm->load_lim[d]++;
5262                 }
5263             }
5264         }
5265         if (bSepPME)
5266         {
5267             comm->load_mdf += comm->load[0].mdf;
5268             comm->load_pme += comm->load[0].pme;
5269         }
5270     }
5271
5272     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5273
5274     if (debug)
5275     {
5276         fprintf(debug,"get_load_distribution finished\n");
5277     }
5278 }
5279
5280 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5281 {
5282     /* Return the relative performance loss on the total run time
5283      * due to the force calculation load imbalance.
5284      */
5285     if (dd->comm->nload > 0)
5286     {
5287         return
5288             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5289             (dd->comm->load_step*dd->nnodes);
5290     }
5291     else
5292     {
5293         return 0;
5294     }
5295 }
5296
5297 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5298 {
5299     char  buf[STRLEN];
5300     int   npp,npme,nnodes,d,limp;
5301     float imbal,pme_f_ratio,lossf,lossp=0;
5302     gmx_bool  bLim;
5303     gmx_domdec_comm_t *comm;
5304
5305     comm = dd->comm;
5306     if (DDMASTER(dd) && comm->nload > 0)
5307     {
5308         npp    = dd->nnodes;
5309         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5310         nnodes = npp + npme;
5311         imbal = comm->load_max*npp/comm->load_sum - 1;
5312         lossf = dd_force_imb_perf_loss(dd);
5313         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5314         fprintf(fplog,"%s",buf);
5315         fprintf(stderr,"\n");
5316         fprintf(stderr,"%s",buf);
5317         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5318         fprintf(fplog,"%s",buf);
5319         fprintf(stderr,"%s",buf);
5320         bLim = FALSE;
5321         if (comm->bDynLoadBal)
5322         {
5323             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5324             for(d=0; d<dd->ndim; d++)
5325             {
5326                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5327                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5328                 if (limp >= 50)
5329                 {
5330                     bLim = TRUE;
5331                 }
5332             }
5333             sprintf(buf+strlen(buf),"\n");
5334             fprintf(fplog,"%s",buf);
5335             fprintf(stderr,"%s",buf);
5336         }
5337         if (npme > 0)
5338         {
5339             pme_f_ratio = comm->load_pme/comm->load_mdf;
5340             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5341             if (lossp <= 0)
5342             {
5343                 lossp *= (float)npme/(float)nnodes;
5344             }
5345             else
5346             {
5347                 lossp *= (float)npp/(float)nnodes;
5348             }
5349             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5350             fprintf(fplog,"%s",buf);
5351             fprintf(stderr,"%s",buf);
5352             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5353             fprintf(fplog,"%s",buf);
5354             fprintf(stderr,"%s",buf);
5355         }
5356         fprintf(fplog,"\n");
5357         fprintf(stderr,"\n");
5358
5359         if (lossf >= DD_PERF_LOSS)
5360         {
5361             sprintf(buf,
5362                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5363                     "      in the domain decomposition.\n",lossf*100);
5364             if (!comm->bDynLoadBal)
5365             {
5366                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
5367             }
5368             else if (bLim)
5369             {
5370                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5371             }
5372             fprintf(fplog,"%s\n",buf);
5373             fprintf(stderr,"%s\n",buf);
5374         }
5375         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5376         {
5377             sprintf(buf,
5378                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5379                     "      had %s work to do than the PP nodes.\n"
5380                     "      You might want to %s the number of PME nodes\n"
5381                     "      or %s the cut-off and the grid spacing.\n",
5382                     fabs(lossp*100),
5383                     (lossp < 0) ? "less"     : "more",
5384                     (lossp < 0) ? "decrease" : "increase",
5385                     (lossp < 0) ? "decrease" : "increase");
5386             fprintf(fplog,"%s\n",buf);
5387             fprintf(stderr,"%s\n",buf);
5388         }
5389     }
5390 }
5391
5392 static float dd_vol_min(gmx_domdec_t *dd)
5393 {
5394     return dd->comm->load[0].cvol_min*dd->nnodes;
5395 }
5396
5397 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5398 {
5399     return dd->comm->load[0].flags;
5400 }
5401
5402 static float dd_f_imbal(gmx_domdec_t *dd)
5403 {
5404     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5405 }
5406
5407 float dd_pme_f_ratio(gmx_domdec_t *dd)
5408 {
5409     if (dd->comm->cycl_n[ddCyclPME] > 0)
5410     {
5411         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5412     }
5413     else
5414     {
5415         return -1.0;
5416     }
5417 }
5418
5419 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5420 {
5421     int flags,d;
5422     char buf[22];
5423
5424     flags = dd_load_flags(dd);
5425     if (flags)
5426     {
5427         fprintf(fplog,
5428                 "DD  load balancing is limited by minimum cell size in dimension");
5429         for(d=0; d<dd->ndim; d++)
5430         {
5431             if (flags & (1<<d))
5432             {
5433                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5434             }
5435         }
5436         fprintf(fplog,"\n");
5437     }
5438     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5439     if (dd->comm->bDynLoadBal)
5440     {
5441         fprintf(fplog,"  vol min/aver %5.3f%c",
5442                 dd_vol_min(dd),flags ? '!' : ' ');
5443     }
5444     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5445     if (dd->comm->cycl_n[ddCyclPME])
5446     {
5447         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5448     }
5449     fprintf(fplog,"\n\n");
5450 }
5451
5452 static void dd_print_load_verbose(gmx_domdec_t *dd)
5453 {
5454     if (dd->comm->bDynLoadBal)
5455     {
5456         fprintf(stderr,"vol %4.2f%c ",
5457                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5458     }
5459     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5460     if (dd->comm->cycl_n[ddCyclPME])
5461     {
5462         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5463     }
5464 }
5465
5466 #ifdef GMX_MPI
5467 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
5468 {
5469     MPI_Comm  c_row;
5470     int  dim, i, rank;
5471     ivec loc_c;
5472     gmx_domdec_root_t *root;
5473     gmx_bool bPartOfGroup = FALSE;
5474
5475     dim = dd->dim[dim_ind];
5476     copy_ivec(loc,loc_c);
5477     for(i=0; i<dd->nc[dim]; i++)
5478     {
5479         loc_c[dim] = i;
5480         rank = dd_index(dd->nc,loc_c);
5481         if (rank == dd->rank)
5482         {
5483             /* This process is part of the group */
5484             bPartOfGroup = TRUE;
5485         }
5486     }
5487     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
5488                    &c_row);
5489     if (bPartOfGroup)
5490     {
5491         dd->comm->mpi_comm_load[dim_ind] = c_row;
5492         if (dd->comm->eDLB != edlbNO)
5493         {
5494             if (dd->ci[dim] == dd->master_ci[dim])
5495             {
5496                 /* This is the root process of this row */
5497                 snew(dd->comm->root[dim_ind],1);
5498                 root = dd->comm->root[dim_ind];
5499                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5500                 snew(root->old_cell_f,dd->nc[dim]+1);
5501                 snew(root->bCellMin,dd->nc[dim]);
5502                 if (dim_ind > 0)
5503                 {
5504                     snew(root->cell_f_max0,dd->nc[dim]);
5505                     snew(root->cell_f_min1,dd->nc[dim]);
5506                     snew(root->bound_min,dd->nc[dim]);
5507                     snew(root->bound_max,dd->nc[dim]);
5508                 }
5509                 snew(root->buf_ncd,dd->nc[dim]);
5510             }
5511             else
5512             {
5513                 /* This is not a root process, we only need to receive cell_f */
5514                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5515             }
5516         }
5517         if (dd->ci[dim] == dd->master_ci[dim])
5518         {
5519             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5520         }
5521     }
5522 }
5523 #endif
5524
5525 static void make_load_communicators(gmx_domdec_t *dd)
5526 {
5527 #ifdef GMX_MPI
5528   int  dim0,dim1,i,j;
5529   ivec loc;
5530
5531   if (debug)
5532     fprintf(debug,"Making load communicators\n");
5533
5534   snew(dd->comm->load,dd->ndim);
5535   snew(dd->comm->mpi_comm_load,dd->ndim);
5536
5537   clear_ivec(loc);
5538   make_load_communicator(dd,0,loc);
5539   if (dd->ndim > 1) {
5540     dim0 = dd->dim[0];
5541     for(i=0; i<dd->nc[dim0]; i++) {
5542       loc[dim0] = i;
5543       make_load_communicator(dd,1,loc);
5544     }
5545   }
5546   if (dd->ndim > 2) {
5547     dim0 = dd->dim[0];
5548     for(i=0; i<dd->nc[dim0]; i++) {
5549       loc[dim0] = i;
5550       dim1 = dd->dim[1];
5551       for(j=0; j<dd->nc[dim1]; j++) {
5552           loc[dim1] = j;
5553           make_load_communicator(dd,2,loc);
5554       }
5555     }
5556   }
5557
5558   if (debug)
5559     fprintf(debug,"Finished making load communicators\n");
5560 #endif
5561 }
5562
5563 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5564 {
5565     gmx_bool bZYX;
5566     int  d,dim,i,j,m;
5567     ivec tmp,s;
5568     int  nzone,nzonep;
5569     ivec dd_zp[DD_MAXIZONE];
5570     gmx_domdec_zones_t *zones;
5571     gmx_domdec_ns_ranges_t *izone;
5572
5573     for(d=0; d<dd->ndim; d++)
5574     {
5575         dim = dd->dim[d];
5576         copy_ivec(dd->ci,tmp);
5577         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5578         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5579         copy_ivec(dd->ci,tmp);
5580         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5581         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5582         if (debug)
5583         {
5584             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5585                     dd->rank,dim,
5586                     dd->neighbor[d][0],
5587                     dd->neighbor[d][1]);
5588         }
5589     }
5590
5591     if (fplog)
5592     {
5593         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5594                 dd->ndim,
5595                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5596                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5597     }
5598     switch (dd->ndim)
5599     {
5600     case 3:
5601         nzone  = dd_z3n;
5602         nzonep = dd_zp3n;
5603         for(i=0; i<nzonep; i++)
5604         {
5605             copy_ivec(dd_zp3[i],dd_zp[i]);
5606         }
5607         break;
5608     case 2:
5609         nzone  = dd_z2n;
5610         nzonep = dd_zp2n;
5611         for(i=0; i<nzonep; i++)
5612         {
5613             copy_ivec(dd_zp2[i],dd_zp[i]);
5614         }
5615         break;
5616     case 1:
5617         nzone  = dd_z1n;
5618         nzonep = dd_zp1n;
5619         for(i=0; i<nzonep; i++)
5620         {
5621             copy_ivec(dd_zp1[i],dd_zp[i]);
5622         }
5623         break;
5624     default:
5625         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5626         nzone = 0;
5627         nzonep = 0;
5628     }
5629
5630     zones = &dd->comm->zones;
5631
5632     for(i=0; i<nzone; i++)
5633     {
5634         m = 0;
5635         clear_ivec(zones->shift[i]);
5636         for(d=0; d<dd->ndim; d++)
5637         {
5638             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5639         }
5640     }
5641
5642     zones->n = nzone;
5643     for(i=0; i<nzone; i++)
5644     {
5645         for(d=0; d<DIM; d++)
5646         {
5647             s[d] = dd->ci[d] - zones->shift[i][d];
5648             if (s[d] < 0)
5649             {
5650                 s[d] += dd->nc[d];
5651             }
5652             else if (s[d] >= dd->nc[d])
5653             {
5654                 s[d] -= dd->nc[d];
5655             }
5656         }
5657     }
5658     zones->nizone = nzonep;
5659     for(i=0; i<zones->nizone; i++)
5660     {
5661         if (dd_zp[i][0] != i)
5662         {
5663             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5664         }
5665         izone = &zones->izone[i];
5666         izone->j0 = dd_zp[i][1];
5667         izone->j1 = dd_zp[i][2];
5668         for(dim=0; dim<DIM; dim++)
5669         {
5670             if (dd->nc[dim] == 1)
5671             {
5672                 /* All shifts should be allowed */
5673                 izone->shift0[dim] = -1;
5674                 izone->shift1[dim] = 1;
5675             }
5676             else
5677             {
5678                 /*
5679                   izone->shift0[d] = 0;
5680                   izone->shift1[d] = 0;
5681                   for(j=izone->j0; j<izone->j1; j++) {
5682                   if (dd->shift[j][d] > dd->shift[i][d])
5683                   izone->shift0[d] = -1;
5684                   if (dd->shift[j][d] < dd->shift[i][d])
5685                   izone->shift1[d] = 1;
5686                   }
5687                 */
5688
5689                 int shift_diff;
5690
5691                 /* Assume the shift are not more than 1 cell */
5692                 izone->shift0[dim] = 1;
5693                 izone->shift1[dim] = -1;
5694                 for(j=izone->j0; j<izone->j1; j++)
5695                 {
5696                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5697                     if (shift_diff < izone->shift0[dim])
5698                     {
5699                         izone->shift0[dim] = shift_diff;
5700                     }
5701                     if (shift_diff > izone->shift1[dim])
5702                     {
5703                         izone->shift1[dim] = shift_diff;
5704                     }
5705                 }
5706             }
5707         }
5708     }
5709
5710     if (dd->comm->eDLB != edlbNO)
5711     {
5712         snew(dd->comm->root,dd->ndim);
5713     }
5714
5715     if (dd->comm->bRecordLoad)
5716     {
5717         make_load_communicators(dd);
5718     }
5719 }
5720
5721 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5722 {
5723     gmx_domdec_t *dd;
5724     gmx_domdec_comm_t *comm;
5725     int  i,rank,*buf;
5726     ivec periods;
5727 #ifdef GMX_MPI
5728     MPI_Comm comm_cart;
5729 #endif
5730
5731     dd = cr->dd;
5732     comm = dd->comm;
5733
5734 #ifdef GMX_MPI
5735     if (comm->bCartesianPP)
5736     {
5737         /* Set up cartesian communication for the particle-particle part */
5738         if (fplog)
5739         {
5740             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5741                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5742         }
5743
5744         for(i=0; i<DIM; i++)
5745         {
5746             periods[i] = TRUE;
5747         }
5748         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5749                         &comm_cart);
5750         /* We overwrite the old communicator with the new cartesian one */
5751         cr->mpi_comm_mygroup = comm_cart;
5752     }
5753
5754     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5755     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5756
5757     if (comm->bCartesianPP_PME)
5758     {
5759         /* Since we want to use the original cartesian setup for sim,
5760          * and not the one after split, we need to make an index.
5761          */
5762         snew(comm->ddindex2ddnodeid,dd->nnodes);
5763         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5764         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5765         /* Get the rank of the DD master,
5766          * above we made sure that the master node is a PP node.
5767          */
5768         if (MASTER(cr))
5769         {
5770             rank = dd->rank;
5771         }
5772         else
5773         {
5774             rank = 0;
5775         }
5776         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5777     }
5778     else if (comm->bCartesianPP)
5779     {
5780         if (cr->npmenodes == 0)
5781         {
5782             /* The PP communicator is also
5783              * the communicator for this simulation
5784              */
5785             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5786         }
5787         cr->nodeid = dd->rank;
5788
5789         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5790
5791         /* We need to make an index to go from the coordinates
5792          * to the nodeid of this simulation.
5793          */
5794         snew(comm->ddindex2simnodeid,dd->nnodes);
5795         snew(buf,dd->nnodes);
5796         if (cr->duty & DUTY_PP)
5797         {
5798             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5799         }
5800         /* Communicate the ddindex to simulation nodeid index */
5801         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5802                       cr->mpi_comm_mysim);
5803         sfree(buf);
5804
5805         /* Determine the master coordinates and rank.
5806          * The DD master should be the same node as the master of this sim.
5807          */
5808         for(i=0; i<dd->nnodes; i++)
5809         {
5810             if (comm->ddindex2simnodeid[i] == 0)
5811             {
5812                 ddindex2xyz(dd->nc,i,dd->master_ci);
5813                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5814             }
5815         }
5816         if (debug)
5817         {
5818             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5819         }
5820     }
5821     else
5822     {
5823         /* No Cartesian communicators */
5824         /* We use the rank in dd->comm->all as DD index */
5825         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5826         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5827         dd->masterrank = 0;
5828         clear_ivec(dd->master_ci);
5829     }
5830 #endif
5831
5832     if (fplog)
5833     {
5834         fprintf(fplog,
5835                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5836                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5837     }
5838     if (debug)
5839     {
5840         fprintf(debug,
5841                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5842                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5843     }
5844 }
5845
5846 static void receive_ddindex2simnodeid(t_commrec *cr)
5847 {
5848     gmx_domdec_t *dd;
5849
5850     gmx_domdec_comm_t *comm;
5851     int  *buf;
5852
5853     dd = cr->dd;
5854     comm = dd->comm;
5855
5856 #ifdef GMX_MPI
5857     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5858     {
5859         snew(comm->ddindex2simnodeid,dd->nnodes);
5860         snew(buf,dd->nnodes);
5861         if (cr->duty & DUTY_PP)
5862         {
5863             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5864         }
5865 #ifdef GMX_MPI
5866         /* Communicate the ddindex to simulation nodeid index */
5867         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5868                       cr->mpi_comm_mysim);
5869 #endif
5870         sfree(buf);
5871     }
5872 #endif
5873 }
5874
5875 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5876                                                      int ncg,int natoms)
5877 {
5878     gmx_domdec_master_t *ma;
5879     int i;
5880
5881     snew(ma,1);
5882
5883     snew(ma->ncg,dd->nnodes);
5884     snew(ma->index,dd->nnodes+1);
5885     snew(ma->cg,ncg);
5886     snew(ma->nat,dd->nnodes);
5887     snew(ma->ibuf,dd->nnodes*2);
5888     snew(ma->cell_x,DIM);
5889     for(i=0; i<DIM; i++)
5890     {
5891         snew(ma->cell_x[i],dd->nc[i]+1);
5892     }
5893
5894     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5895     {
5896         ma->vbuf = NULL;
5897     }
5898     else
5899     {
5900         snew(ma->vbuf,natoms);
5901     }
5902
5903     return ma;
5904 }
5905
5906 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5907                                int reorder)
5908 {
5909     gmx_domdec_t *dd;
5910     gmx_domdec_comm_t *comm;
5911     int  i,rank;
5912     gmx_bool bDiv[DIM];
5913     ivec periods;
5914 #ifdef GMX_MPI
5915     MPI_Comm comm_cart;
5916 #endif
5917
5918     dd = cr->dd;
5919     comm = dd->comm;
5920
5921     if (comm->bCartesianPP)
5922     {
5923         for(i=1; i<DIM; i++)
5924         {
5925             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5926         }
5927         if (bDiv[YY] || bDiv[ZZ])
5928         {
5929             comm->bCartesianPP_PME = TRUE;
5930             /* If we have 2D PME decomposition, which is always in x+y,
5931              * we stack the PME only nodes in z.
5932              * Otherwise we choose the direction that provides the thinnest slab
5933              * of PME only nodes as this will have the least effect
5934              * on the PP communication.
5935              * But for the PME communication the opposite might be better.
5936              */
5937             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5938                              !bDiv[YY] ||
5939                              dd->nc[YY] > dd->nc[ZZ]))
5940             {
5941                 comm->cartpmedim = ZZ;
5942             }
5943             else
5944             {
5945                 comm->cartpmedim = YY;
5946             }
5947             comm->ntot[comm->cartpmedim]
5948                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5949         }
5950         else if (fplog)
5951         {
5952             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5953             fprintf(fplog,
5954                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5955         }
5956     }
5957
5958 #ifdef GMX_MPI
5959     if (comm->bCartesianPP_PME)
5960     {
5961         if (fplog)
5962         {
5963             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5964         }
5965
5966         for(i=0; i<DIM; i++)
5967         {
5968             periods[i] = TRUE;
5969         }
5970         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5971                         &comm_cart);
5972
5973         MPI_Comm_rank(comm_cart,&rank);
5974         if (MASTERNODE(cr) && rank != 0)
5975         {
5976             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5977         }
5978
5979         /* With this assigment we loose the link to the original communicator
5980          * which will usually be MPI_COMM_WORLD, unless have multisim.
5981          */
5982         cr->mpi_comm_mysim = comm_cart;
5983         cr->sim_nodeid = rank;
5984
5985         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5986
5987         if (fplog)
5988         {
5989             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5990                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5991         }
5992
5993         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5994         {
5995             cr->duty = DUTY_PP;
5996         }
5997         if (cr->npmenodes == 0 ||
5998             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5999         {
6000             cr->duty = DUTY_PME;
6001         }
6002
6003         /* Split the sim communicator into PP and PME only nodes */
6004         MPI_Comm_split(cr->mpi_comm_mysim,
6005                        cr->duty,
6006                        dd_index(comm->ntot,dd->ci),
6007                        &cr->mpi_comm_mygroup);
6008     }
6009     else
6010     {
6011         switch (dd_node_order)
6012         {
6013         case ddnoPP_PME:
6014             if (fplog)
6015             {
6016                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
6017             }
6018             break;
6019         case ddnoINTERLEAVE:
6020             /* Interleave the PP-only and PME-only nodes,
6021              * as on clusters with dual-core machines this will double
6022              * the communication bandwidth of the PME processes
6023              * and thus speed up the PP <-> PME and inter PME communication.
6024              */
6025             if (fplog)
6026             {
6027                 fprintf(fplog,"Interleaving PP and PME nodes\n");
6028             }
6029             comm->pmenodes = dd_pmenodes(cr);
6030             break;
6031         case ddnoCARTESIAN:
6032             break;
6033         default:
6034             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
6035         }
6036
6037         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
6038         {
6039             cr->duty = DUTY_PME;
6040         }
6041         else
6042         {
6043             cr->duty = DUTY_PP;
6044         }
6045
6046         /* Split the sim communicator into PP and PME only nodes */
6047         MPI_Comm_split(cr->mpi_comm_mysim,
6048                        cr->duty,
6049                        cr->nodeid,
6050                        &cr->mpi_comm_mygroup);
6051         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
6052     }
6053 #endif
6054
6055     if (fplog)
6056     {
6057         fprintf(fplog,"This is a %s only node\n\n",
6058                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6059     }
6060 }
6061
6062 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
6063 {
6064     gmx_domdec_t *dd;
6065     gmx_domdec_comm_t *comm;
6066     int CartReorder;
6067
6068     dd = cr->dd;
6069     comm = dd->comm;
6070
6071     copy_ivec(dd->nc,comm->ntot);
6072
6073     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
6074     comm->bCartesianPP_PME = FALSE;
6075
6076     /* Reorder the nodes by default. This might change the MPI ranks.
6077      * Real reordering is only supported on very few architectures,
6078      * Blue Gene is one of them.
6079      */
6080     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6081
6082     if (cr->npmenodes > 0)
6083     {
6084         /* Split the communicator into a PP and PME part */
6085         split_communicator(fplog,cr,dd_node_order,CartReorder);
6086         if (comm->bCartesianPP_PME)
6087         {
6088             /* We (possibly) reordered the nodes in split_communicator,
6089              * so it is no longer required in make_pp_communicator.
6090              */
6091             CartReorder = FALSE;
6092         }
6093     }
6094     else
6095     {
6096         /* All nodes do PP and PME */
6097 #ifdef GMX_MPI
6098         /* We do not require separate communicators */
6099         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6100 #endif
6101     }
6102
6103     if (cr->duty & DUTY_PP)
6104     {
6105         /* Copy or make a new PP communicator */
6106         make_pp_communicator(fplog,cr,CartReorder);
6107     }
6108     else
6109     {
6110         receive_ddindex2simnodeid(cr);
6111     }
6112
6113     if (!(cr->duty & DUTY_PME))
6114     {
6115         /* Set up the commnuication to our PME node */
6116         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
6117         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6118         if (debug)
6119         {
6120             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
6121                     dd->pme_nodeid,dd->pme_receive_vir_ener);
6122         }
6123     }
6124     else
6125     {
6126         dd->pme_nodeid = -1;
6127     }
6128
6129     if (DDMASTER(dd))
6130     {
6131         dd->ma = init_gmx_domdec_master_t(dd,
6132                                           comm->cgs_gl.nr,
6133                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6134     }
6135 }
6136
6137 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
6138 {
6139     real *slb_frac,tot;
6140     int  i,n;
6141     double dbl;
6142
6143     slb_frac = NULL;
6144     if (nc > 1 && size_string != NULL)
6145     {
6146         if (fplog)
6147         {
6148             fprintf(fplog,"Using static load balancing for the %s direction\n",
6149                     dir);
6150         }
6151         snew(slb_frac,nc);
6152         tot = 0;
6153         for (i=0; i<nc; i++)
6154         {
6155             dbl = 0;
6156             sscanf(size_string,"%lf%n",&dbl,&n);
6157             if (dbl == 0)
6158             {
6159                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
6160             }
6161             slb_frac[i] = dbl;
6162             size_string += n;
6163             tot += slb_frac[i];
6164         }
6165         /* Normalize */
6166         if (fplog)
6167         {
6168             fprintf(fplog,"Relative cell sizes:");
6169         }
6170         for (i=0; i<nc; i++)
6171         {
6172             slb_frac[i] /= tot;
6173             if (fplog)
6174             {
6175                 fprintf(fplog," %5.3f",slb_frac[i]);
6176             }
6177         }
6178         if (fplog)
6179         {
6180             fprintf(fplog,"\n");
6181         }
6182     }
6183
6184     return slb_frac;
6185 }
6186
6187 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6188 {
6189     int n,nmol,ftype;
6190     gmx_mtop_ilistloop_t iloop;
6191     t_ilist *il;
6192
6193     n = 0;
6194     iloop = gmx_mtop_ilistloop_init(mtop);
6195     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
6196     {
6197         for(ftype=0; ftype<F_NRE; ftype++)
6198         {
6199             if ((interaction_function[ftype].flags & IF_BOND) &&
6200                 NRAL(ftype) >  2)
6201             {
6202                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6203             }
6204         }
6205   }
6206
6207   return n;
6208 }
6209
6210 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
6211 {
6212     char *val;
6213     int  nst;
6214
6215     nst = def;
6216     val = getenv(env_var);
6217     if (val)
6218     {
6219         if (sscanf(val,"%d",&nst) <= 0)
6220         {
6221             nst = 1;
6222         }
6223         if (fplog)
6224         {
6225             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
6226                     env_var,val,nst);
6227         }
6228     }
6229
6230     return nst;
6231 }
6232
6233 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
6234 {
6235     if (MASTER(cr))
6236     {
6237         fprintf(stderr,"\n%s\n",warn_string);
6238     }
6239     if (fplog)
6240     {
6241         fprintf(fplog,"\n%s\n",warn_string);
6242     }
6243 }
6244
6245 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
6246                                   t_inputrec *ir,FILE *fplog)
6247 {
6248     if (ir->ePBC == epbcSCREW &&
6249         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6250     {
6251         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6252     }
6253
6254     if (ir->ns_type == ensSIMPLE)
6255     {
6256         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6257     }
6258
6259     if (ir->nstlist == 0)
6260     {
6261         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6262     }
6263
6264     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6265     {
6266         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6267     }
6268 }
6269
6270 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6271 {
6272     int  di,d;
6273     real r;
6274
6275     r = ddbox->box_size[XX];
6276     for(di=0; di<dd->ndim; di++)
6277     {
6278         d = dd->dim[di];
6279         /* Check using the initial average cell size */
6280         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6281     }
6282
6283     return r;
6284 }
6285
6286 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6287                              const char *dlb_opt,gmx_bool bRecordLoad,
6288                              unsigned long Flags,t_inputrec *ir)
6289 {
6290     gmx_domdec_t *dd;
6291     int  eDLB=-1;
6292     char buf[STRLEN];
6293
6294     switch (dlb_opt[0])
6295     {
6296     case 'a': eDLB = edlbAUTO; break;
6297     case 'n': eDLB = edlbNO;   break;
6298     case 'y': eDLB = edlbYES;  break;
6299     default: gmx_incons("Unknown dlb_opt");
6300     }
6301
6302     if (Flags & MD_RERUN)
6303     {
6304         return edlbNO;
6305     }
6306
6307     if (!EI_DYNAMICS(ir->eI))
6308     {
6309         if (eDLB == edlbYES)
6310         {
6311             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6312             dd_warning(cr,fplog,buf);
6313         }
6314
6315         return edlbNO;
6316     }
6317
6318     if (!bRecordLoad)
6319     {
6320         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6321
6322         return edlbNO;
6323     }
6324
6325     if (Flags & MD_REPRODUCIBLE)
6326     {
6327         switch (eDLB)
6328         {
6329                         case edlbNO:
6330                                 break;
6331                         case edlbAUTO:
6332                                 dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6333                                 eDLB = edlbNO;
6334                                 break;
6335                         case edlbYES:
6336                                 dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6337                                 break;
6338                         default:
6339                                 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6340                                 break;
6341         }
6342     }
6343
6344     return eDLB;
6345 }
6346
6347 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6348 {
6349     int dim;
6350
6351     dd->ndim = 0;
6352     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6353     {
6354         /* Decomposition order z,y,x */
6355         if (fplog)
6356         {
6357             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6358         }
6359         for(dim=DIM-1; dim>=0; dim--)
6360         {
6361             if (dd->nc[dim] > 1)
6362             {
6363                 dd->dim[dd->ndim++] = dim;
6364             }
6365         }
6366     }
6367     else
6368     {
6369         /* Decomposition order x,y,z */
6370         for(dim=0; dim<DIM; dim++)
6371         {
6372             if (dd->nc[dim] > 1)
6373             {
6374                 dd->dim[dd->ndim++] = dim;
6375             }
6376         }
6377     }
6378 }
6379
6380 static gmx_domdec_comm_t *init_dd_comm()
6381 {
6382     gmx_domdec_comm_t *comm;
6383     int  i;
6384
6385     snew(comm,1);
6386     snew(comm->cggl_flag,DIM*2);
6387     snew(comm->cgcm_state,DIM*2);
6388     for(i=0; i<DIM*2; i++)
6389     {
6390         comm->cggl_flag_nalloc[i]  = 0;
6391         comm->cgcm_state_nalloc[i] = 0;
6392     }
6393
6394     comm->nalloc_int = 0;
6395     comm->buf_int    = NULL;
6396
6397     vec_rvec_init(&comm->vbuf);
6398
6399     comm->n_load_have    = 0;
6400     comm->n_load_collect = 0;
6401
6402     for(i=0; i<ddnatNR-ddnatZONE; i++)
6403     {
6404         comm->sum_nat[i] = 0;
6405     }
6406     comm->ndecomp = 0;
6407     comm->nload   = 0;
6408     comm->load_step = 0;
6409     comm->load_sum  = 0;
6410     comm->load_max  = 0;
6411     clear_ivec(comm->load_lim);
6412     comm->load_mdf  = 0;
6413     comm->load_pme  = 0;
6414
6415     return comm;
6416 }
6417
6418 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6419                                         unsigned long Flags,
6420                                         ivec nc,
6421                                         real comm_distance_min,real rconstr,
6422                                         const char *dlb_opt,real dlb_scale,
6423                                         const char *sizex,const char *sizey,const char *sizez,
6424                                         gmx_mtop_t *mtop,t_inputrec *ir,
6425                                         matrix box,rvec *x,
6426                                         gmx_ddbox_t *ddbox,
6427                                         int *npme_x,int *npme_y)
6428 {
6429     gmx_domdec_t *dd;
6430     gmx_domdec_comm_t *comm;
6431     int  recload;
6432     int  d,i,j;
6433     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6434     gmx_bool bC;
6435     char buf[STRLEN];
6436
6437     if (fplog)
6438     {
6439         fprintf(fplog,
6440                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6441     }
6442
6443     snew(dd,1);
6444
6445     dd->comm = init_dd_comm();
6446     comm = dd->comm;
6447     snew(comm->cggl_flag,DIM*2);
6448     snew(comm->cgcm_state,DIM*2);
6449
6450     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6451     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6452
6453     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6454     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6455     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6456     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6457     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6458     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6459     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6460     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6461
6462     dd->pme_recv_f_alloc = 0;
6463     dd->pme_recv_f_buf = NULL;
6464
6465     if (dd->bSendRecv2 && fplog)
6466     {
6467         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6468     }
6469     if (comm->eFlop)
6470     {
6471         if (fplog)
6472         {
6473             fprintf(fplog,"Will load balance based on FLOP count\n");
6474         }
6475         if (comm->eFlop > 1)
6476         {
6477             srand(1+cr->nodeid);
6478         }
6479         comm->bRecordLoad = TRUE;
6480     }
6481     else
6482     {
6483         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6484
6485     }
6486
6487     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6488
6489     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6490     if (fplog)
6491     {
6492         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6493     }
6494     dd->bGridJump = comm->bDynLoadBal;
6495     comm->bPMELoadBalDLBLimits = FALSE;
6496
6497     if (comm->nstSortCG)
6498     {
6499         if (fplog)
6500         {
6501             if (comm->nstSortCG == 1)
6502             {
6503                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6504             }
6505             else
6506             {
6507                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6508                         comm->nstSortCG);
6509             }
6510         }
6511         snew(comm->sort,1);
6512     }
6513     else
6514     {
6515         if (fplog)
6516         {
6517             fprintf(fplog,"Will not sort the charge groups\n");
6518         }
6519     }
6520
6521     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6522
6523     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6524     if (comm->bInterCGBondeds)
6525     {
6526         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6527     }
6528     else
6529     {
6530         comm->bInterCGMultiBody = FALSE;
6531     }
6532
6533     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6534     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6535
6536     if (ir->rlistlong == 0)
6537     {
6538         /* Set the cut-off to some very large value,
6539          * so we don't need if statements everywhere in the code.
6540          * We use sqrt, since the cut-off is squared in some places.
6541          */
6542         comm->cutoff   = GMX_CUTOFF_INF;
6543     }
6544     else
6545     {
6546         comm->cutoff   = ir->rlistlong;
6547     }
6548     comm->cutoff_mbody = 0;
6549
6550     comm->cellsize_limit = 0;
6551     comm->bBondComm = FALSE;
6552
6553     if (comm->bInterCGBondeds)
6554     {
6555         if (comm_distance_min > 0)
6556         {
6557             comm->cutoff_mbody = comm_distance_min;
6558             if (Flags & MD_DDBONDCOMM)
6559             {
6560                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6561             }
6562             else
6563             {
6564                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6565             }
6566             r_bonded_limit = comm->cutoff_mbody;
6567         }
6568         else if (ir->bPeriodicMols)
6569         {
6570             /* Can not easily determine the required cut-off */
6571             dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6572             comm->cutoff_mbody = comm->cutoff/2;
6573             r_bonded_limit = comm->cutoff_mbody;
6574         }
6575         else
6576         {
6577             if (MASTER(cr))
6578             {
6579                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6580                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6581             }
6582             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6583             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6584
6585             /* We use an initial margin of 10% for the minimum cell size,
6586              * except when we are just below the non-bonded cut-off.
6587              */
6588             if (Flags & MD_DDBONDCOMM)
6589             {
6590                 if (max(r_2b,r_mb) > comm->cutoff)
6591                 {
6592                     r_bonded       = max(r_2b,r_mb);
6593                     r_bonded_limit = 1.1*r_bonded;
6594                     comm->bBondComm = TRUE;
6595                 }
6596                 else
6597                 {
6598                     r_bonded       = r_mb;
6599                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6600                 }
6601                 /* We determine cutoff_mbody later */
6602             }
6603             else
6604             {
6605                 /* No special bonded communication,
6606                  * simply increase the DD cut-off.
6607                  */
6608                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6609                 comm->cutoff_mbody = r_bonded_limit;
6610                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6611             }
6612         }
6613         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6614         if (fplog)
6615         {
6616             fprintf(fplog,
6617                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6618                     comm->cellsize_limit);
6619         }
6620     }
6621
6622     if (dd->bInterCGcons && rconstr <= 0)
6623     {
6624         /* There is a cell size limit due to the constraints (P-LINCS) */
6625         rconstr = constr_r_max(fplog,mtop,ir);
6626         if (fplog)
6627         {
6628             fprintf(fplog,
6629                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6630                     rconstr);
6631             if (rconstr > comm->cellsize_limit)
6632             {
6633                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6634             }
6635         }
6636     }
6637     else if (rconstr > 0 && fplog)
6638     {
6639         /* Here we do not check for dd->bInterCGcons,
6640          * because one can also set a cell size limit for virtual sites only
6641          * and at this point we don't know yet if there are intercg v-sites.
6642          */
6643         fprintf(fplog,
6644                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6645                 rconstr);
6646     }
6647     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6648
6649     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6650
6651     if (nc[XX] > 0)
6652     {
6653         copy_ivec(nc,dd->nc);
6654         set_dd_dim(fplog,dd);
6655         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6656
6657         if (cr->npmenodes == -1)
6658         {
6659             cr->npmenodes = 0;
6660         }
6661         acs = average_cellsize_min(dd,ddbox);
6662         if (acs < comm->cellsize_limit)
6663         {
6664             if (fplog)
6665             {
6666                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6667             }
6668             gmx_fatal_collective(FARGS,cr,NULL,
6669                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6670                                  acs,comm->cellsize_limit);
6671         }
6672     }
6673     else
6674     {
6675         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6676
6677         /* We need to choose the optimal DD grid and possibly PME nodes */
6678         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6679                                comm->eDLB!=edlbNO,dlb_scale,
6680                                comm->cellsize_limit,comm->cutoff,
6681                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6682
6683         if (dd->nc[XX] == 0)
6684         {
6685             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6686             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6687                     !bC ? "-rdd" : "-rcon",
6688                     comm->eDLB!=edlbNO ? " or -dds" : "",
6689                     bC ? " or your LINCS settings" : "");
6690
6691             gmx_fatal_collective(FARGS,cr,NULL,
6692                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6693                                  "%s\n"
6694                                  "Look in the log file for details on the domain decomposition",
6695                                  cr->nnodes-cr->npmenodes,limit,buf);
6696         }
6697         set_dd_dim(fplog,dd);
6698     }
6699
6700     if (fplog)
6701     {
6702         fprintf(fplog,
6703                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6704                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6705     }
6706
6707     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6708     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6709     {
6710         gmx_fatal_collective(FARGS,cr,NULL,
6711                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6712                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6713     }
6714     if (cr->npmenodes > dd->nnodes)
6715     {
6716         gmx_fatal_collective(FARGS,cr,NULL,
6717                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6718     }
6719     if (cr->npmenodes > 0)
6720     {
6721         comm->npmenodes = cr->npmenodes;
6722     }
6723     else
6724     {
6725         comm->npmenodes = dd->nnodes;
6726     }
6727
6728     if (EEL_PME(ir->coulombtype))
6729     {
6730         /* The following choices should match those
6731          * in comm_cost_est in domdec_setup.c.
6732          * Note that here the checks have to take into account
6733          * that the decomposition might occur in a different order than xyz
6734          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6735          * in which case they will not match those in comm_cost_est,
6736          * but since that is mainly for testing purposes that's fine.
6737          */
6738         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6739             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6740             getenv("GMX_PMEONEDD") == NULL)
6741         {
6742             comm->npmedecompdim = 2;
6743             comm->npmenodes_x   = dd->nc[XX];
6744             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6745         }
6746         else
6747         {
6748             /* In case nc is 1 in both x and y we could still choose to
6749              * decompose pme in y instead of x, but we use x for simplicity.
6750              */
6751             comm->npmedecompdim = 1;
6752             if (dd->dim[0] == YY)
6753             {
6754                 comm->npmenodes_x = 1;
6755                 comm->npmenodes_y = comm->npmenodes;
6756             }
6757             else
6758             {
6759                 comm->npmenodes_x = comm->npmenodes;
6760                 comm->npmenodes_y = 1;
6761             }
6762         }
6763         if (fplog)
6764         {
6765             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6766                     comm->npmenodes_x,comm->npmenodes_y,1);
6767         }
6768     }
6769     else
6770     {
6771         comm->npmedecompdim = 0;
6772         comm->npmenodes_x   = 0;
6773         comm->npmenodes_y   = 0;
6774     }
6775
6776     /* Technically we don't need both of these,
6777      * but it simplifies code not having to recalculate it.
6778      */
6779     *npme_x = comm->npmenodes_x;
6780     *npme_y = comm->npmenodes_y;
6781
6782     snew(comm->slb_frac,DIM);
6783     if (comm->eDLB == edlbNO)
6784     {
6785         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6786         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6787         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6788     }
6789
6790     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6791     {
6792         if (comm->bBondComm || comm->eDLB != edlbNO)
6793         {
6794             /* Set the bonded communication distance to halfway
6795              * the minimum and the maximum,
6796              * since the extra communication cost is nearly zero.
6797              */
6798             acs = average_cellsize_min(dd,ddbox);
6799             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6800             if (comm->eDLB != edlbNO)
6801             {
6802                 /* Check if this does not limit the scaling */
6803                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6804             }
6805             if (!comm->bBondComm)
6806             {
6807                 /* Without bBondComm do not go beyond the n.b. cut-off */
6808                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6809                 if (comm->cellsize_limit >= comm->cutoff)
6810                 {
6811                     /* We don't loose a lot of efficieny
6812                      * when increasing it to the n.b. cut-off.
6813                      * It can even be slightly faster, because we need
6814                      * less checks for the communication setup.
6815                      */
6816                     comm->cutoff_mbody = comm->cutoff;
6817                 }
6818             }
6819             /* Check if we did not end up below our original limit */
6820             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6821
6822             if (comm->cutoff_mbody > comm->cellsize_limit)
6823             {
6824                 comm->cellsize_limit = comm->cutoff_mbody;
6825             }
6826         }
6827         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6828     }
6829
6830     if (debug)
6831     {
6832         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6833                 "cellsize limit %f\n",
6834                 comm->bBondComm,comm->cellsize_limit);
6835     }
6836
6837     if (MASTER(cr))
6838     {
6839         check_dd_restrictions(cr,dd,ir,fplog);
6840     }
6841
6842     comm->partition_step = INT_MIN;
6843     dd->ddp_count = 0;
6844
6845     clear_dd_cycle_counts(dd);
6846
6847     return dd;
6848 }
6849
6850 static void set_dlb_limits(gmx_domdec_t *dd)
6851
6852 {
6853     int d;
6854
6855     for(d=0; d<dd->ndim; d++)
6856     {
6857         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6858         dd->comm->cellsize_min[dd->dim[d]] =
6859             dd->comm->cellsize_min_dlb[dd->dim[d]];
6860     }
6861 }
6862
6863
6864 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6865 {
6866     gmx_domdec_t *dd;
6867     gmx_domdec_comm_t *comm;
6868     real cellsize_min;
6869     int  d,nc,i;
6870     char buf[STRLEN];
6871
6872     dd = cr->dd;
6873     comm = dd->comm;
6874
6875     if (fplog)
6876     {
6877         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6878     }
6879
6880     cellsize_min = comm->cellsize_min[dd->dim[0]];
6881     for(d=1; d<dd->ndim; d++)
6882     {
6883         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6884     }
6885
6886     if (cellsize_min < comm->cellsize_limit*1.05)
6887     {
6888         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6889
6890         /* Change DLB from "auto" to "no". */
6891         comm->eDLB = edlbNO;
6892
6893         return;
6894     }
6895
6896     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6897     comm->bDynLoadBal = TRUE;
6898     dd->bGridJump = TRUE;
6899
6900     set_dlb_limits(dd);
6901
6902     /* We can set the required cell size info here,
6903      * so we do not need to communicate this.
6904      * The grid is completely uniform.
6905      */
6906     for(d=0; d<dd->ndim; d++)
6907     {
6908         if (comm->root[d])
6909         {
6910             comm->load[d].sum_m = comm->load[d].sum;
6911
6912             nc = dd->nc[dd->dim[d]];
6913             for(i=0; i<nc; i++)
6914             {
6915                 comm->root[d]->cell_f[i]    = i/(real)nc;
6916                 if (d > 0)
6917                 {
6918                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6919                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6920                 }
6921             }
6922             comm->root[d]->cell_f[nc] = 1.0;
6923         }
6924     }
6925 }
6926
6927 static char *init_bLocalCG(gmx_mtop_t *mtop)
6928 {
6929     int  ncg,cg;
6930     char *bLocalCG;
6931
6932     ncg = ncg_mtop(mtop);
6933     snew(bLocalCG,ncg);
6934     for(cg=0; cg<ncg; cg++)
6935     {
6936         bLocalCG[cg] = FALSE;
6937     }
6938
6939     return bLocalCG;
6940 }
6941
6942 void dd_init_bondeds(FILE *fplog,
6943                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6944                      gmx_vsite_t *vsite,gmx_constr_t constr,
6945                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6946 {
6947     gmx_domdec_comm_t *comm;
6948     gmx_bool bBondComm;
6949     int  d;
6950
6951     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6952
6953     comm = dd->comm;
6954
6955     if (comm->bBondComm)
6956     {
6957         /* Communicate atoms beyond the cut-off for bonded interactions */
6958         comm = dd->comm;
6959
6960         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6961
6962         comm->bLocalCG = init_bLocalCG(mtop);
6963     }
6964     else
6965     {
6966         /* Only communicate atoms based on cut-off */
6967         comm->cglink   = NULL;
6968         comm->bLocalCG = NULL;
6969     }
6970 }
6971
6972 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6973                               t_inputrec *ir,
6974                               gmx_bool bDynLoadBal,real dlb_scale,
6975                               gmx_ddbox_t *ddbox)
6976 {
6977     gmx_domdec_comm_t *comm;
6978     int  d;
6979     ivec np;
6980     real limit,shrink;
6981     char buf[64];
6982
6983     if (fplog == NULL)
6984     {
6985         return;
6986     }
6987
6988     comm = dd->comm;
6989
6990     if (bDynLoadBal)
6991     {
6992         fprintf(fplog,"The maximum number of communication pulses is:");
6993         for(d=0; d<dd->ndim; d++)
6994         {
6995             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6996         }
6997         fprintf(fplog,"\n");
6998         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6999         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
7000         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
7001         for(d=0; d<DIM; d++)
7002         {
7003             if (dd->nc[d] > 1)
7004             {
7005                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7006                 {
7007                     shrink = 0;
7008                 }
7009                 else
7010                 {
7011                     shrink =
7012                         comm->cellsize_min_dlb[d]/
7013                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7014                 }
7015                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
7016             }
7017         }
7018         fprintf(fplog,"\n");
7019     }
7020     else
7021     {
7022         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
7023         fprintf(fplog,"The initial number of communication pulses is:");
7024         for(d=0; d<dd->ndim; d++)
7025         {
7026             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
7027         }
7028         fprintf(fplog,"\n");
7029         fprintf(fplog,"The initial domain decomposition cell size is:");
7030         for(d=0; d<DIM; d++) {
7031             if (dd->nc[d] > 1)
7032             {
7033                 fprintf(fplog," %c %.2f nm",
7034                         dim2char(d),dd->comm->cellsize_min[d]);
7035             }
7036         }
7037         fprintf(fplog,"\n\n");
7038     }
7039
7040     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7041     {
7042         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
7043         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7044                 "non-bonded interactions","",comm->cutoff);
7045
7046         if (bDynLoadBal)
7047         {
7048             limit = dd->comm->cellsize_limit;
7049         }
7050         else
7051         {
7052             if (dynamic_dd_box(ddbox,ir))
7053             {
7054                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
7055             }
7056             limit = dd->comm->cellsize_min[XX];
7057             for(d=1; d<DIM; d++)
7058             {
7059                 limit = min(limit,dd->comm->cellsize_min[d]);
7060             }
7061         }
7062
7063         if (comm->bInterCGBondeds)
7064         {
7065             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7066                     "two-body bonded interactions","(-rdd)",
7067                     max(comm->cutoff,comm->cutoff_mbody));
7068             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7069                     "multi-body bonded interactions","(-rdd)",
7070                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
7071         }
7072         if (dd->vsite_comm)
7073         {
7074             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7075                     "virtual site constructions","(-rcon)",limit);
7076         }
7077         if (dd->constraint_comm)
7078         {
7079             sprintf(buf,"atoms separated by up to %d constraints",
7080                     1+ir->nProjOrder);
7081             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7082                     buf,"(-rcon)",limit);
7083         }
7084         fprintf(fplog,"\n");
7085     }
7086
7087     fflush(fplog);
7088 }
7089
7090 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7091                                 real dlb_scale,
7092                                 const t_inputrec *ir,
7093                                 const gmx_ddbox_t *ddbox)
7094 {
7095     gmx_domdec_comm_t *comm;
7096     int  d,dim,npulse,npulse_d_max,npulse_d;
7097     gmx_bool bNoCutOff;
7098
7099     comm = dd->comm;
7100
7101     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7102
7103     /* Determine the maximum number of comm. pulses in one dimension */
7104
7105     comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7106
7107     /* Determine the maximum required number of grid pulses */
7108     if (comm->cellsize_limit >= comm->cutoff)
7109     {
7110         /* Only a single pulse is required */
7111         npulse = 1;
7112     }
7113     else if (!bNoCutOff && comm->cellsize_limit > 0)
7114     {
7115         /* We round down slightly here to avoid overhead due to the latency
7116          * of extra communication calls when the cut-off
7117          * would be only slightly longer than the cell size.
7118          * Later cellsize_limit is redetermined,
7119          * so we can not miss interactions due to this rounding.
7120          */
7121         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7122     }
7123     else
7124     {
7125         /* There is no cell size limit */
7126         npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
7127     }
7128
7129     if (!bNoCutOff && npulse > 1)
7130     {
7131         /* See if we can do with less pulses, based on dlb_scale */
7132         npulse_d_max = 0;
7133         for(d=0; d<dd->ndim; d++)
7134         {
7135             dim = dd->dim[d];
7136             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7137                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7138             npulse_d_max = max(npulse_d_max,npulse_d);
7139         }
7140         npulse = min(npulse,npulse_d_max);
7141     }
7142
7143     /* This env var can override npulse */
7144     d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
7145     if (d > 0)
7146     {
7147         npulse = d;
7148     }
7149
7150     comm->maxpulse = 1;
7151     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7152     for(d=0; d<dd->ndim; d++)
7153     {
7154         comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
7155         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7156         snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
7157         comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
7158         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7159         {
7160             comm->bVacDLBNoLimit = FALSE;
7161         }
7162     }
7163
7164     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7165     if (!comm->bVacDLBNoLimit)
7166     {
7167         comm->cellsize_limit = max(comm->cellsize_limit,
7168                                    comm->cutoff/comm->maxpulse);
7169     }
7170     comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7171     /* Set the minimum cell size for each DD dimension */
7172     for(d=0; d<dd->ndim; d++)
7173     {
7174         if (comm->bVacDLBNoLimit ||
7175             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7176         {
7177             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7178         }
7179         else
7180         {
7181             comm->cellsize_min_dlb[dd->dim[d]] =
7182                 comm->cutoff/comm->cd[d].np_dlb;
7183         }
7184     }
7185     if (comm->cutoff_mbody <= 0)
7186     {
7187         comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
7188     }
7189     if (comm->bDynLoadBal)
7190     {
7191         set_dlb_limits(dd);
7192     }
7193 }
7194
7195 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
7196 {
7197     /* If each molecule is a single charge group
7198      * or we use domain decomposition for each periodic dimension,
7199      * we do not need to take pbc into account for the bonded interactions.
7200      */
7201     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7202             !(dd->nc[XX]>1 &&
7203               dd->nc[YY]>1 &&
7204               (dd->nc[ZZ]>1 || ePBC==epbcXY)));
7205 }
7206
7207 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
7208                        t_inputrec *ir,t_forcerec *fr,
7209                        gmx_ddbox_t *ddbox)
7210 {
7211     gmx_domdec_comm_t *comm;
7212     int  natoms_tot;
7213     real vol_frac;
7214
7215     comm = dd->comm;
7216
7217     /* Initialize the thread data.
7218      * This can not be done in init_domain_decomposition,
7219      * as the numbers of threads is determined later.
7220      */
7221     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7222     if (comm->nth > 1)
7223     {
7224         snew(comm->dth,comm->nth);
7225     }
7226
7227     if (EEL_PME(ir->coulombtype))
7228     {
7229         init_ddpme(dd,&comm->ddpme[0],0);
7230         if (comm->npmedecompdim >= 2)
7231         {
7232             init_ddpme(dd,&comm->ddpme[1],1);
7233         }
7234     }
7235     else
7236     {
7237         comm->npmenodes = 0;
7238         if (dd->pme_nodeid >= 0)
7239         {
7240             gmx_fatal_collective(FARGS,NULL,dd,
7241                                  "Can not have separate PME nodes without PME electrostatics");
7242         }
7243     }
7244
7245     if (debug)
7246     {
7247         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
7248     }
7249     if (comm->eDLB != edlbNO)
7250     {
7251         set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
7252     }
7253
7254     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
7255     if (comm->eDLB == edlbAUTO)
7256     {
7257         if (fplog)
7258         {
7259             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
7260         }
7261         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
7262     }
7263
7264     if (ir->ePBC == epbcNONE)
7265     {
7266         vol_frac = 1 - 1/(double)dd->nnodes;
7267     }
7268     else
7269     {
7270         vol_frac =
7271             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7272     }
7273     if (debug)
7274     {
7275         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7276     }
7277     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7278
7279     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7280 }
7281
7282 static gmx_bool test_dd_cutoff(t_commrec *cr,
7283                                t_state *state,t_inputrec *ir,
7284                                real cutoff_req)
7285 {
7286     gmx_domdec_t *dd;
7287     gmx_ddbox_t ddbox;
7288     int d,dim,np;
7289     real inv_cell_size;
7290     int LocallyLimited;
7291
7292     dd = cr->dd;
7293
7294     set_ddbox(dd,FALSE,cr,ir,state->box,
7295               TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
7296
7297     LocallyLimited = 0;
7298
7299     for(d=0; d<dd->ndim; d++)
7300     {
7301         dim = dd->dim[d];
7302
7303         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7304         if (dynamic_dd_box(&ddbox,ir))
7305         {
7306             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7307         }
7308
7309         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7310
7311         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7312             dd->comm->cd[d].np_dlb > 0)
7313         {
7314             if (np > dd->comm->cd[d].np_dlb)
7315             {
7316                 return FALSE;
7317             }
7318
7319             /* If a current local cell size is smaller than the requested
7320              * cut-off, we could still fix it, but this gets very complicated.
7321              * Without fixing here, we might actually need more checks.
7322              */
7323             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7324             {
7325                 LocallyLimited = 1;
7326             }
7327         }
7328     }
7329
7330     if (dd->comm->eDLB != edlbNO)
7331     {
7332         /* If DLB is not active yet, we don't need to check the grid jumps.
7333          * Actually we shouldn't, because then the grid jump data is not set.
7334          */
7335         if (dd->comm->bDynLoadBal &&
7336             check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
7337         {
7338             LocallyLimited = 1;
7339         }
7340
7341         gmx_sumi(1,&LocallyLimited,cr);
7342
7343         if (LocallyLimited > 0)
7344         {
7345             return FALSE;
7346         }
7347     }
7348
7349     return TRUE;
7350 }
7351
7352 gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
7353                           real cutoff_req)
7354 {
7355     gmx_bool bCutoffAllowed;
7356
7357     bCutoffAllowed = test_dd_cutoff(cr,state,ir,cutoff_req);
7358
7359     if (bCutoffAllowed)
7360     {
7361         cr->dd->comm->cutoff = cutoff_req;
7362     }
7363
7364     return bCutoffAllowed;
7365 }
7366
7367 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7368 {
7369     gmx_domdec_comm_t *comm;
7370
7371     comm = cr->dd->comm;
7372
7373     /* Turn on the DLB limiting (might have been on already) */
7374     comm->bPMELoadBalDLBLimits = TRUE;
7375
7376     /* Change the cut-off limit */
7377     comm->PMELoadBal_max_cutoff = comm->cutoff;
7378 }
7379
7380 static void merge_cg_buffers(int ncell,
7381                              gmx_domdec_comm_dim_t *cd, int pulse,
7382                              int  *ncg_cell,
7383                              int  *index_gl, int  *recv_i,
7384                              rvec *cg_cm,    rvec *recv_vr,
7385                              int *cgindex,
7386                              cginfo_mb_t *cginfo_mb,int *cginfo)
7387 {
7388     gmx_domdec_ind_t *ind,*ind_p;
7389     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7390     int shift,shift_at;
7391
7392     ind = &cd->ind[pulse];
7393
7394     /* First correct the already stored data */
7395     shift = ind->nrecv[ncell];
7396     for(cell=ncell-1; cell>=0; cell--)
7397     {
7398         shift -= ind->nrecv[cell];
7399         if (shift > 0)
7400         {
7401             /* Move the cg's present from previous grid pulses */
7402             cg0 = ncg_cell[ncell+cell];
7403             cg1 = ncg_cell[ncell+cell+1];
7404             cgindex[cg1+shift] = cgindex[cg1];
7405             for(cg=cg1-1; cg>=cg0; cg--)
7406             {
7407                 index_gl[cg+shift] = index_gl[cg];
7408                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7409                 cgindex[cg+shift] = cgindex[cg];
7410                 cginfo[cg+shift] = cginfo[cg];
7411             }
7412             /* Correct the already stored send indices for the shift */
7413             for(p=1; p<=pulse; p++)
7414             {
7415                 ind_p = &cd->ind[p];
7416                 cg0 = 0;
7417                 for(c=0; c<cell; c++)
7418                 {
7419                     cg0 += ind_p->nsend[c];
7420                 }
7421                 cg1 = cg0 + ind_p->nsend[cell];
7422                 for(cg=cg0; cg<cg1; cg++)
7423                 {
7424                     ind_p->index[cg] += shift;
7425                 }
7426             }
7427         }
7428     }
7429
7430     /* Merge in the communicated buffers */
7431     shift = 0;
7432     shift_at = 0;
7433     cg0 = 0;
7434     for(cell=0; cell<ncell; cell++)
7435     {
7436         cg1 = ncg_cell[ncell+cell+1] + shift;
7437         if (shift_at > 0)
7438         {
7439             /* Correct the old cg indices */
7440             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7441             {
7442                 cgindex[cg+1] += shift_at;
7443             }
7444         }
7445         for(cg=0; cg<ind->nrecv[cell]; cg++)
7446         {
7447             /* Copy this charge group from the buffer */
7448             index_gl[cg1] = recv_i[cg0];
7449             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7450             /* Add it to the cgindex */
7451             cg_gl = index_gl[cg1];
7452             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7453             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7454             cgindex[cg1+1] = cgindex[cg1] + nat;
7455             cg0++;
7456             cg1++;
7457             shift_at += nat;
7458         }
7459         shift += ind->nrecv[cell];
7460         ncg_cell[ncell+cell+1] = cg1;
7461     }
7462 }
7463
7464 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7465                                int nzone,int cg0,const int *cgindex)
7466 {
7467     int cg,zone,p;
7468
7469     /* Store the atom block boundaries for easy copying of communication buffers
7470      */
7471     cg = cg0;
7472     for(zone=0; zone<nzone; zone++)
7473     {
7474         for(p=0; p<cd->np; p++) {
7475             cd->ind[p].cell2at0[zone] = cgindex[cg];
7476             cg += cd->ind[p].nrecv[zone];
7477             cd->ind[p].cell2at1[zone] = cgindex[cg];
7478         }
7479     }
7480 }
7481
7482 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7483 {
7484     int  i;
7485     gmx_bool bMiss;
7486
7487     bMiss = FALSE;
7488     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7489     {
7490         if (!bLocalCG[link->a[i]])
7491         {
7492             bMiss = TRUE;
7493         }
7494     }
7495
7496     return bMiss;
7497 }
7498
7499 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7500 typedef struct {
7501     real c[DIM][4]; /* the corners for the non-bonded communication */
7502     real cr0;       /* corner for rounding */
7503     real cr1[4];    /* corners for rounding */
7504     real bc[DIM];   /* corners for bounded communication */
7505     real bcr1;      /* corner for rounding for bonded communication */
7506 } dd_corners_t;
7507
7508 /* Determine the corners of the domain(s) we are communicating with */
7509 static void
7510 set_dd_corners(const gmx_domdec_t *dd,
7511                int dim0, int dim1, int dim2,
7512                gmx_bool bDistMB,
7513                dd_corners_t *c)
7514 {
7515     const gmx_domdec_comm_t *comm;
7516     const gmx_domdec_zones_t *zones;
7517     int i,j;
7518
7519     comm = dd->comm;
7520
7521     zones = &comm->zones;
7522
7523     /* Keep the compiler happy */
7524     c->cr0  = 0;
7525     c->bcr1 = 0;
7526
7527     /* The first dimension is equal for all cells */
7528     c->c[0][0] = comm->cell_x0[dim0];
7529     if (bDistMB)
7530     {
7531         c->bc[0] = c->c[0][0];
7532     }
7533     if (dd->ndim >= 2)
7534     {
7535         dim1 = dd->dim[1];
7536         /* This cell row is only seen from the first row */
7537         c->c[1][0] = comm->cell_x0[dim1];
7538         /* All rows can see this row */
7539         c->c[1][1] = comm->cell_x0[dim1];
7540         if (dd->bGridJump)
7541         {
7542             c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7543             if (bDistMB)
7544             {
7545                 /* For the multi-body distance we need the maximum */
7546                 c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7547             }
7548         }
7549         /* Set the upper-right corner for rounding */
7550         c->cr0 = comm->cell_x1[dim0];
7551
7552         if (dd->ndim >= 3)
7553         {
7554             dim2 = dd->dim[2];
7555             for(j=0; j<4; j++)
7556             {
7557                 c->c[2][j] = comm->cell_x0[dim2];
7558             }
7559             if (dd->bGridJump)
7560             {
7561                 /* Use the maximum of the i-cells that see a j-cell */
7562                 for(i=0; i<zones->nizone; i++)
7563                 {
7564                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7565                     {
7566                         if (j >= 4)
7567                         {
7568                             c->c[2][j-4] =
7569                                 max(c->c[2][j-4],
7570                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7571                         }
7572                     }
7573                 }
7574                 if (bDistMB)
7575                 {
7576                     /* For the multi-body distance we need the maximum */
7577                     c->bc[2] = comm->cell_x0[dim2];
7578                     for(i=0; i<2; i++)
7579                     {
7580                         for(j=0; j<2; j++)
7581                         {
7582                             c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
7583                         }
7584                     }
7585                 }
7586             }
7587
7588             /* Set the upper-right corner for rounding */
7589             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7590              * Only cell (0,0,0) can see cell 7 (1,1,1)
7591              */
7592             c->cr1[0] = comm->cell_x1[dim1];
7593             c->cr1[3] = comm->cell_x1[dim1];
7594             if (dd->bGridJump)
7595             {
7596                 c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
7597                 if (bDistMB)
7598                 {
7599                     /* For the multi-body distance we need the maximum */
7600                     c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
7601                 }
7602             }
7603         }
7604     }
7605 }
7606
7607 /* Determine which cg's we need to send in this pulse from this zone */
7608 static void
7609 get_zone_pulse_cgs(gmx_domdec_t *dd,
7610                    int zonei, int zone,
7611                    int cg0, int cg1,
7612                    const int *index_gl,
7613                    const int *cgindex,
7614                    int dim, int dim_ind,
7615                    int dim0, int dim1, int dim2,
7616                    real r_comm2, real r_bcomm2,
7617                    matrix box,
7618                    ivec tric_dist,
7619                    rvec *normal,
7620                    real skew_fac2_d, real skew_fac_01,
7621                    rvec *v_d, rvec *v_0, rvec *v_1,
7622                    const dd_corners_t *c,
7623                    rvec sf2_round,
7624                    gmx_bool bDistBonded,
7625                    gmx_bool bBondComm,
7626                    gmx_bool bDist2B,
7627                    gmx_bool bDistMB,
7628                    rvec *cg_cm,
7629                    int *cginfo,
7630                    gmx_domdec_ind_t *ind,
7631                    int **ibuf, int *ibuf_nalloc,
7632                    vec_rvec_t *vbuf,
7633                    int *nsend_ptr,
7634                    int *nat_ptr,
7635                    int *nsend_z_ptr)
7636 {
7637     gmx_domdec_comm_t *comm;
7638     gmx_bool bScrew;
7639     gmx_bool bDistMB_pulse;
7640     int  cg,i;
7641     real r2,rb2,r,tric_sh;
7642     rvec rn,rb;
7643     int  dimd;
7644     int  nsend_z,nsend,nat;
7645
7646     comm = dd->comm;
7647
7648     bScrew = (dd->bScrewPBC && dim == XX);
7649
7650     bDistMB_pulse = (bDistMB && bDistBonded);
7651
7652     nsend_z = 0;
7653     nsend   = *nsend_ptr;
7654     nat     = *nat_ptr;
7655
7656     for(cg=cg0; cg<cg1; cg++)
7657     {
7658         r2  = 0;
7659         rb2 = 0;
7660         if (tric_dist[dim_ind] == 0)
7661         {
7662             /* Rectangular direction, easy */
7663             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7664             if (r > 0)
7665             {
7666                 r2 += r*r;
7667             }
7668             if (bDistMB_pulse)
7669             {
7670                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7671                 if (r > 0)
7672                 {
7673                     rb2 += r*r;
7674                 }
7675             }
7676             /* Rounding gives at most a 16% reduction
7677              * in communicated atoms
7678              */
7679             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7680             {
7681                 r = cg_cm[cg][dim0] - c->cr0;
7682                 /* This is the first dimension, so always r >= 0 */
7683                 r2 += r*r;
7684                 if (bDistMB_pulse)
7685                 {
7686                     rb2 += r*r;
7687                 }
7688             }
7689             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7690             {
7691                 r = cg_cm[cg][dim1] - c->cr1[zone];
7692                 if (r > 0)
7693                 {
7694                     r2 += r*r;
7695                 }
7696                 if (bDistMB_pulse)
7697                 {
7698                     r = cg_cm[cg][dim1] - c->bcr1;
7699                     if (r > 0)
7700                     {
7701                         rb2 += r*r;
7702                     }
7703                 }
7704             }
7705         }
7706         else
7707         {
7708             /* Triclinic direction, more complicated */
7709             clear_rvec(rn);
7710             clear_rvec(rb);
7711             /* Rounding, conservative as the skew_fac multiplication
7712              * will slightly underestimate the distance.
7713              */
7714             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7715             {
7716                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7717                 for(i=dim0+1; i<DIM; i++)
7718                 {
7719                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7720                 }
7721                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7722                 if (bDistMB_pulse)
7723                 {
7724                     rb[dim0] = rn[dim0];
7725                     rb2 = r2;
7726                 }
7727                 /* Take care that the cell planes along dim0 might not
7728                  * be orthogonal to those along dim1 and dim2.
7729                  */
7730                 for(i=1; i<=dim_ind; i++)
7731                 {
7732                     dimd = dd->dim[i];
7733                     if (normal[dim0][dimd] > 0)
7734                     {
7735                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7736                         if (bDistMB_pulse)
7737                         {
7738                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7739                         }
7740                     }
7741                 }
7742             }
7743             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7744             {
7745                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7746                 tric_sh = 0;
7747                 for(i=dim1+1; i<DIM; i++)
7748                 {
7749                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7750                 }
7751                 rn[dim1] += tric_sh;
7752                 if (rn[dim1] > 0)
7753                 {
7754                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7755                     /* Take care of coupling of the distances
7756                      * to the planes along dim0 and dim1 through dim2.
7757                      */
7758                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7759                     /* Take care that the cell planes along dim1
7760                      * might not be orthogonal to that along dim2.
7761                      */
7762                     if (normal[dim1][dim2] > 0)
7763                     {
7764                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7765                     }
7766                 }
7767                 if (bDistMB_pulse)
7768                 {
7769                     rb[dim1] +=
7770                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7771                     if (rb[dim1] > 0)
7772                     {
7773                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7774                         /* Take care of coupling of the distances
7775                          * to the planes along dim0 and dim1 through dim2.
7776                          */
7777                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7778                         /* Take care that the cell planes along dim1
7779                          * might not be orthogonal to that along dim2.
7780                          */
7781                         if (normal[dim1][dim2] > 0)
7782                         {
7783                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7784                         }
7785                     }
7786                 }
7787             }
7788             /* The distance along the communication direction */
7789             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7790             tric_sh = 0;
7791             for(i=dim+1; i<DIM; i++)
7792             {
7793                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7794             }
7795             rn[dim] += tric_sh;
7796             if (rn[dim] > 0)
7797             {
7798                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7799                 /* Take care of coupling of the distances
7800                  * to the planes along dim0 and dim1 through dim2.
7801                  */
7802                 if (dim_ind == 1 && zonei == 1)
7803                 {
7804                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7805                 }
7806             }
7807             if (bDistMB_pulse)
7808             {
7809                 clear_rvec(rb);
7810                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7811                 if (rb[dim] > 0)
7812                 {
7813                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7814                     /* Take care of coupling of the distances
7815                      * to the planes along dim0 and dim1 through dim2.
7816                      */
7817                     if (dim_ind == 1 && zonei == 1)
7818                     {
7819                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7820                     }
7821                 }
7822             }
7823         }
7824
7825         if (r2 < r_comm2 ||
7826             (bDistBonded &&
7827              ((bDistMB && rb2 < r_bcomm2) ||
7828               (bDist2B && r2  < r_bcomm2)) &&
7829              (!bBondComm ||
7830               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7831                missing_link(comm->cglink,index_gl[cg],
7832                             comm->bLocalCG)))))
7833         {
7834             /* Make an index to the local charge groups */
7835             if (nsend+1 > ind->nalloc)
7836             {
7837                 ind->nalloc = over_alloc_large(nsend+1);
7838                 srenew(ind->index,ind->nalloc);
7839             }
7840             if (nsend+1 > *ibuf_nalloc)
7841             {
7842                 *ibuf_nalloc = over_alloc_large(nsend+1);
7843                 srenew(*ibuf,*ibuf_nalloc);
7844             }
7845             ind->index[nsend] = cg;
7846             (*ibuf)[nsend] = index_gl[cg];
7847             nsend_z++;
7848             vec_rvec_check_alloc(vbuf,nsend+1);
7849
7850             if (dd->ci[dim] == 0)
7851             {
7852                 /* Correct cg_cm for pbc */
7853                 rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
7854                 if (bScrew)
7855                 {
7856                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7857                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7858                 }
7859             }
7860             else
7861             {
7862                 copy_rvec(cg_cm[cg],vbuf->v[nsend]);
7863             }
7864             nsend++;
7865             nat += cgindex[cg+1] - cgindex[cg];
7866         }
7867     }
7868
7869     *nsend_ptr   = nsend;
7870     *nat_ptr     = nat;
7871     *nsend_z_ptr = nsend_z;
7872 }
7873
7874 static void setup_dd_communication(gmx_domdec_t *dd,
7875                                    matrix box,gmx_ddbox_t *ddbox,
7876                                    t_forcerec *fr,t_state *state,rvec **f)
7877 {
7878     int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
7879     int nzone,nzone_send,zone,zonei,cg0,cg1;
7880     int c,i,j,cg,cg_gl,nrcg;
7881     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7882     gmx_domdec_comm_t *comm;
7883     gmx_domdec_zones_t *zones;
7884     gmx_domdec_comm_dim_t *cd;
7885     gmx_domdec_ind_t *ind;
7886     cginfo_mb_t *cginfo_mb;
7887     gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
7888     real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
7889     dd_corners_t corners;
7890     ivec tric_dist;
7891     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7892     real skew_fac2_d,skew_fac_01;
7893     rvec sf2_round;
7894     int  nsend,nat;
7895     int  th;
7896
7897     if (debug)
7898     {
7899         fprintf(debug,"Setting up DD communication\n");
7900     }
7901
7902     comm  = dd->comm;
7903
7904     switch (fr->cutoff_scheme)
7905     {
7906     case ecutsGROUP:
7907         cg_cm = fr->cg_cm;
7908         break;
7909     case ecutsVERLET:
7910         cg_cm = state->x;
7911         break;
7912     default:
7913         gmx_incons("unimplemented");
7914         cg_cm = NULL;
7915     }
7916
7917     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7918     {
7919         dim = dd->dim[dim_ind];
7920
7921         /* Check if we need to use triclinic distances */
7922         tric_dist[dim_ind] = 0;
7923         for(i=0; i<=dim_ind; i++)
7924         {
7925             if (ddbox->tric_dir[dd->dim[i]])
7926             {
7927                 tric_dist[dim_ind] = 1;
7928             }
7929         }
7930     }
7931
7932     bBondComm = comm->bBondComm;
7933
7934     /* Do we need to determine extra distances for multi-body bondeds? */
7935     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7936
7937     /* Do we need to determine extra distances for only two-body bondeds? */
7938     bDist2B = (bBondComm && !bDistMB);
7939
7940     r_comm2  = sqr(comm->cutoff);
7941     r_bcomm2 = sqr(comm->cutoff_mbody);
7942
7943     if (debug)
7944     {
7945         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7946     }
7947
7948     zones = &comm->zones;
7949
7950     dim0 = dd->dim[0];
7951     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
7952     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
7953
7954     set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
7955
7956     /* Triclinic stuff */
7957     normal = ddbox->normal;
7958     skew_fac_01 = 0;
7959     if (dd->ndim >= 2)
7960     {
7961         v_0 = ddbox->v[dim0];
7962         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7963         {
7964             /* Determine the coupling coefficient for the distances
7965              * to the cell planes along dim0 and dim1 through dim2.
7966              * This is required for correct rounding.
7967              */
7968             skew_fac_01 =
7969                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7970             if (debug)
7971             {
7972                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7973             }
7974         }
7975     }
7976     if (dd->ndim >= 3)
7977     {
7978         v_1 = ddbox->v[dim1];
7979     }
7980
7981     zone_cg_range = zones->cg_range;
7982     index_gl = dd->index_gl;
7983     cgindex  = dd->cgindex;
7984     cginfo_mb = fr->cginfo_mb;
7985
7986     zone_cg_range[0]   = 0;
7987     zone_cg_range[1]   = dd->ncg_home;
7988     comm->zone_ncg1[0] = dd->ncg_home;
7989     pos_cg             = dd->ncg_home;
7990
7991     nat_tot = dd->nat_home;
7992     nzone = 1;
7993     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7994     {
7995         dim = dd->dim[dim_ind];
7996         cd = &comm->cd[dim_ind];
7997
7998         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7999         {
8000             /* No pbc in this dimension, the first node should not comm. */
8001             nzone_send = 0;
8002         }
8003         else
8004         {
8005             nzone_send = nzone;
8006         }
8007
8008         v_d = ddbox->v[dim];
8009         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8010
8011         cd->bInPlace = TRUE;
8012         for(p=0; p<cd->np; p++)
8013         {
8014             /* Only atoms communicated in the first pulse are used
8015              * for multi-body bonded interactions or for bBondComm.
8016              */
8017             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8018
8019             ind = &cd->ind[p];
8020             nsend = 0;
8021             nat = 0;
8022             for(zone=0; zone<nzone_send; zone++)
8023             {
8024                 if (tric_dist[dim_ind] && dim_ind > 0)
8025                 {
8026                     /* Determine slightly more optimized skew_fac's
8027                      * for rounding.
8028                      * This reduces the number of communicated atoms
8029                      * by about 10% for 3D DD of rhombic dodecahedra.
8030                      */
8031                     for(dimd=0; dimd<dim; dimd++)
8032                     {
8033                         sf2_round[dimd] = 1;
8034                         if (ddbox->tric_dir[dimd])
8035                         {
8036                             for(i=dd->dim[dimd]+1; i<DIM; i++)
8037                             {
8038                                 /* If we are shifted in dimension i
8039                                  * and the cell plane is tilted forward
8040                                  * in dimension i, skip this coupling.
8041                                  */
8042                                 if (!(zones->shift[nzone+zone][i] &&
8043                                       ddbox->v[dimd][i][dimd] >= 0))
8044                                 {
8045                                     sf2_round[dimd] +=
8046                                         sqr(ddbox->v[dimd][i][dimd]);
8047                                 }
8048                             }
8049                             sf2_round[dimd] = 1/sf2_round[dimd];
8050                         }
8051                     }
8052                 }
8053
8054                 zonei = zone_perm[dim_ind][zone];
8055                 if (p == 0)
8056                 {
8057                     /* Here we permutate the zones to obtain a convenient order
8058                      * for neighbor searching
8059                      */
8060                     cg0 = zone_cg_range[zonei];
8061                     cg1 = zone_cg_range[zonei+1];
8062                 }
8063                 else
8064                 {
8065                     /* Look only at the cg's received in the previous grid pulse
8066                      */
8067                     cg1 = zone_cg_range[nzone+zone+1];
8068                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8069                 }
8070
8071 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8072                 for(th=0; th<comm->nth; th++)
8073                 {
8074                     gmx_domdec_ind_t *ind_p;
8075                     int **ibuf_p,*ibuf_nalloc_p;
8076                     vec_rvec_t *vbuf_p;
8077                     int *nsend_p,*nat_p;
8078                     int *nsend_zone_p;
8079                     int cg0_th,cg1_th;
8080
8081                     if (th == 0)
8082                     {
8083                         /* Thread 0 writes in the comm buffers */
8084                         ind_p         = ind;
8085                         ibuf_p        = &comm->buf_int;
8086                         ibuf_nalloc_p = &comm->nalloc_int;
8087                         vbuf_p        = &comm->vbuf;
8088                         nsend_p       = &nsend;
8089                         nat_p         = &nat;
8090                         nsend_zone_p  = &ind->nsend[zone];
8091                     }
8092                     else
8093                     {
8094                         /* Other threads write into temp buffers */
8095                         ind_p         = &comm->dth[th].ind;
8096                         ibuf_p        = &comm->dth[th].ibuf;
8097                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8098                         vbuf_p        = &comm->dth[th].vbuf;
8099                         nsend_p       = &comm->dth[th].nsend;
8100                         nat_p         = &comm->dth[th].nat;
8101                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8102
8103                         comm->dth[th].nsend      = 0;
8104                         comm->dth[th].nat        = 0;
8105                         comm->dth[th].nsend_zone = 0;
8106                     }
8107
8108                     if (comm->nth == 1)
8109                     {
8110                         cg0_th = cg0;
8111                         cg1_th = cg1;
8112                     }
8113                     else
8114                     {
8115                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8116                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8117                     }
8118
8119                     /* Get the cg's for this pulse in this zone */
8120                     get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
8121                                        index_gl,cgindex,
8122                                        dim,dim_ind,dim0,dim1,dim2,
8123                                        r_comm2,r_bcomm2,
8124                                        box,tric_dist,
8125                                        normal,skew_fac2_d,skew_fac_01,
8126                                        v_d,v_0,v_1,&corners,sf2_round,
8127                                        bDistBonded,bBondComm,
8128                                        bDist2B,bDistMB,
8129                                        cg_cm,fr->cginfo,
8130                                        ind_p,
8131                                        ibuf_p,ibuf_nalloc_p,
8132                                        vbuf_p,
8133                                        nsend_p,nat_p,
8134                                        nsend_zone_p);
8135                 }
8136
8137                 /* Append data of threads>=1 to the communication buffers */
8138                 for(th=1; th<comm->nth; th++)
8139                 {
8140                     dd_comm_setup_work_t *dth;
8141                     int i,ns1;
8142
8143                     dth = &comm->dth[th];
8144
8145                     ns1 = nsend + dth->nsend_zone;
8146                     if (ns1 > ind->nalloc)
8147                     {
8148                         ind->nalloc = over_alloc_dd(ns1);
8149                         srenew(ind->index,ind->nalloc);
8150                     }
8151                     if (ns1 > comm->nalloc_int)
8152                     {
8153                         comm->nalloc_int = over_alloc_dd(ns1);
8154                         srenew(comm->buf_int,comm->nalloc_int);
8155                     }
8156                     if (ns1 > comm->vbuf.nalloc)
8157                     {
8158                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8159                         srenew(comm->vbuf.v,comm->vbuf.nalloc);
8160                     }
8161
8162                     for(i=0; i<dth->nsend_zone; i++)
8163                     {
8164                         ind->index[nsend] = dth->ind.index[i];
8165                         comm->buf_int[nsend] = dth->ibuf[i];
8166                         copy_rvec(dth->vbuf.v[i],
8167                                   comm->vbuf.v[nsend]);
8168                         nsend++;
8169                     }
8170                     nat              += dth->nat;
8171                     ind->nsend[zone] += dth->nsend_zone;
8172                 }
8173             }
8174             /* Clear the counts in case we do not have pbc */
8175             for(zone=nzone_send; zone<nzone; zone++)
8176             {
8177                 ind->nsend[zone] = 0;
8178             }
8179             ind->nsend[nzone]   = nsend;
8180             ind->nsend[nzone+1] = nat;
8181             /* Communicate the number of cg's and atoms to receive */
8182             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8183                             ind->nsend, nzone+2,
8184                             ind->nrecv, nzone+2);
8185
8186             /* The rvec buffer is also required for atom buffers of size nsend
8187              * in dd_move_x and dd_move_f.
8188              */
8189             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
8190
8191             if (p > 0)
8192             {
8193                 /* We can receive in place if only the last zone is not empty */
8194                 for(zone=0; zone<nzone-1; zone++)
8195                 {
8196                     if (ind->nrecv[zone] > 0)
8197                     {
8198                         cd->bInPlace = FALSE;
8199                     }
8200                 }
8201                 if (!cd->bInPlace)
8202                 {
8203                     /* The int buffer is only required here for the cg indices */
8204                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8205                     {
8206                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8207                         srenew(comm->buf_int2,comm->nalloc_int2);
8208                     }
8209                     /* The rvec buffer is also required for atom buffers
8210                      * of size nrecv in dd_move_x and dd_move_f.
8211                      */
8212                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
8213                     vec_rvec_check_alloc(&comm->vbuf2,i);
8214                 }
8215             }
8216
8217             /* Make space for the global cg indices */
8218             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8219                 || dd->cg_nalloc == 0)
8220             {
8221                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8222                 srenew(index_gl,dd->cg_nalloc);
8223                 srenew(cgindex,dd->cg_nalloc+1);
8224             }
8225             /* Communicate the global cg indices */
8226             if (cd->bInPlace)
8227             {
8228                 recv_i = index_gl + pos_cg;
8229             }
8230             else
8231             {
8232                 recv_i = comm->buf_int2;
8233             }
8234             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8235                             comm->buf_int, nsend,
8236                             recv_i,        ind->nrecv[nzone]);
8237
8238             /* Make space for cg_cm */
8239             dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
8240             if (fr->cutoff_scheme == ecutsGROUP)
8241             {
8242                 cg_cm = fr->cg_cm;
8243             }
8244             else
8245             {
8246                 cg_cm = state->x;
8247             }
8248             /* Communicate cg_cm */
8249             if (cd->bInPlace)
8250             {
8251                 recv_vr = cg_cm + pos_cg;
8252             }
8253             else
8254             {
8255                 recv_vr = comm->vbuf2.v;
8256             }
8257             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8258                              comm->vbuf.v, nsend,
8259                              recv_vr,      ind->nrecv[nzone]);
8260
8261             /* Make the charge group index */
8262             if (cd->bInPlace)
8263             {
8264                 zone = (p == 0 ? 0 : nzone - 1);
8265                 while (zone < nzone)
8266                 {
8267                     for(cg=0; cg<ind->nrecv[zone]; cg++)
8268                     {
8269                         cg_gl = index_gl[pos_cg];
8270                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
8271                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8272                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8273                         if (bBondComm)
8274                         {
8275                             /* Update the charge group presence,
8276                              * so we can use it in the next pass of the loop.
8277                              */
8278                             comm->bLocalCG[cg_gl] = TRUE;
8279                         }
8280                         pos_cg++;
8281                     }
8282                     if (p == 0)
8283                     {
8284                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8285                     }
8286                     zone++;
8287                     zone_cg_range[nzone+zone] = pos_cg;
8288                 }
8289             }
8290             else
8291             {
8292                 /* This part of the code is never executed with bBondComm. */
8293                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
8294                                  index_gl,recv_i,cg_cm,recv_vr,
8295                                  cgindex,fr->cginfo_mb,fr->cginfo);
8296                 pos_cg += ind->nrecv[nzone];
8297             }
8298             nat_tot += ind->nrecv[nzone+1];
8299         }
8300         if (!cd->bInPlace)
8301         {
8302             /* Store the atom block for easy copying of communication buffers */
8303             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
8304         }
8305         nzone += nzone;
8306     }
8307     dd->index_gl = index_gl;
8308     dd->cgindex  = cgindex;
8309
8310     dd->ncg_tot = zone_cg_range[zones->n];
8311     dd->nat_tot = nat_tot;
8312     comm->nat[ddnatHOME] = dd->nat_home;
8313     for(i=ddnatZONE; i<ddnatNR; i++)
8314     {
8315         comm->nat[i] = dd->nat_tot;
8316     }
8317
8318     if (!bBondComm)
8319     {
8320         /* We don't need to update cginfo, since that was alrady done above.
8321          * So we pass NULL for the forcerec.
8322          */
8323         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
8324                       NULL,comm->bLocalCG);
8325     }
8326
8327     if (debug)
8328     {
8329         fprintf(debug,"Finished setting up DD communication, zones:");
8330         for(c=0; c<zones->n; c++)
8331         {
8332             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
8333         }
8334         fprintf(debug,"\n");
8335     }
8336 }
8337
8338 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8339 {
8340     int c;
8341
8342     for(c=0; c<zones->nizone; c++)
8343     {
8344         zones->izone[c].cg1  = zones->cg_range[c+1];
8345         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8346         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8347     }
8348 }
8349
8350 static void set_zones_size(gmx_domdec_t *dd,
8351                            matrix box,const gmx_ddbox_t *ddbox,
8352                            int zone_start,int zone_end)
8353 {
8354     gmx_domdec_comm_t *comm;
8355     gmx_domdec_zones_t *zones;
8356     gmx_bool bDistMB;
8357     int  z,zi,zj0,zj1,d,dim;
8358     real rcs,rcmbs;
8359     int  i,j;
8360     real size_j,add_tric;
8361     real vol;
8362
8363     comm = dd->comm;
8364
8365     zones = &comm->zones;
8366
8367     /* Do we need to determine extra distances for multi-body bondeds? */
8368     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8369
8370     for(z=zone_start; z<zone_end; z++)
8371     {
8372         /* Copy cell limits to zone limits.
8373          * Valid for non-DD dims and non-shifted dims.
8374          */
8375         copy_rvec(comm->cell_x0,zones->size[z].x0);
8376         copy_rvec(comm->cell_x1,zones->size[z].x1);
8377     }
8378
8379     for(d=0; d<dd->ndim; d++)
8380     {
8381         dim = dd->dim[d];
8382
8383         for(z=0; z<zones->n; z++)
8384         {
8385             /* With a staggered grid we have different sizes
8386              * for non-shifted dimensions.
8387              */
8388             if (dd->bGridJump && zones->shift[z][dim] == 0)
8389             {
8390                 if (d == 1)
8391                 {
8392                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8393                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8394                 }
8395                 else if (d == 2)
8396                 {
8397                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8398                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8399                 }
8400             }
8401         }
8402
8403         rcs   = comm->cutoff;
8404         rcmbs = comm->cutoff_mbody;
8405         if (ddbox->tric_dir[dim])
8406         {
8407             rcs   /= ddbox->skew_fac[dim];
8408             rcmbs /= ddbox->skew_fac[dim];
8409         }
8410
8411         /* Set the lower limit for the shifted zone dimensions */
8412         for(z=zone_start; z<zone_end; z++)
8413         {
8414             if (zones->shift[z][dim] > 0)
8415             {
8416                 dim = dd->dim[d];
8417                 if (!dd->bGridJump || d == 0)
8418                 {
8419                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8420                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8421                 }
8422                 else
8423                 {
8424                     /* Here we take the lower limit of the zone from
8425                      * the lowest domain of the zone below.
8426                      */
8427                     if (z < 4)
8428                     {
8429                         zones->size[z].x0[dim] =
8430                              comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8431                     }
8432                     else
8433                     {
8434                         if (d == 1)
8435                         {
8436                             zones->size[z].x0[dim] =
8437                                 zones->size[zone_perm[2][z-4]].x0[dim];
8438                         }
8439                         else
8440                         {
8441                             zones->size[z].x0[dim] =
8442                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8443                         }
8444                     }
8445                     /* A temporary limit, is updated below */
8446                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8447
8448                     if (bDistMB)
8449                     {
8450                         for(zi=0; zi<zones->nizone; zi++)
8451                         {
8452                             if (zones->shift[zi][dim] == 0)
8453                             {
8454                                 /* This takes the whole zone into account.
8455                                  * With multiple pulses this will lead
8456                                  * to a larger zone then strictly necessary.
8457                                  */
8458                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8459                                                              zones->size[zi].x1[dim]+rcmbs);
8460                             }
8461                         }
8462                     }
8463                 }
8464             }
8465         }
8466
8467         /* Loop over the i-zones to set the upper limit of each
8468          * j-zone they see.
8469          */
8470         for(zi=0; zi<zones->nizone; zi++)
8471         {
8472             if (zones->shift[zi][dim] == 0)
8473             {
8474                 for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
8475                 {
8476                     if (zones->shift[z][dim] > 0)
8477                     {
8478                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8479                                                      zones->size[zi].x1[dim]+rcs);
8480                     }
8481                 }
8482             }
8483         }
8484     }
8485
8486     for(z=zone_start; z<zone_end; z++)
8487     {
8488         /* Initialization only required to keep the compiler happy */
8489         rvec corner_min={0,0,0},corner_max={0,0,0},corner;
8490         int  nc,c;
8491
8492         /* To determine the bounding box for a zone we need to find
8493          * the extreme corners of 4, 2 or 1 corners.
8494          */
8495         nc = 1 << (ddbox->npbcdim - 1);
8496
8497         for(c=0; c<nc; c++)
8498         {
8499             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8500             corner[XX] = 0;
8501             if ((c & 1) == 0)
8502             {
8503                 corner[YY] = zones->size[z].x0[YY];
8504             }
8505             else
8506             {
8507                 corner[YY] = zones->size[z].x1[YY];
8508             }
8509             if ((c & 2) == 0)
8510             {
8511                 corner[ZZ] = zones->size[z].x0[ZZ];
8512             }
8513             else
8514             {
8515                 corner[ZZ] = zones->size[z].x1[ZZ];
8516             }
8517             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8518             {
8519                 /* With 1D domain decomposition the cg's are not in
8520                  * the triclinic box, but triclinic x-y and rectangular y-z.
8521                  * Shift y back, so it will later end up at 0.
8522                  */
8523                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8524             }
8525             /* Apply the triclinic couplings */
8526             for(i=YY; i<ddbox->npbcdim; i++)
8527             {
8528                 for(j=XX; j<i; j++)
8529                 {
8530                     corner[j] += corner[i]*box[i][j]/box[i][i];
8531                 }
8532             }
8533             if (c == 0)
8534             {
8535                 copy_rvec(corner,corner_min);
8536                 copy_rvec(corner,corner_max);
8537             }
8538             else
8539             {
8540                 for(i=0; i<DIM; i++)
8541                 {
8542                     corner_min[i] = min(corner_min[i],corner[i]);
8543                     corner_max[i] = max(corner_max[i],corner[i]);
8544                 }
8545             }
8546         }
8547         /* Copy the extreme cornes without offset along x */
8548         for(i=0; i<DIM; i++)
8549         {
8550             zones->size[z].bb_x0[i] = corner_min[i];
8551             zones->size[z].bb_x1[i] = corner_max[i];
8552         }
8553         /* Add the offset along x */
8554         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8555         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8556     }
8557
8558     if (zone_start == 0)
8559     {
8560         vol = 1;
8561         for(dim=0; dim<DIM; dim++)
8562         {
8563             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8564         }
8565         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8566     }
8567
8568     if (debug)
8569     {
8570         for(z=zone_start; z<zone_end; z++)
8571         {
8572             fprintf(debug,"zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8573                     z,
8574                     zones->size[z].x0[XX],zones->size[z].x1[XX],
8575                     zones->size[z].x0[YY],zones->size[z].x1[YY],
8576                     zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
8577             fprintf(debug,"zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8578                     z,
8579                     zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
8580                     zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
8581                     zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
8582         }
8583     }
8584 }
8585
8586 static int comp_cgsort(const void *a,const void *b)
8587 {
8588     int comp;
8589
8590     gmx_cgsort_t *cga,*cgb;
8591     cga = (gmx_cgsort_t *)a;
8592     cgb = (gmx_cgsort_t *)b;
8593
8594     comp = cga->nsc - cgb->nsc;
8595     if (comp == 0)
8596     {
8597         comp = cga->ind_gl - cgb->ind_gl;
8598     }
8599
8600     return comp;
8601 }
8602
8603 static void order_int_cg(int n,const gmx_cgsort_t *sort,
8604                          int *a,int *buf)
8605 {
8606     int i;
8607
8608     /* Order the data */
8609     for(i=0; i<n; i++)
8610     {
8611         buf[i] = a[sort[i].ind];
8612     }
8613
8614     /* Copy back to the original array */
8615     for(i=0; i<n; i++)
8616     {
8617         a[i] = buf[i];
8618     }
8619 }
8620
8621 static void order_vec_cg(int n,const gmx_cgsort_t *sort,
8622                          rvec *v,rvec *buf)
8623 {
8624     int i;
8625
8626     /* Order the data */
8627     for(i=0; i<n; i++)
8628     {
8629         copy_rvec(v[sort[i].ind],buf[i]);
8630     }
8631
8632     /* Copy back to the original array */
8633     for(i=0; i<n; i++)
8634     {
8635         copy_rvec(buf[i],v[i]);
8636     }
8637 }
8638
8639 static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
8640                            rvec *v,rvec *buf)
8641 {
8642     int a,atot,cg,cg0,cg1,i;
8643
8644     if (cgindex == NULL)
8645     {
8646         /* Avoid the useless loop of the atoms within a cg */
8647         order_vec_cg(ncg,sort,v,buf);
8648
8649         return;
8650     }
8651
8652     /* Order the data */
8653     a = 0;
8654     for(cg=0; cg<ncg; cg++)
8655     {
8656         cg0 = cgindex[sort[cg].ind];
8657         cg1 = cgindex[sort[cg].ind+1];
8658         for(i=cg0; i<cg1; i++)
8659         {
8660             copy_rvec(v[i],buf[a]);
8661             a++;
8662         }
8663     }
8664     atot = a;
8665
8666     /* Copy back to the original array */
8667     for(a=0; a<atot; a++)
8668     {
8669         copy_rvec(buf[a],v[a]);
8670     }
8671 }
8672
8673 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
8674                          int nsort_new,gmx_cgsort_t *sort_new,
8675                          gmx_cgsort_t *sort1)
8676 {
8677     int i1,i2,i_new;
8678
8679     /* The new indices are not very ordered, so we qsort them */
8680     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
8681
8682     /* sort2 is already ordered, so now we can merge the two arrays */
8683     i1 = 0;
8684     i2 = 0;
8685     i_new = 0;
8686     while(i2 < nsort2 || i_new < nsort_new)
8687     {
8688         if (i2 == nsort2)
8689         {
8690             sort1[i1++] = sort_new[i_new++];
8691         }
8692         else if (i_new == nsort_new)
8693         {
8694             sort1[i1++] = sort2[i2++];
8695         }
8696         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8697                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8698                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8699         {
8700             sort1[i1++] = sort2[i2++];
8701         }
8702         else
8703         {
8704             sort1[i1++] = sort_new[i_new++];
8705         }
8706     }
8707 }
8708
8709 static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
8710 {
8711     gmx_domdec_sort_t *sort;
8712     gmx_cgsort_t *cgsort,*sort_i;
8713     int  ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
8714     int  sort_last,sort_skip;
8715
8716     sort = dd->comm->sort;
8717
8718     a = fr->ns.grid->cell_index;
8719
8720     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8721
8722     if (ncg_home_old >= 0)
8723     {
8724         /* The charge groups that remained in the same ns grid cell
8725          * are completely ordered. So we can sort efficiently by sorting
8726          * the charge groups that did move into the stationary list.
8727          */
8728         ncg_new = 0;
8729         nsort2 = 0;
8730         nsort_new = 0;
8731         for(i=0; i<dd->ncg_home; i++)
8732         {
8733             /* Check if this cg did not move to another node */
8734             if (a[i] < moved)
8735             {
8736                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8737                 {
8738                     /* This cg is new on this node or moved ns grid cell */
8739                     if (nsort_new >= sort->sort_new_nalloc)
8740                     {
8741                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8742                         srenew(sort->sort_new,sort->sort_new_nalloc);
8743                     }
8744                     sort_i = &(sort->sort_new[nsort_new++]);
8745                 }
8746                 else
8747                 {
8748                     /* This cg did not move */
8749                     sort_i = &(sort->sort2[nsort2++]);
8750                 }
8751                 /* Sort on the ns grid cell indices
8752                  * and the global topology index.
8753                  * index_gl is irrelevant with cell ns,
8754                  * but we set it here anyhow to avoid a conditional.
8755                  */
8756                 sort_i->nsc    = a[i];
8757                 sort_i->ind_gl = dd->index_gl[i];
8758                 sort_i->ind    = i;
8759                 ncg_new++;
8760             }
8761         }
8762         if (debug)
8763         {
8764             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
8765                     nsort2,nsort_new);
8766         }
8767         /* Sort efficiently */
8768         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
8769                      sort->sort);
8770     }
8771     else
8772     {
8773         cgsort = sort->sort;
8774         ncg_new = 0;
8775         for(i=0; i<dd->ncg_home; i++)
8776         {
8777             /* Sort on the ns grid cell indices
8778              * and the global topology index
8779              */
8780             cgsort[i].nsc    = a[i];
8781             cgsort[i].ind_gl = dd->index_gl[i];
8782             cgsort[i].ind    = i;
8783             if (cgsort[i].nsc < moved)
8784             {
8785                 ncg_new++;
8786             }
8787         }
8788         if (debug)
8789         {
8790             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
8791         }
8792         /* Determine the order of the charge groups using qsort */
8793         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
8794     }
8795
8796     return ncg_new;
8797 }
8798
8799 static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
8800 {
8801     gmx_cgsort_t *sort;
8802     int  ncg_new,i,*a,na;
8803
8804     sort = dd->comm->sort->sort;
8805
8806     nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
8807
8808     ncg_new = 0;
8809     for(i=0; i<na; i++)
8810     {
8811         if (a[i] >= 0)
8812         {
8813             sort[ncg_new].ind = a[i];
8814             ncg_new++;
8815         }
8816     }
8817
8818     return ncg_new;
8819 }
8820
8821 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
8822                           rvec *cgcm,t_forcerec *fr,t_state *state,
8823                           int ncg_home_old)
8824 {
8825     gmx_domdec_sort_t *sort;
8826     gmx_cgsort_t *cgsort,*sort_i;
8827     int  *cgindex;
8828     int  ncg_new,i,*ibuf,cgsize;
8829     rvec *vbuf;
8830
8831     sort = dd->comm->sort;
8832
8833     if (dd->ncg_home > sort->sort_nalloc)
8834     {
8835         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8836         srenew(sort->sort,sort->sort_nalloc);
8837         srenew(sort->sort2,sort->sort_nalloc);
8838     }
8839     cgsort = sort->sort;
8840
8841     switch (fr->cutoff_scheme)
8842     {
8843     case ecutsGROUP:
8844         ncg_new = dd_sort_order(dd,fr,ncg_home_old);
8845         break;
8846     case ecutsVERLET:
8847         ncg_new = dd_sort_order_nbnxn(dd,fr);
8848         break;
8849     default:
8850         gmx_incons("unimplemented");
8851         ncg_new = 0;
8852     }
8853
8854     /* We alloc with the old size, since cgindex is still old */
8855     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
8856     vbuf = dd->comm->vbuf.v;
8857
8858     if (dd->comm->bCGs)
8859     {
8860         cgindex = dd->cgindex;
8861     }
8862     else
8863     {
8864         cgindex = NULL;
8865     }
8866
8867     /* Remove the charge groups which are no longer at home here */
8868     dd->ncg_home = ncg_new;
8869     if (debug)
8870     {
8871         fprintf(debug,"Set the new home charge group count to %d\n",
8872                 dd->ncg_home);
8873     }
8874
8875     /* Reorder the state */
8876     for(i=0; i<estNR; i++)
8877     {
8878         if (EST_DISTR(i) && (state->flags & (1<<i)))
8879         {
8880             switch (i)
8881             {
8882             case estX:
8883                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
8884                 break;
8885             case estV:
8886                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
8887                 break;
8888             case estSDX:
8889                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
8890                 break;
8891             case estCGP:
8892                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
8893                 break;
8894             case estLD_RNG:
8895             case estLD_RNGI:
8896             case estDISRE_INITF:
8897             case estDISRE_RM3TAV:
8898             case estORIRE_INITF:
8899             case estORIRE_DTAV:
8900                 /* No ordering required */
8901                 break;
8902             default:
8903                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8904                 break;
8905             }
8906         }
8907     }
8908     if (fr->cutoff_scheme == ecutsGROUP)
8909     {
8910         /* Reorder cgcm */
8911         order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8912     }
8913
8914     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8915     {
8916         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8917         srenew(sort->ibuf,sort->ibuf_nalloc);
8918     }
8919     ibuf = sort->ibuf;
8920     /* Reorder the global cg index */
8921     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8922     /* Reorder the cginfo */
8923     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8924     /* Rebuild the local cg index */
8925     if (dd->comm->bCGs)
8926     {
8927         ibuf[0] = 0;
8928         for(i=0; i<dd->ncg_home; i++)
8929         {
8930             cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8931             ibuf[i+1] = ibuf[i] + cgsize;
8932         }
8933         for(i=0; i<dd->ncg_home+1; i++)
8934         {
8935             dd->cgindex[i] = ibuf[i];
8936         }
8937     }
8938     else
8939     {
8940         for(i=0; i<dd->ncg_home+1; i++)
8941         {
8942             dd->cgindex[i] = i;
8943         }
8944     }
8945     /* Set the home atom number */
8946     dd->nat_home = dd->cgindex[dd->ncg_home];
8947
8948     if (fr->cutoff_scheme == ecutsVERLET)
8949     {
8950         /* The atoms are now exactly in grid order, update the grid order */
8951         nbnxn_set_atomorder(fr->nbv->nbs);
8952     }
8953     else
8954     {
8955         /* Copy the sorted ns cell indices back to the ns grid struct */
8956         for(i=0; i<dd->ncg_home; i++)
8957         {
8958             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8959         }
8960         fr->ns.grid->nr = dd->ncg_home;
8961     }
8962 }
8963
8964 static void add_dd_statistics(gmx_domdec_t *dd)
8965 {
8966     gmx_domdec_comm_t *comm;
8967     int ddnat;
8968
8969     comm = dd->comm;
8970
8971     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8972     {
8973         comm->sum_nat[ddnat-ddnatZONE] +=
8974             comm->nat[ddnat] - comm->nat[ddnat-1];
8975     }
8976     comm->ndecomp++;
8977 }
8978
8979 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8980 {
8981     gmx_domdec_comm_t *comm;
8982     int ddnat;
8983
8984     comm = dd->comm;
8985
8986     /* Reset all the statistics and counters for total run counting */
8987     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8988     {
8989         comm->sum_nat[ddnat-ddnatZONE] = 0;
8990     }
8991     comm->ndecomp = 0;
8992     comm->nload = 0;
8993     comm->load_step = 0;
8994     comm->load_sum = 0;
8995     comm->load_max = 0;
8996     clear_ivec(comm->load_lim);
8997     comm->load_mdf = 0;
8998     comm->load_pme = 0;
8999 }
9000
9001 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
9002 {
9003     gmx_domdec_comm_t *comm;
9004     int ddnat;
9005     double av;
9006
9007     comm = cr->dd->comm;
9008
9009     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
9010
9011     if (fplog == NULL)
9012     {
9013         return;
9014     }
9015
9016     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9017
9018     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
9019     {
9020         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9021         switch(ddnat)
9022         {
9023         case ddnatZONE:
9024             fprintf(fplog,
9025                     " av. #atoms communicated per step for force:  %d x %.1f\n",
9026                     2,av);
9027             break;
9028         case ddnatVSITE:
9029             if (cr->dd->vsite_comm)
9030             {
9031                 fprintf(fplog,
9032                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
9033                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
9034                         av);
9035             }
9036             break;
9037         case ddnatCON:
9038             if (cr->dd->constraint_comm)
9039             {
9040                 fprintf(fplog,
9041                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9042                         1 + ir->nLincsIter,av);
9043             }
9044             break;
9045         default:
9046             gmx_incons(" Unknown type for DD statistics");
9047         }
9048     }
9049     fprintf(fplog,"\n");
9050
9051     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9052     {
9053         print_dd_load_av(fplog,cr->dd);
9054     }
9055 }
9056
9057 void dd_partition_system(FILE            *fplog,
9058                          gmx_large_int_t      step,
9059                          t_commrec       *cr,
9060                          gmx_bool            bMasterState,
9061                          int             nstglobalcomm,
9062                          t_state         *state_global,
9063                          gmx_mtop_t      *top_global,
9064                          t_inputrec      *ir,
9065                          t_state         *state_local,
9066                          rvec            **f,
9067                          t_mdatoms       *mdatoms,
9068                          gmx_localtop_t  *top_local,
9069                          t_forcerec      *fr,
9070                          gmx_vsite_t     *vsite,
9071                          gmx_shellfc_t   shellfc,
9072                          gmx_constr_t    constr,
9073                          t_nrnb          *nrnb,
9074                          gmx_wallcycle_t wcycle,
9075                          gmx_bool            bVerbose)
9076 {
9077     gmx_domdec_t *dd;
9078     gmx_domdec_comm_t *comm;
9079     gmx_ddbox_t ddbox={0};
9080     t_block *cgs_gl;
9081     gmx_large_int_t step_pcoupl;
9082     rvec cell_ns_x0,cell_ns_x1;
9083     int  i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
9084     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
9085     gmx_bool bRedist,bSortCG,bResortAll;
9086     ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
9087     real grid_density;
9088     char sbuf[22];
9089
9090     dd = cr->dd;
9091     comm = dd->comm;
9092
9093     bBoxChanged = (bMasterState || DEFORM(*ir));
9094     if (ir->epc != epcNO)
9095     {
9096         /* With nstpcouple > 1 pressure coupling happens.
9097          * one step after calculating the pressure.
9098          * Box scaling happens at the end of the MD step,
9099          * after the DD partitioning.
9100          * We therefore have to do DLB in the first partitioning
9101          * after an MD step where P-coupling occured.
9102          * We need to determine the last step in which p-coupling occurred.
9103          * MRS -- need to validate this for vv?
9104          */
9105         n = ir->nstpcouple;
9106         if (n == 1)
9107         {
9108             step_pcoupl = step - 1;
9109         }
9110         else
9111         {
9112             step_pcoupl = ((step - 1)/n)*n + 1;
9113         }
9114         if (step_pcoupl >= comm->partition_step)
9115         {
9116             bBoxChanged = TRUE;
9117         }
9118     }
9119
9120     bNStGlobalComm = (step % nstglobalcomm == 0);
9121
9122     if (!comm->bDynLoadBal)
9123     {
9124         bDoDLB = FALSE;
9125     }
9126     else
9127     {
9128         /* Should we do dynamic load balacing this step?
9129          * Since it requires (possibly expensive) global communication,
9130          * we might want to do DLB less frequently.
9131          */
9132         if (bBoxChanged || ir->epc != epcNO)
9133         {
9134             bDoDLB = bBoxChanged;
9135         }
9136         else
9137         {
9138             bDoDLB = bNStGlobalComm;
9139         }
9140     }
9141
9142     /* Check if we have recorded loads on the nodes */
9143     if (comm->bRecordLoad && dd_load_count(comm))
9144     {
9145         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9146         {
9147             /* Check if we should use DLB at the second partitioning
9148              * and every 100 partitionings,
9149              * so the extra communication cost is negligible.
9150              */
9151             n = max(100,nstglobalcomm);
9152             bCheckDLB = (comm->n_load_collect == 0 ||
9153                          comm->n_load_have % n == n-1);
9154         }
9155         else
9156         {
9157             bCheckDLB = FALSE;
9158         }
9159
9160         /* Print load every nstlog, first and last step to the log file */
9161         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9162                     comm->n_load_collect == 0 ||
9163                     (ir->nsteps >= 0 &&
9164                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9165
9166         /* Avoid extra communication due to verbose screen output
9167          * when nstglobalcomm is set.
9168          */
9169         if (bDoDLB || bLogLoad || bCheckDLB ||
9170             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9171         {
9172             get_load_distribution(dd,wcycle);
9173             if (DDMASTER(dd))
9174             {
9175                 if (bLogLoad)
9176                 {
9177                     dd_print_load(fplog,dd,step-1);
9178                 }
9179                 if (bVerbose)
9180                 {
9181                     dd_print_load_verbose(dd);
9182                 }
9183             }
9184             comm->n_load_collect++;
9185
9186             if (bCheckDLB) {
9187                 /* Since the timings are node dependent, the master decides */
9188                 if (DDMASTER(dd))
9189                 {
9190                     bTurnOnDLB =
9191                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9192                     if (debug)
9193                     {
9194                         fprintf(debug,"step %s, imb loss %f\n",
9195                                 gmx_step_str(step,sbuf),
9196                                 dd_force_imb_perf_loss(dd));
9197                     }
9198                 }
9199                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
9200                 if (bTurnOnDLB)
9201                 {
9202                     turn_on_dlb(fplog,cr,step);
9203                     bDoDLB = TRUE;
9204                 }
9205             }
9206         }
9207         comm->n_load_have++;
9208     }
9209
9210     cgs_gl = &comm->cgs_gl;
9211
9212     bRedist = FALSE;
9213     if (bMasterState)
9214     {
9215         /* Clear the old state */
9216         clear_dd_indices(dd,0,0);
9217
9218         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
9219                   TRUE,cgs_gl,state_global->x,&ddbox);
9220
9221         get_cg_distribution(fplog,step,dd,cgs_gl,
9222                             state_global->box,&ddbox,state_global->x);
9223
9224         dd_distribute_state(dd,cgs_gl,
9225                             state_global,state_local,f);
9226
9227         dd_make_local_cgs(dd,&top_local->cgs);
9228
9229         /* Ensure that we have space for the new distribution */
9230         dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
9231
9232         if (fr->cutoff_scheme == ecutsGROUP)
9233         {
9234             calc_cgcm(fplog,0,dd->ncg_home,
9235                       &top_local->cgs,state_local->x,fr->cg_cm);
9236         }
9237
9238         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9239
9240         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9241
9242         cg0 = 0;
9243     }
9244     else if (state_local->ddp_count != dd->ddp_count)
9245     {
9246         if (state_local->ddp_count > dd->ddp_count)
9247         {
9248             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
9249         }
9250
9251         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9252         {
9253             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
9254         }
9255
9256         /* Clear the old state */
9257         clear_dd_indices(dd,0,0);
9258
9259         /* Build the new indices */
9260         rebuild_cgindex(dd,cgs_gl->index,state_local);
9261         make_dd_indices(dd,cgs_gl->index,0);
9262
9263         if (fr->cutoff_scheme == ecutsGROUP)
9264         {
9265             /* Redetermine the cg COMs */
9266             calc_cgcm(fplog,0,dd->ncg_home,
9267                       &top_local->cgs,state_local->x,fr->cg_cm);
9268         }
9269
9270         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9271
9272         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9273
9274         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9275                   TRUE,&top_local->cgs,state_local->x,&ddbox);
9276
9277         bRedist = comm->bDynLoadBal;
9278     }
9279     else
9280     {
9281         /* We have the full state, only redistribute the cgs */
9282
9283         /* Clear the non-home indices */
9284         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
9285
9286         /* Avoid global communication for dim's without pbc and -gcom */
9287         if (!bNStGlobalComm)
9288         {
9289             copy_rvec(comm->box0    ,ddbox.box0    );
9290             copy_rvec(comm->box_size,ddbox.box_size);
9291         }
9292         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9293                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
9294
9295         bBoxChanged = TRUE;
9296         bRedist = TRUE;
9297     }
9298     /* For dim's without pbc and -gcom */
9299     copy_rvec(ddbox.box0    ,comm->box0    );
9300     copy_rvec(ddbox.box_size,comm->box_size);
9301
9302     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
9303                       step,wcycle);
9304
9305     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9306     {
9307         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
9308     }
9309
9310     /* Check if we should sort the charge groups */
9311     if (comm->nstSortCG > 0)
9312     {
9313         bSortCG = (bMasterState ||
9314                    (bRedist && (step % comm->nstSortCG == 0)));
9315     }
9316     else
9317     {
9318         bSortCG = FALSE;
9319     }
9320
9321     ncg_home_old = dd->ncg_home;
9322
9323     ncg_moved = 0;
9324     if (bRedist)
9325     {
9326         wallcycle_sub_start(wcycle,ewcsDD_REDIST);
9327
9328         dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
9329                            state_local,f,fr,mdatoms,
9330                            !bSortCG,nrnb,&cg0,&ncg_moved);
9331
9332         wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
9333     }
9334
9335     get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
9336                           dd,&ddbox,
9337                           &comm->cell_x0,&comm->cell_x1,
9338                           dd->ncg_home,fr->cg_cm,
9339                           cell_ns_x0,cell_ns_x1,&grid_density);
9340
9341     if (bBoxChanged)
9342     {
9343         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
9344     }
9345
9346     switch (fr->cutoff_scheme)
9347     {
9348     case ecutsGROUP:
9349         copy_ivec(fr->ns.grid->n,ncells_old);
9350         grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
9351                    state_local->box,cell_ns_x0,cell_ns_x1,
9352                    fr->rlistlong,grid_density);
9353         break;
9354     case ecutsVERLET:
9355         nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
9356         break;
9357     default:
9358         gmx_incons("unimplemented");
9359     }
9360     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9361     copy_ivec(ddbox.tric_dir,comm->tric_dir);
9362
9363     if (bSortCG)
9364     {
9365         wallcycle_sub_start(wcycle,ewcsDD_GRID);
9366
9367         /* Sort the state on charge group position.
9368          * This enables exact restarts from this step.
9369          * It also improves performance by about 15% with larger numbers
9370          * of atoms per node.
9371          */
9372
9373         /* Fill the ns grid with the home cell,
9374          * so we can sort with the indices.
9375          */
9376         set_zones_ncg_home(dd);
9377
9378         switch (fr->cutoff_scheme)
9379         {
9380         case ecutsVERLET:
9381             set_zones_size(dd,state_local->box,&ddbox,0,1);
9382
9383             nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
9384                               0,
9385                               comm->zones.size[0].bb_x0,
9386                               comm->zones.size[0].bb_x1,
9387                               0,dd->ncg_home,
9388                               comm->zones.dens_zone0,
9389                               fr->cginfo,
9390                               state_local->x,
9391                               ncg_moved,bRedist ? comm->moved : NULL,
9392                               fr->nbv->grp[eintLocal].kernel_type,
9393                               fr->nbv->grp[eintLocal].nbat);
9394
9395             nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
9396             break;
9397         case ecutsGROUP:
9398             fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
9399                       0,dd->ncg_home,fr->cg_cm);
9400
9401             copy_ivec(fr->ns.grid->n,ncells_new);
9402             break;
9403         default:
9404             gmx_incons("unimplemented");
9405         }
9406
9407         bResortAll = bMasterState;
9408
9409         /* Check if we can user the old order and ns grid cell indices
9410          * of the charge groups to sort the charge groups efficiently.
9411          */
9412         if (ncells_new[XX] != ncells_old[XX] ||
9413             ncells_new[YY] != ncells_old[YY] ||
9414             ncells_new[ZZ] != ncells_old[ZZ])
9415         {
9416             bResortAll = TRUE;
9417         }
9418
9419         if (debug)
9420         {
9421             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
9422                     gmx_step_str(step,sbuf),dd->ncg_home);
9423         }
9424         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
9425                       bResortAll ? -1 : ncg_home_old);
9426         /* Rebuild all the indices */
9427         cg0 = 0;
9428         ga2la_clear(dd->ga2la);
9429
9430         wallcycle_sub_stop(wcycle,ewcsDD_GRID);
9431     }
9432
9433     wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
9434
9435     /* Setup up the communication and communicate the coordinates */
9436     setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
9437
9438     /* Set the indices */
9439     make_dd_indices(dd,cgs_gl->index,cg0);
9440
9441     /* Set the charge group boundaries for neighbor searching */
9442     set_cg_boundaries(&comm->zones);
9443
9444     if (fr->cutoff_scheme == ecutsVERLET)
9445     {
9446         set_zones_size(dd,state_local->box,&ddbox,
9447                        bSortCG ? 1 : 0,comm->zones.n);
9448     }
9449
9450     wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
9451
9452     /*
9453     write_dd_pdb("dd_home",step,"dump",top_global,cr,
9454                  -1,state_local->x,state_local->box);
9455     */
9456
9457     wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
9458
9459     /* Extract a local topology from the global topology */
9460     for(i=0; i<dd->ndim; i++)
9461     {
9462         np[dd->dim[i]] = comm->cd[i].np;
9463     }
9464     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
9465                       comm->cellsize_min,np,
9466                       fr,
9467                       fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
9468                       vsite,top_global,top_local);
9469
9470     wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
9471
9472     wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
9473
9474     /* Set up the special atom communication */
9475     n = comm->nat[ddnatZONE];
9476     for(i=ddnatZONE+1; i<ddnatNR; i++)
9477     {
9478         switch(i)
9479         {
9480         case ddnatVSITE:
9481             if (vsite && vsite->n_intercg_vsite)
9482             {
9483                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
9484             }
9485             break;
9486         case ddnatCON:
9487             if (dd->bInterCGcons || dd->bInterCGsettles)
9488             {
9489                 /* Only for inter-cg constraints we need special code */
9490                 n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
9491                                               constr,ir->nProjOrder,
9492                                               top_local->idef.il);
9493             }
9494             break;
9495         default:
9496             gmx_incons("Unknown special atom type setup");
9497         }
9498         comm->nat[i] = n;
9499     }
9500
9501     wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
9502
9503     wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
9504
9505     /* Make space for the extra coordinates for virtual site
9506      * or constraint communication.
9507      */
9508     state_local->natoms = comm->nat[ddnatNR-1];
9509     if (state_local->natoms > state_local->nalloc)
9510     {
9511         dd_realloc_state(state_local,f,state_local->natoms);
9512     }
9513
9514     if (fr->bF_NoVirSum)
9515     {
9516         if (vsite && vsite->n_intercg_vsite)
9517         {
9518             nat_f_novirsum = comm->nat[ddnatVSITE];
9519         }
9520         else
9521         {
9522             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9523             {
9524                 nat_f_novirsum = dd->nat_tot;
9525             }
9526             else
9527             {
9528                 nat_f_novirsum = dd->nat_home;
9529             }
9530         }
9531     }
9532     else
9533     {
9534         nat_f_novirsum = 0;
9535     }
9536
9537     /* Set the number of atoms required for the force calculation.
9538      * Forces need to be constrained when using a twin-range setup
9539      * or with energy minimization. For simple simulations we could
9540      * avoid some allocation, zeroing and copying, but this is
9541      * probably not worth the complications ande checking.
9542      */
9543     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
9544                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
9545
9546     /* We make the all mdatoms up to nat_tot_con.
9547      * We could save some work by only setting invmass
9548      * between nat_tot and nat_tot_con.
9549      */
9550     /* This call also sets the new number of home particles to dd->nat_home */
9551     atoms2md(top_global,ir,
9552              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
9553
9554     /* Now we have the charges we can sort the FE interactions */
9555     dd_sort_local_top(dd,mdatoms,top_local);
9556
9557     if (vsite != NULL)
9558     {
9559         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9560         split_vsites_over_threads(top_local->idef.il,mdatoms,FALSE,vsite);
9561     }
9562
9563     if (shellfc)
9564     {
9565         /* Make the local shell stuff, currently no communication is done */
9566         make_local_shells(cr,mdatoms,shellfc);
9567     }
9568
9569         if (ir->implicit_solvent)
9570     {
9571         make_local_gb(cr,fr->born,ir->gb_algorithm);
9572     }
9573
9574     init_bonded_thread_force_reduction(fr,&top_local->idef);
9575
9576     if (!(cr->duty & DUTY_PME))
9577     {
9578         /* Send the charges to our PME only node */
9579         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
9580                        mdatoms->chargeA,mdatoms->chargeB,
9581                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
9582     }
9583
9584     if (constr)
9585     {
9586         set_constraints(constr,top_local,ir,mdatoms,cr);
9587     }
9588
9589     if (ir->ePull != epullNO)
9590     {
9591         /* Update the local pull groups */
9592         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
9593     }
9594
9595     if (ir->bRot)
9596     {
9597         /* Update the local rotation groups */
9598         dd_make_local_rotation_groups(dd,ir->rot);
9599     }
9600
9601
9602     add_dd_statistics(dd);
9603
9604     /* Make sure we only count the cycles for this DD partitioning */
9605     clear_dd_cycle_counts(dd);
9606
9607     /* Because the order of the atoms might have changed since
9608      * the last vsite construction, we need to communicate the constructing
9609      * atom coordinates again (for spreading the forces this MD step).
9610      */
9611     dd_move_x_vsites(dd,state_local->box,state_local->x);
9612
9613     wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
9614
9615     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9616     {
9617         dd_move_x(dd,state_local->box,state_local->x);
9618         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
9619                      -1,state_local->x,state_local->box);
9620     }
9621
9622     /* Store the partitioning step */
9623     comm->partition_step = step;
9624
9625     /* Increase the DD partitioning counter */
9626     dd->ddp_count++;
9627     /* The state currently matches this DD partitioning count, store it */
9628     state_local->ddp_count = dd->ddp_count;
9629     if (bMasterState)
9630     {
9631         /* The DD master node knows the complete cg distribution,
9632          * store the count so we can possibly skip the cg info communication.
9633          */
9634         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9635     }
9636
9637     if (comm->DD_debug > 0)
9638     {
9639         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9640         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
9641                                 "after partitioning");
9642     }
9643 }