src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "domdec.h"
  39
  40 #include "config.h"
  41
  42 #include <assert.h>
  43 #include <limits.h>
  44 #include <math.h>
  45 #include <stdio.h>
  46 #include <stdlib.h>
  47 #include <string.h>
  48
  49 #include <algorithm>
  50
  51 #include "gromacs/domdec/domdec_network.h"
  52 #include "gromacs/domdec/ga2la.h"
  53 #include "gromacs/ewald/pme.h"
  54 #include "gromacs/fileio/gmxfio.h"
  55 #include "gromacs/fileio/pdbio.h"
  56 #include "gromacs/gmxlib/chargegroup.h"
  57 #include "gromacs/gmxlib/network.h"
  58 #include "gromacs/gmxlib/nrnb.h"
  59 #include "gromacs/gpu_utils/gpu_utils.h"
  60 #include "gromacs/hardware/hw_info.h"
  61 #include "gromacs/imd/imd.h"
  62 #include "gromacs/listed-forces/manage-threading.h"
  63 #include "gromacs/math/functions.h"
  64 #include "gromacs/math/vec.h"
  65 #include "gromacs/math/vectypes.h"
  66 #include "gromacs/mdlib/constr.h"
  67 #include "gromacs/mdlib/force.h"
  68 #include "gromacs/mdlib/forcerec.h"
  69 #include "gromacs/mdlib/genborn.h"
  70 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  71 #include "gromacs/mdlib/mdatoms.h"
  72 #include "gromacs/mdlib/mdrun.h"
  73 #include "gromacs/mdlib/mdsetup.h"
  74 #include "gromacs/mdlib/nb_verlet.h"
  75 #include "gromacs/mdlib/nbnxn_grid.h"
  76 #include "gromacs/mdlib/nsgrid.h"
  77 #include "gromacs/mdlib/vsite.h"
  78 #include "gromacs/mdtypes/commrec.h"
  79 #include "gromacs/mdtypes/df_history.h"
  80 #include "gromacs/mdtypes/forcerec.h"
  81 #include "gromacs/mdtypes/inputrec.h"
  82 #include "gromacs/mdtypes/md_enums.h"
  83 #include "gromacs/mdtypes/mdatom.h"
  84 #include "gromacs/mdtypes/nblist.h"
  85 #include "gromacs/mdtypes/state.h"
  86 #include "gromacs/pbcutil/ishift.h"
  87 #include "gromacs/pbcutil/pbc.h"
  88 #include "gromacs/pulling/pull.h"
  89 #include "gromacs/pulling/pull_rotation.h"
  90 #include "gromacs/swap/swapcoords.h"
  91 #include "gromacs/timing/wallcycle.h"
  92 #include "gromacs/topology/block.h"
  93 #include "gromacs/topology/idef.h"
  94 #include "gromacs/topology/ifunc.h"
  95 #include "gromacs/topology/mtop_lookup.h"
  96 #include "gromacs/topology/mtop_util.h"
  97 #include "gromacs/topology/topology.h"
  98 #include "gromacs/utility/basedefinitions.h"
  99 #include "gromacs/utility/basenetwork.h"
 100 #include "gromacs/utility/cstringutil.h"
 101 #include "gromacs/utility/exceptions.h"
 102 #include "gromacs/utility/fatalerror.h"
 103 #include "gromacs/utility/gmxmpi.h"
 104 #include "gromacs/utility/qsort_threadsafe.h"
 105 #include "gromacs/utility/real.h"
 106 #include "gromacs/utility/smalloc.h"
 107
 108 #include "domdec_constraints.h"
 109 #include "domdec_internal.h"
 110 #include "domdec_vsite.h"
 111
 112 #define DDRANK(dd, rank)    (rank)
 113 #define DDMASTERRANK(dd)   (dd->masterrank)
 114
 115 struct gmx_domdec_master_t
 116 {
 117     /* The cell boundaries */
 118     real **cell_x;
 119     /* The global charge group division */
 120     int   *ncg;    /* Number of home charge groups for each node */
 121     int   *index;  /* Index of nnodes+1 into cg */
 122     int   *cg;     /* Global charge group index */
 123     int   *nat;    /* Number of home atoms for each node. */
 124     int   *ibuf;   /* Buffer for communication */
 125     rvec  *vbuf;   /* Buffer for state scattering and gathering */
 126 };
 127
 128 #define DD_NLOAD_MAX 9
 129
 130 const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
 131
 132 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 133 #define DD_CGIBS 2
 134
 135 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 136 #define DD_FLAG_NRCG  65535
 137 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 138 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 139
 140 /* The DD zone order */
 141 static const ivec dd_zo[DD_MAXZONE] =
 142 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 143
 144 /* The non-bonded zone-pair setup for domain decomposition
 145  * The first number is the i-zone, the second number the first j-zone seen by
 146  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 147  * As is, this is for 3D decomposition, where there are 4 i-zones.
 148  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 149  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 150  */
 151 static const int
 152     ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
 153                                                  {1, 3, 6},
 154                                                  {2, 5, 6},
 155                                                  {3, 5, 7}};
 156
 157 /* Factors used to avoid problems due to rounding issues */
 158 #define DD_CELL_MARGIN       1.0001
 159 #define DD_CELL_MARGIN2      1.00005
 160 /* Factor to account for pressure scaling during nstlist steps */
 161 #define DD_PRES_SCALE_MARGIN 1.02
 162
 163 /* Turn on DLB when the load imbalance causes this amount of total loss.
 164  * There is a bit of overhead with DLB and it's difficult to achieve
 165  * a load imbalance of less than 2% with DLB.
 166  */
 167 #define DD_PERF_LOSS_DLB_ON  0.02
 168
 169 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
 170 #define DD_PERF_LOSS_WARN    0.05
 171
 172 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 173
 174 /* Use separate MPI send and receive commands
 175  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 176  * This saves memory (and some copying for small nnodes).
 177  * For high parallelization scatter and gather calls are used.
 178  */
 179 #define GMX_DD_NNODES_SENDRECV 4
 180
 181
 182 /* We check if to turn on DLB at the first and every 100 DD partitionings.
 183  * With large imbalance DLB will turn on at the first step, so we can
 184  * make the interval so large that the MPI overhead of the check is negligible.
 185  */
 186 static const int c_checkTurnDlbOnInterval  = 100;
 187 /* We need to check if DLB results in worse performance and then turn it off.
 188  * We check this more often then for turning DLB on, because the DLB can scale
 189  * the domains very rapidly, so if unlucky the load imbalance can go up quickly
 190  * and furthermore, we are already synchronizing often with DLB, so
 191  * the overhead of the MPI Bcast is not that high.
 192  */
 193 static const int c_checkTurnDlbOffInterval =  20;
 194
 195 /* Forward declaration */
 196 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue);
 197
 198
 199 /*
 200    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 201
 202    static void index2xyz(ivec nc,int ind,ivec xyz)
 203    {
 204    xyz[XX] = ind % nc[XX];
 205    xyz[YY] = (ind / nc[XX]) % nc[YY];
 206    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 207    }
 208  */
 209
 210 /* This order is required to minimize the coordinate communication in PME
 211  * which uses decomposition in the x direction.
 212  */
 213 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 214
 215 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 216 {
 217     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 218     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 219     xyz[ZZ] = ind % nc[ZZ];
 220 }
 221
 222 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 223 {
 224     int ddindex;
 225     int ddnodeid = -1;
 226
 227     ddindex = dd_index(dd->nc, c);
 228     if (dd->comm->bCartesianPP_PME)
 229     {
 230         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 231     }
 232     else if (dd->comm->bCartesianPP)
 233     {
 234 #if GMX_MPI
 235         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 236 #endif
 237     }
 238     else
 239     {
 240         ddnodeid = ddindex;
 241     }
 242
 243     return ddnodeid;
 244 }
 245
 246 static gmx_bool dynamic_dd_box(const gmx_ddbox_t *ddbox, const t_inputrec *ir)
 247 {
 248     return (ddbox->nboundeddim < DIM || inputrecDynamicBox(ir));
 249 }
 250
 251 int ddglatnr(const gmx_domdec_t *dd, int i)
 252 {
 253     int atnr;
 254
 255     if (dd == NULL)
 256     {
 257         atnr = i + 1;
 258     }
 259     else
 260     {
 261         if (i >= dd->comm->nat[ddnatNR-1])
 262         {
 263             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 264         }
 265         atnr = dd->gatindex[i] + 1;
 266     }
 267
 268     return atnr;
 269 }
 270
 271 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 272 {
 273     return &dd->comm->cgs_gl;
 274 }
 275
 276 static bool dlbIsOn(const gmx_domdec_comm_t *comm)
 277 {
 278     return (comm->dlbState == edlbsOnCanTurnOff ||
 279             comm->dlbState == edlbsOnForever);
 280 }
 281
 282 static void vec_rvec_init(vec_rvec_t *v)
 283 {
 284     v->nalloc = 0;
 285     v->v      = NULL;
 286 }
 287
 288 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 289 {
 290     if (n > v->nalloc)
 291     {
 292         v->nalloc = over_alloc_dd(n);
 293         srenew(v->v, v->nalloc);
 294     }
 295 }
 296
 297 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 298 {
 299     int i;
 300
 301     if (state->ddp_count != dd->ddp_count)
 302     {
 303         gmx_incons("The state does not the domain decomposition state");
 304     }
 305
 306     state->cg_gl.resize(dd->ncg_home);
 307     for (i = 0; i < dd->ncg_home; i++)
 308     {
 309         state->cg_gl[i] = dd->index_gl[i];
 310     }
 311
 312     state->ddp_count_cg_gl = dd->ddp_count;
 313 }
 314
 315 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 316 {
 317     return &dd->comm->zones;
 318 }
 319
 320 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
 321                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 322 {
 323     gmx_domdec_zones_t *zones;
 324     int                 izone, d, dim;
 325
 326     zones = &dd->comm->zones;
 327
 328     izone = 0;
 329     while (icg >= zones->izone[izone].cg1)
 330     {
 331         izone++;
 332     }
 333
 334     if (izone == 0)
 335     {
 336         *jcg0 = icg;
 337     }
 338     else if (izone < zones->nizone)
 339     {
 340         *jcg0 = zones->izone[izone].jcg0;
 341     }
 342     else
 343     {
 344         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 345                   icg, izone, zones->nizone);
 346     }
 347
 348     *jcg1 = zones->izone[izone].jcg1;
 349
 350     for (d = 0; d < dd->ndim; d++)
 351     {
 352         dim         = dd->dim[d];
 353         shift0[dim] = zones->izone[izone].shift0[dim];
 354         shift1[dim] = zones->izone[izone].shift1[dim];
 355         if (dd->comm->tric_dir[dim] || (dlbIsOn(dd->comm) && d > 0))
 356         {
 357             /* A conservative approach, this can be optimized */
 358             shift0[dim] -= 1;
 359             shift1[dim] += 1;
 360         }
 361     }
 362 }
 363
 364 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
 365 {
 366     /* We currently set mdatoms entries for all atoms:
 367      * local + non-local + communicated for vsite + constraints
 368      */
 369
 370     return dd->comm->nat[ddnatNR - 1];
 371 }
 372
 373 int dd_natoms_vsite(const gmx_domdec_t *dd)
 374 {
 375     return dd->comm->nat[ddnatVSITE];
 376 }
 377
 378 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
 379 {
 380     *at_start = dd->comm->nat[ddnatCON-1];
 381     *at_end   = dd->comm->nat[ddnatCON];
 382 }
 383
 384 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 385 {
 386     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 387     int                   *index, *cgindex;
 388     gmx_domdec_comm_t     *comm;
 389     gmx_domdec_comm_dim_t *cd;
 390     gmx_domdec_ind_t      *ind;
 391     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 392     gmx_bool               bPBC, bScrew;
 393
 394     comm = dd->comm;
 395
 396     cgindex = dd->cgindex;
 397
 398     buf = comm->vbuf.v;
 399
 400     nzone   = 1;
 401     nat_tot = dd->nat_home;
 402     for (d = 0; d < dd->ndim; d++)
 403     {
 404         bPBC   = (dd->ci[dd->dim[d]] == 0);
 405         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 406         if (bPBC)
 407         {
 408             copy_rvec(box[dd->dim[d]], shift);
 409         }
 410         cd = &comm->cd[d];
 411         for (p = 0; p < cd->np; p++)
 412         {
 413             ind   = &cd->ind[p];
 414             index = ind->index;
 415             n     = 0;
 416             if (!bPBC)
 417             {
 418                 for (i = 0; i < ind->nsend[nzone]; i++)
 419                 {
 420                     at0 = cgindex[index[i]];
 421                     at1 = cgindex[index[i]+1];
 422                     for (j = at0; j < at1; j++)
 423                     {
 424                         copy_rvec(x[j], buf[n]);
 425                         n++;
 426                     }
 427                 }
 428             }
 429             else if (!bScrew)
 430             {
 431                 for (i = 0; i < ind->nsend[nzone]; i++)
 432                 {
 433                     at0 = cgindex[index[i]];
 434                     at1 = cgindex[index[i]+1];
 435                     for (j = at0; j < at1; j++)
 436                     {
 437                         /* We need to shift the coordinates */
 438                         rvec_add(x[j], shift, buf[n]);
 439                         n++;
 440                     }
 441                 }
 442             }
 443             else
 444             {
 445                 for (i = 0; i < ind->nsend[nzone]; i++)
 446                 {
 447                     at0 = cgindex[index[i]];
 448                     at1 = cgindex[index[i]+1];
 449                     for (j = at0; j < at1; j++)
 450                     {
 451                         /* Shift x */
 452                         buf[n][XX] = x[j][XX] + shift[XX];
 453                         /* Rotate y and z.
 454                          * This operation requires a special shift force
 455                          * treatment, which is performed in calc_vir.
 456                          */
 457                         buf[n][YY] = box[YY][YY] - x[j][YY];
 458                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 459                         n++;
 460                     }
 461                 }
 462             }
 463
 464             if (cd->bInPlace)
 465             {
 466                 rbuf = x + nat_tot;
 467             }
 468             else
 469             {
 470                 rbuf = comm->vbuf2.v;
 471             }
 472             /* Send and receive the coordinates */
 473             dd_sendrecv_rvec(dd, d, dddirBackward,
 474                              buf,  ind->nsend[nzone+1],
 475                              rbuf, ind->nrecv[nzone+1]);
 476             if (!cd->bInPlace)
 477             {
 478                 j = 0;
 479                 for (zone = 0; zone < nzone; zone++)
 480                 {
 481                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 482                     {
 483                         copy_rvec(rbuf[j], x[i]);
 484                         j++;
 485                     }
 486                 }
 487             }
 488             nat_tot += ind->nrecv[nzone+1];
 489         }
 490         nzone += nzone;
 491     }
 492 }
 493
 494 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 495 {
 496     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 497     int                   *index, *cgindex;
 498     gmx_domdec_comm_t     *comm;
 499     gmx_domdec_comm_dim_t *cd;
 500     gmx_domdec_ind_t      *ind;
 501     rvec                  *buf, *sbuf;
 502     ivec                   vis;
 503     int                    is;
 504     gmx_bool               bShiftForcesNeedPbc, bScrew;
 505
 506     comm = dd->comm;
 507
 508     cgindex = dd->cgindex;
 509
 510     buf = comm->vbuf.v;
 511
 512     nzone   = comm->zones.n/2;
 513     nat_tot = dd->nat_tot;
 514     for (d = dd->ndim-1; d >= 0; d--)
 515     {
 516         /* Only forces in domains near the PBC boundaries need to
 517            consider PBC in the treatment of fshift */
 518         bShiftForcesNeedPbc   = (dd->ci[dd->dim[d]] == 0);
 519         bScrew                = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
 520         if (fshift == NULL && !bScrew)
 521         {
 522             bShiftForcesNeedPbc = FALSE;
 523         }
 524         /* Determine which shift vector we need */
 525         clear_ivec(vis);
 526         vis[dd->dim[d]] = 1;
 527         is              = IVEC2IS(vis);
 528
 529         cd = &comm->cd[d];
 530         for (p = cd->np-1; p >= 0; p--)
 531         {
 532             ind      = &cd->ind[p];
 533             nat_tot -= ind->nrecv[nzone+1];
 534             if (cd->bInPlace)
 535             {
 536                 sbuf = f + nat_tot;
 537             }
 538             else
 539             {
 540                 sbuf = comm->vbuf2.v;
 541                 j    = 0;
 542                 for (zone = 0; zone < nzone; zone++)
 543                 {
 544                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 545                     {
 546                         copy_rvec(f[i], sbuf[j]);
 547                         j++;
 548                     }
 549                 }
 550             }
 551             /* Communicate the forces */
 552             dd_sendrecv_rvec(dd, d, dddirForward,
 553                              sbuf, ind->nrecv[nzone+1],
 554                              buf,  ind->nsend[nzone+1]);
 555             index = ind->index;
 556             /* Add the received forces */
 557             n = 0;
 558             if (!bShiftForcesNeedPbc)
 559             {
 560                 for (i = 0; i < ind->nsend[nzone]; i++)
 561                 {
 562                     at0 = cgindex[index[i]];
 563                     at1 = cgindex[index[i]+1];
 564                     for (j = at0; j < at1; j++)
 565                     {
 566                         rvec_inc(f[j], buf[n]);
 567                         n++;
 568                     }
 569                 }
 570             }
 571             else if (!bScrew)
 572             {
 573                 /* fshift should always be defined if this function is
 574                  * called when bShiftForcesNeedPbc is true */
 575                 assert(NULL != fshift);
 576                 for (i = 0; i < ind->nsend[nzone]; i++)
 577                 {
 578                     at0 = cgindex[index[i]];
 579                     at1 = cgindex[index[i]+1];
 580                     for (j = at0; j < at1; j++)
 581                     {
 582                         rvec_inc(f[j], buf[n]);
 583                         /* Add this force to the shift force */
 584                         rvec_inc(fshift[is], buf[n]);
 585                         n++;
 586                     }
 587                 }
 588             }
 589             else
 590             {
 591                 for (i = 0; i < ind->nsend[nzone]; i++)
 592                 {
 593                     at0 = cgindex[index[i]];
 594                     at1 = cgindex[index[i]+1];
 595                     for (j = at0; j < at1; j++)
 596                     {
 597                         /* Rotate the force */
 598                         f[j][XX] += buf[n][XX];
 599                         f[j][YY] -= buf[n][YY];
 600                         f[j][ZZ] -= buf[n][ZZ];
 601                         if (fshift)
 602                         {
 603                             /* Add this force to the shift force */
 604                             rvec_inc(fshift[is], buf[n]);
 605                         }
 606                         n++;
 607                     }
 608                 }
 609             }
 610         }
 611         nzone /= 2;
 612     }
 613 }
 614
 615 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 616 {
 617     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 618     int                   *index, *cgindex;
 619     gmx_domdec_comm_t     *comm;
 620     gmx_domdec_comm_dim_t *cd;
 621     gmx_domdec_ind_t      *ind;
 622     real                  *buf, *rbuf;
 623
 624     comm = dd->comm;
 625
 626     cgindex = dd->cgindex;
 627
 628     buf = &comm->vbuf.v[0][0];
 629
 630     nzone   = 1;
 631     nat_tot = dd->nat_home;
 632     for (d = 0; d < dd->ndim; d++)
 633     {
 634         cd = &comm->cd[d];
 635         for (p = 0; p < cd->np; p++)
 636         {
 637             ind   = &cd->ind[p];
 638             index = ind->index;
 639             n     = 0;
 640             for (i = 0; i < ind->nsend[nzone]; i++)
 641             {
 642                 at0 = cgindex[index[i]];
 643                 at1 = cgindex[index[i]+1];
 644                 for (j = at0; j < at1; j++)
 645                 {
 646                     buf[n] = v[j];
 647                     n++;
 648                 }
 649             }
 650
 651             if (cd->bInPlace)
 652             {
 653                 rbuf = v + nat_tot;
 654             }
 655             else
 656             {
 657                 rbuf = &comm->vbuf2.v[0][0];
 658             }
 659             /* Send and receive the coordinates */
 660             dd_sendrecv_real(dd, d, dddirBackward,
 661                              buf,  ind->nsend[nzone+1],
 662                              rbuf, ind->nrecv[nzone+1]);
 663             if (!cd->bInPlace)
 664             {
 665                 j = 0;
 666                 for (zone = 0; zone < nzone; zone++)
 667                 {
 668                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 669                     {
 670                         v[i] = rbuf[j];
 671                         j++;
 672                     }
 673                 }
 674             }
 675             nat_tot += ind->nrecv[nzone+1];
 676         }
 677         nzone += nzone;
 678     }
 679 }
 680
 681 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 682 {
 683     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 684     int                   *index, *cgindex;
 685     gmx_domdec_comm_t     *comm;
 686     gmx_domdec_comm_dim_t *cd;
 687     gmx_domdec_ind_t      *ind;
 688     real                  *buf, *sbuf;
 689
 690     comm = dd->comm;
 691
 692     cgindex = dd->cgindex;
 693
 694     buf = &comm->vbuf.v[0][0];
 695
 696     nzone   = comm->zones.n/2;
 697     nat_tot = dd->nat_tot;
 698     for (d = dd->ndim-1; d >= 0; d--)
 699     {
 700         cd = &comm->cd[d];
 701         for (p = cd->np-1; p >= 0; p--)
 702         {
 703             ind      = &cd->ind[p];
 704             nat_tot -= ind->nrecv[nzone+1];
 705             if (cd->bInPlace)
 706             {
 707                 sbuf = v + nat_tot;
 708             }
 709             else
 710             {
 711                 sbuf = &comm->vbuf2.v[0][0];
 712                 j    = 0;
 713                 for (zone = 0; zone < nzone; zone++)
 714                 {
 715                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 716                     {
 717                         sbuf[j] = v[i];
 718                         j++;
 719                     }
 720                 }
 721             }
 722             /* Communicate the forces */
 723             dd_sendrecv_real(dd, d, dddirForward,
 724                              sbuf, ind->nrecv[nzone+1],
 725                              buf,  ind->nsend[nzone+1]);
 726             index = ind->index;
 727             /* Add the received forces */
 728             n = 0;
 729             for (i = 0; i < ind->nsend[nzone]; i++)
 730             {
 731                 at0 = cgindex[index[i]];
 732                 at1 = cgindex[index[i]+1];
 733                 for (j = at0; j < at1; j++)
 734                 {
 735                     v[j] += buf[n];
 736                     n++;
 737                 }
 738             }
 739         }
 740         nzone /= 2;
 741     }
 742 }
 743
 744 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 745 {
 746     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 747             d, i, j,
 748             zone->min0, zone->max1,
 749             zone->mch0, zone->mch0,
 750             zone->p1_0, zone->p1_1);
 751 }
 752
 753
 754 #define DDZONECOMM_MAXZONE  5
 755 #define DDZONECOMM_BUFSIZE  3
 756
 757 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 758                                int ddimind, int direction,
 759                                gmx_ddzone_t *buf_s, int n_s,
 760                                gmx_ddzone_t *buf_r, int n_r)
 761 {
 762 #define ZBS  DDZONECOMM_BUFSIZE
 763     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 764     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 765     int  i;
 766
 767     for (i = 0; i < n_s; i++)
 768     {
 769         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
 770         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
 771         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
 772         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
 773         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
 774         vbuf_s[i*ZBS+1][2] = 0;
 775         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
 776         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
 777         vbuf_s[i*ZBS+2][2] = 0;
 778     }
 779
 780     dd_sendrecv_rvec(dd, ddimind, direction,
 781                      vbuf_s, n_s*ZBS,
 782                      vbuf_r, n_r*ZBS);
 783
 784     for (i = 0; i < n_r; i++)
 785     {
 786         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
 787         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
 788         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
 789         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
 790         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
 791         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
 792         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
 793     }
 794
 795 #undef ZBS
 796 }
 797
 798 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 799                           rvec cell_ns_x0, rvec cell_ns_x1)
 800 {
 801     int                d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
 802     gmx_ddzone_t      *zp;
 803     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
 804     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
 805     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
 806     rvec               extr_s[2], extr_r[2];
 807     rvec               dh;
 808     real               dist_d, c = 0, det;
 809     gmx_domdec_comm_t *comm;
 810     gmx_bool           bPBC, bUse;
 811
 812     comm = dd->comm;
 813
 814     for (d = 1; d < dd->ndim; d++)
 815     {
 816         dim      = dd->dim[d];
 817         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 818         zp->min0 = cell_ns_x0[dim];
 819         zp->max1 = cell_ns_x1[dim];
 820         zp->min1 = cell_ns_x1[dim];
 821         zp->mch0 = cell_ns_x0[dim];
 822         zp->mch1 = cell_ns_x1[dim];
 823         zp->p1_0 = cell_ns_x0[dim];
 824         zp->p1_1 = cell_ns_x1[dim];
 825     }
 826
 827     for (d = dd->ndim-2; d >= 0; d--)
 828     {
 829         dim  = dd->dim[d];
 830         bPBC = (dim < ddbox->npbcdim);
 831
 832         /* Use an rvec to store two reals */
 833         extr_s[d][0] = comm->cell_f0[d+1];
 834         extr_s[d][1] = comm->cell_f1[d+1];
 835         extr_s[d][2] = comm->cell_f1[d+1];
 836
 837         pos = 0;
 838         /* Store the extremes in the backward sending buffer,
 839          * so the get updated separately from the forward communication.
 840          */
 841         for (d1 = d; d1 < dd->ndim-1; d1++)
 842         {
 843             /* We invert the order to be able to use the same loop for buf_e */
 844             buf_s[pos].min0 = extr_s[d1][1];
 845             buf_s[pos].max1 = extr_s[d1][0];
 846             buf_s[pos].min1 = extr_s[d1][2];
 847             buf_s[pos].mch0 = 0;
 848             buf_s[pos].mch1 = 0;
 849             /* Store the cell corner of the dimension we communicate along */
 850             buf_s[pos].p1_0 = comm->cell_x0[dim];
 851             buf_s[pos].p1_1 = 0;
 852             pos++;
 853         }
 854
 855         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 856         pos++;
 857
 858         if (dd->ndim == 3 && d == 0)
 859         {
 860             buf_s[pos] = comm->zone_d2[0][1];
 861             pos++;
 862             buf_s[pos] = comm->zone_d1[0];
 863             pos++;
 864         }
 865
 866         /* We only need to communicate the extremes
 867          * in the forward direction
 868          */
 869         npulse = comm->cd[d].np;
 870         if (bPBC)
 871         {
 872             /* Take the minimum to avoid double communication */
 873             npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
 874         }
 875         else
 876         {
 877             /* Without PBC we should really not communicate over
 878              * the boundaries, but implementing that complicates
 879              * the communication setup and therefore we simply
 880              * do all communication, but ignore some data.
 881              */
 882             npulse_min = npulse;
 883         }
 884         for (p = 0; p < npulse_min; p++)
 885         {
 886             /* Communicate the extremes forward */
 887             bUse = (bPBC || dd->ci[dim] > 0);
 888
 889             dd_sendrecv_rvec(dd, d, dddirForward,
 890                              extr_s+d, dd->ndim-d-1,
 891                              extr_r+d, dd->ndim-d-1);
 892
 893             if (bUse)
 894             {
 895                 for (d1 = d; d1 < dd->ndim-1; d1++)
 896                 {
 897                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
 898                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
 899                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
 900                 }
 901             }
 902         }
 903
 904         buf_size = pos;
 905         for (p = 0; p < npulse; p++)
 906         {
 907             /* Communicate all the zone information backward */
 908             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
 909
 910             dd_sendrecv_ddzone(dd, d, dddirBackward,
 911                                buf_s, buf_size,
 912                                buf_r, buf_size);
 913
 914             clear_rvec(dh);
 915             if (p > 0)
 916             {
 917                 for (d1 = d+1; d1 < dd->ndim; d1++)
 918                 {
 919                     /* Determine the decrease of maximum required
 920                      * communication height along d1 due to the distance along d,
 921                      * this avoids a lot of useless atom communication.
 922                      */
 923                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 924
 925                     if (ddbox->tric_dir[dim])
 926                     {
 927                         /* c is the off-diagonal coupling between the cell planes
 928                          * along directions d and d1.
 929                          */
 930                         c = ddbox->v[dim][dd->dim[d1]][dim];
 931                     }
 932                     else
 933                     {
 934                         c = 0;
 935                     }
 936                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
 937                     if (det > 0)
 938                     {
 939                         dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
 940                     }
 941                     else
 942                     {
 943                         /* A negative value signals out of range */
 944                         dh[d1] = -1;
 945                     }
 946                 }
 947             }
 948
 949             /* Accumulate the extremes over all pulses */
 950             for (i = 0; i < buf_size; i++)
 951             {
 952                 if (p == 0)
 953                 {
 954                     buf_e[i] = buf_r[i];
 955                 }
 956                 else
 957                 {
 958                     if (bUse)
 959                     {
 960                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
 961                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
 962                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
 963                     }
 964
 965                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
 966                     {
 967                         d1 = 1;
 968                     }
 969                     else
 970                     {
 971                         d1 = d + 1;
 972                     }
 973                     if (bUse && dh[d1] >= 0)
 974                     {
 975                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
 976                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
 977                     }
 978                 }
 979                 /* Copy the received buffer to the send buffer,
 980                  * to pass the data through with the next pulse.
 981                  */
 982                 buf_s[i] = buf_r[i];
 983             }
 984             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
 985                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
 986             {
 987                 /* Store the extremes */
 988                 pos = 0;
 989
 990                 for (d1 = d; d1 < dd->ndim-1; d1++)
 991                 {
 992                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
 993                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
 994                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
 995                     pos++;
 996                 }
 997
 998                 if (d == 1 || (d == 0 && dd->ndim == 3))
 999                 {
1000                     for (i = d; i < 2; i++)
1001                     {
1002                         comm->zone_d2[1-d][i] = buf_e[pos];
1003                         pos++;
1004                     }
1005                 }
1006                 if (d == 0)
1007                 {
1008                     comm->zone_d1[1] = buf_e[pos];
1009                     pos++;
1010                 }
1011             }
1012         }
1013     }
1014
1015     if (dd->ndim >= 2)
1016     {
1017         dim = dd->dim[1];
1018         for (i = 0; i < 2; i++)
1019         {
1020             if (debug)
1021             {
1022                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1023             }
1024             cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1025             cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1026         }
1027     }
1028     if (dd->ndim >= 3)
1029     {
1030         dim = dd->dim[2];
1031         for (i = 0; i < 2; i++)
1032         {
1033             for (j = 0; j < 2; j++)
1034             {
1035                 if (debug)
1036                 {
1037                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1038                 }
1039                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1040                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1041             }
1042         }
1043     }
1044     for (d = 1; d < dd->ndim; d++)
1045     {
1046         comm->cell_f_max0[d] = extr_s[d-1][0];
1047         comm->cell_f_min1[d] = extr_s[d-1][1];
1048         if (debug)
1049         {
1050             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1051                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1052         }
1053     }
1054 }
1055
1056 static void dd_collect_cg(gmx_domdec_t *dd,
1057                           t_state      *state_local)
1058 {
1059     gmx_domdec_master_t *ma = NULL;
1060     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1061
1062     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1063     {
1064         /* The master has the correct distribution */
1065         return;
1066     }
1067
1068     if (state_local->ddp_count == dd->ddp_count)
1069     {
1070         /* The local state and DD are in sync, use the DD indices */
1071         ncg_home = dd->ncg_home;
1072         cg       = dd->index_gl;
1073         nat_home = dd->nat_home;
1074     }
1075     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1076     {
1077         /* The DD is out of sync with the local state, but we have stored
1078          * the cg indices with the local state, so we can use those.
1079          */
1080         t_block *cgs_gl;
1081
1082         cgs_gl = &dd->comm->cgs_gl;
1083
1084         ncg_home = state_local->cg_gl.size();
1085         cg       = state_local->cg_gl.data();
1086         nat_home = 0;
1087         for (i = 0; i < ncg_home; i++)
1088         {
1089             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1090         }
1091     }
1092     else
1093     {
1094         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1095     }
1096
1097     buf2[0] = ncg_home;
1098     buf2[1] = nat_home;
1099     if (DDMASTER(dd))
1100     {
1101         ma   = dd->ma;
1102         ibuf = ma->ibuf;
1103     }
1104     else
1105     {
1106         ibuf = NULL;
1107     }
1108     /* Collect the charge group and atom counts on the master */
1109     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1110
1111     if (DDMASTER(dd))
1112     {
1113         ma->index[0] = 0;
1114         for (i = 0; i < dd->nnodes; i++)
1115         {
1116             ma->ncg[i]     = ma->ibuf[2*i];
1117             ma->nat[i]     = ma->ibuf[2*i+1];
1118             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1119
1120         }
1121         /* Make byte counts and indices */
1122         for (i = 0; i < dd->nnodes; i++)
1123         {
1124             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1125             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1126         }
1127         if (debug)
1128         {
1129             fprintf(debug, "Initial charge group distribution: ");
1130             for (i = 0; i < dd->nnodes; i++)
1131             {
1132                 fprintf(debug, " %d", ma->ncg[i]);
1133             }
1134             fprintf(debug, "\n");
1135         }
1136     }
1137
1138     /* Collect the charge group indices on the master */
1139     dd_gatherv(dd,
1140                ncg_home*sizeof(int), cg,
1141                DDMASTER(dd) ? ma->ibuf : NULL,
1142                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1143                DDMASTER(dd) ? ma->cg : NULL);
1144
1145     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1146 }
1147
1148 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1149                                     const rvec *lv, rvec *v)
1150 {
1151     gmx_domdec_master_t *ma;
1152     int                  n, i, c, a, nalloc = 0;
1153     rvec                *buf = NULL;
1154     t_block             *cgs_gl;
1155
1156     ma = dd->ma;
1157
1158     if (!DDMASTER(dd))
1159     {
1160 #if GMX_MPI
1161         MPI_Send(const_cast<void *>(static_cast<const void *>(lv)), dd->nat_home*sizeof(rvec), MPI_BYTE,
1162                  DDMASTERRANK(dd), dd->rank, dd->mpi_comm_all);
1163 #endif
1164     }
1165     else
1166     {
1167         /* Copy the master coordinates to the global array */
1168         cgs_gl = &dd->comm->cgs_gl;
1169
1170         n = DDMASTERRANK(dd);
1171         a = 0;
1172         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1173         {
1174             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1175             {
1176                 copy_rvec(lv[a++], v[c]);
1177             }
1178         }
1179
1180         for (n = 0; n < dd->nnodes; n++)
1181         {
1182             if (n != dd->rank)
1183             {
1184                 if (ma->nat[n] > nalloc)
1185                 {
1186                     nalloc = over_alloc_dd(ma->nat[n]);
1187                     srenew(buf, nalloc);
1188                 }
1189 #if GMX_MPI
1190                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1191                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1192 #endif
1193                 a = 0;
1194                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1195                 {
1196                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1197                     {
1198                         copy_rvec(buf[a++], v[c]);
1199                     }
1200                 }
1201             }
1202         }
1203         sfree(buf);
1204     }
1205 }
1206
1207 static void get_commbuffer_counts(gmx_domdec_t *dd,
1208                                   int **counts, int **disps)
1209 {
1210     gmx_domdec_master_t *ma;
1211     int                  n;
1212
1213     ma = dd->ma;
1214
1215     /* Make the rvec count and displacment arrays */
1216     *counts  = ma->ibuf;
1217     *disps   = ma->ibuf + dd->nnodes;
1218     for (n = 0; n < dd->nnodes; n++)
1219     {
1220         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1221         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1222     }
1223 }
1224
1225 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1226                                    const rvec *lv, rvec *v)
1227 {
1228     gmx_domdec_master_t *ma;
1229     int                 *rcounts = NULL, *disps = NULL;
1230     int                  n, i, c, a;
1231     rvec                *buf = NULL;
1232     t_block             *cgs_gl;
1233
1234     ma = dd->ma;
1235
1236     if (DDMASTER(dd))
1237     {
1238         get_commbuffer_counts(dd, &rcounts, &disps);
1239
1240         buf = ma->vbuf;
1241     }
1242
1243     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1244
1245     if (DDMASTER(dd))
1246     {
1247         cgs_gl = &dd->comm->cgs_gl;
1248
1249         a = 0;
1250         for (n = 0; n < dd->nnodes; n++)
1251         {
1252             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1253             {
1254                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1255                 {
1256                     copy_rvec(buf[a++], v[c]);
1257                 }
1258             }
1259         }
1260     }
1261 }
1262
1263 void dd_collect_vec(gmx_domdec_t           *dd,
1264                     t_state                *state_local,
1265                     const PaddedRVecVector *localVector,
1266                     rvec                   *v)
1267 {
1268     dd_collect_cg(dd, state_local);
1269
1270     const rvec *lv = as_rvec_array(localVector->data());
1271
1272     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1273     {
1274         dd_collect_vec_sendrecv(dd, lv, v);
1275     }
1276     else
1277     {
1278         dd_collect_vec_gatherv(dd, lv, v);
1279     }
1280 }
1281
1282 void dd_collect_vec(gmx_domdec_t           *dd,
1283                     t_state                *state_local,
1284                     const PaddedRVecVector *localVector,
1285                     PaddedRVecVector       *vector)
1286 {
1287     dd_collect_vec(dd, state_local, localVector, as_rvec_array(vector->data()));
1288 }
1289
1290
1291 void dd_collect_state(gmx_domdec_t *dd,
1292                       t_state *state_local, t_state *state)
1293 {
1294     int est, i, j, nh;
1295
1296     nh = state->nhchainlength;
1297
1298     if (DDMASTER(dd))
1299     {
1300         for (i = 0; i < efptNR; i++)
1301         {
1302             state->lambda[i] = state_local->lambda[i];
1303         }
1304         state->fep_state = state_local->fep_state;
1305         state->veta      = state_local->veta;
1306         state->vol0      = state_local->vol0;
1307         copy_mat(state_local->box, state->box);
1308         copy_mat(state_local->boxv, state->boxv);
1309         copy_mat(state_local->svir_prev, state->svir_prev);
1310         copy_mat(state_local->fvir_prev, state->fvir_prev);
1311         copy_mat(state_local->pres_prev, state->pres_prev);
1312
1313         for (i = 0; i < state_local->ngtc; i++)
1314         {
1315             for (j = 0; j < nh; j++)
1316             {
1317                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1318                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1319             }
1320             state->therm_integral[i] = state_local->therm_integral[i];
1321         }
1322         for (i = 0; i < state_local->nnhpres; i++)
1323         {
1324             for (j = 0; j < nh; j++)
1325             {
1326                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1327                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1328             }
1329         }
1330     }
1331     for (est = 0; est < estNR; est++)
1332     {
1333         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1334         {
1335             switch (est)
1336             {
1337                 case estX:
1338                     dd_collect_vec(dd, state_local, &state_local->x, &state->x);
1339                     break;
1340                 case estV:
1341                     dd_collect_vec(dd, state_local, &state_local->v, &state->v);
1342                     break;
1343                 case est_SDX_NOTSUPPORTED:
1344                     break;
1345                 case estCGP:
1346                     dd_collect_vec(dd, state_local, &state_local->cg_p, &state->cg_p);
1347                     break;
1348                 case estDISRE_INITF:
1349                 case estDISRE_RM3TAV:
1350                 case estORIRE_INITF:
1351                 case estORIRE_DTAV:
1352                     break;
1353                 default:
1354                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1355             }
1356         }
1357     }
1358 }
1359
1360 static void dd_resize_state(t_state *state, PaddedRVecVector *f, int natoms)
1361 {
1362     int est;
1363
1364     if (debug)
1365     {
1366         fprintf(debug, "Resizing state: currently %d, required %d\n", state->natoms, natoms);
1367     }
1368
1369     for (est = 0; est < estNR; est++)
1370     {
1371         if (EST_DISTR(est) && (state->flags & (1<<est)))
1372         {
1373             /* We need to allocate one element extra, since we might use
1374              * (unaligned) 4-wide SIMD loads to access rvec entries.
1375              */
1376             switch (est)
1377             {
1378                 case estX:
1379                     state->x.resize(natoms + 1);
1380                     break;
1381                 case estV:
1382                     state->v.resize(natoms + 1);
1383                     break;
1384                 case est_SDX_NOTSUPPORTED:
1385                     break;
1386                 case estCGP:
1387                     state->cg_p.resize(natoms + 1);
1388                     break;
1389                 case estDISRE_INITF:
1390                 case estDISRE_RM3TAV:
1391                 case estORIRE_INITF:
1392                 case estORIRE_DTAV:
1393                     /* No reallocation required */
1394                     break;
1395                 default:
1396                     gmx_incons("Unknown state entry encountered in dd_resize_state");
1397             }
1398         }
1399     }
1400
1401     if (f != NULL)
1402     {
1403         (*f).resize(natoms + 1);
1404     }
1405 }
1406
1407 static void dd_check_alloc_ncg(t_forcerec       *fr,
1408                                t_state          *state,
1409                                PaddedRVecVector *f,
1410                                int               numChargeGroups)
1411 {
1412     if (numChargeGroups > fr->cg_nalloc)
1413     {
1414         if (debug)
1415         {
1416             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, numChargeGroups, over_alloc_dd(numChargeGroups));
1417         }
1418         fr->cg_nalloc = over_alloc_dd(numChargeGroups);
1419         srenew(fr->cginfo, fr->cg_nalloc);
1420         if (fr->cutoff_scheme == ecutsGROUP)
1421         {
1422             srenew(fr->cg_cm, fr->cg_nalloc);
1423         }
1424     }
1425     if (fr->cutoff_scheme == ecutsVERLET)
1426     {
1427         /* We don't use charge groups, we use x in state to set up
1428          * the atom communication.
1429          */
1430         dd_resize_state(state, f, numChargeGroups);
1431     }
1432 }
1433
1434 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1435                                        rvec *v, rvec *lv)
1436 {
1437     gmx_domdec_master_t *ma;
1438     int                  n, i, c, a, nalloc = 0;
1439     rvec                *buf = NULL;
1440
1441     if (DDMASTER(dd))
1442     {
1443         ma  = dd->ma;
1444
1445         for (n = 0; n < dd->nnodes; n++)
1446         {
1447             if (n != dd->rank)
1448             {
1449                 if (ma->nat[n] > nalloc)
1450                 {
1451                     nalloc = over_alloc_dd(ma->nat[n]);
1452                     srenew(buf, nalloc);
1453                 }
1454                 /* Use lv as a temporary buffer */
1455                 a = 0;
1456                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1457                 {
1458                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1459                     {
1460                         copy_rvec(v[c], buf[a++]);
1461                     }
1462                 }
1463                 if (a != ma->nat[n])
1464                 {
1465                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1466                               a, ma->nat[n]);
1467                 }
1468
1469 #if GMX_MPI
1470                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1471                          DDRANK(dd, n), n, dd->mpi_comm_all);
1472 #endif
1473             }
1474         }
1475         sfree(buf);
1476         n = DDMASTERRANK(dd);
1477         a = 0;
1478         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1479         {
1480             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1481             {
1482                 copy_rvec(v[c], lv[a++]);
1483             }
1484         }
1485     }
1486     else
1487     {
1488 #if GMX_MPI
1489         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1490                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1491 #endif
1492     }
1493 }
1494
1495 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1496                                        rvec *v, rvec *lv)
1497 {
1498     gmx_domdec_master_t *ma;
1499     int                 *scounts = NULL, *disps = NULL;
1500     int                  n, i, c, a;
1501     rvec                *buf = NULL;
1502
1503     if (DDMASTER(dd))
1504     {
1505         ma  = dd->ma;
1506
1507         get_commbuffer_counts(dd, &scounts, &disps);
1508
1509         buf = ma->vbuf;
1510         a   = 0;
1511         for (n = 0; n < dd->nnodes; n++)
1512         {
1513             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1514             {
1515                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1516                 {
1517                     copy_rvec(v[c], buf[a++]);
1518                 }
1519             }
1520         }
1521     }
1522
1523     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1524 }
1525
1526 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1527 {
1528     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1529     {
1530         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1531     }
1532     else
1533     {
1534         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1535     }
1536 }
1537
1538 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1539 {
1540     if (dfhist == NULL)
1541     {
1542         return;
1543     }
1544
1545     dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1546     dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1547     dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1548
1549     if (dfhist->nlambda > 0)
1550     {
1551         int nlam = dfhist->nlambda;
1552         dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1553         dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1554         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1555         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1556         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1557         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1558
1559         for (int i = 0; i < nlam; i++)
1560         {
1561             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1562             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1563             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1564             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1565             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1566             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1567         }
1568     }
1569 }
1570
1571 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1572                                 t_state *state, t_state *state_local,
1573                                 PaddedRVecVector *f)
1574 {
1575     int  i, j, nh;
1576
1577     nh = state->nhchainlength;
1578
1579     if (DDMASTER(dd))
1580     {
1581         for (i = 0; i < efptNR; i++)
1582         {
1583             state_local->lambda[i] = state->lambda[i];
1584         }
1585         state_local->fep_state = state->fep_state;
1586         state_local->veta      = state->veta;
1587         state_local->vol0      = state->vol0;
1588         copy_mat(state->box, state_local->box);
1589         copy_mat(state->box_rel, state_local->box_rel);
1590         copy_mat(state->boxv, state_local->boxv);
1591         copy_mat(state->svir_prev, state_local->svir_prev);
1592         copy_mat(state->fvir_prev, state_local->fvir_prev);
1593         if (state->dfhist != NULL)
1594         {
1595             copy_df_history(state_local->dfhist, state->dfhist);
1596         }
1597         for (i = 0; i < state_local->ngtc; i++)
1598         {
1599             for (j = 0; j < nh; j++)
1600             {
1601                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1602                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1603             }
1604             state_local->therm_integral[i] = state->therm_integral[i];
1605         }
1606         for (i = 0; i < state_local->nnhpres; i++)
1607         {
1608             for (j = 0; j < nh; j++)
1609             {
1610                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1611                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1612             }
1613         }
1614     }
1615     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda.data());
1616     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1617     dd_bcast(dd, sizeof(real), &state_local->veta);
1618     dd_bcast(dd, sizeof(real), &state_local->vol0);
1619     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1620     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1621     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1622     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1623     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1624     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi.data());
1625     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi.data());
1626     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral.data());
1627     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi.data());
1628     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi.data());
1629
1630     /* communicate df_history -- required for restarting from checkpoint */
1631     dd_distribute_dfhist(dd, state_local->dfhist);
1632
1633     dd_resize_state(state_local, f, dd->nat_home);
1634
1635     for (i = 0; i < estNR; i++)
1636     {
1637         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1638         {
1639             switch (i)
1640             {
1641                 case estX:
1642                     dd_distribute_vec(dd, cgs, as_rvec_array(state->x.data()), as_rvec_array(state_local->x.data()));
1643                     break;
1644                 case estV:
1645                     dd_distribute_vec(dd, cgs, as_rvec_array(state->v.data()), as_rvec_array(state_local->v.data()));
1646                     break;
1647                 case est_SDX_NOTSUPPORTED:
1648                     break;
1649                 case estCGP:
1650                     dd_distribute_vec(dd, cgs, as_rvec_array(state->cg_p.data()), as_rvec_array(state_local->cg_p.data()));
1651                     break;
1652                 case estDISRE_INITF:
1653                 case estDISRE_RM3TAV:
1654                 case estORIRE_INITF:
1655                 case estORIRE_DTAV:
1656                     /* Not implemented yet */
1657                     break;
1658                 default:
1659                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1660             }
1661         }
1662     }
1663 }
1664
1665 static char dim2char(int dim)
1666 {
1667     char c = '?';
1668
1669     switch (dim)
1670     {
1671         case XX: c = 'X'; break;
1672         case YY: c = 'Y'; break;
1673         case ZZ: c = 'Z'; break;
1674         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1675     }
1676
1677     return c;
1678 }
1679
1680 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1681                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1682 {
1683     rvec   grid_s[2], *grid_r = NULL, cx, r;
1684     char   fname[STRLEN], buf[22];
1685     FILE  *out;
1686     int    a, i, d, z, y, x;
1687     matrix tric;
1688     real   vol;
1689
1690     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1691     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1692
1693     if (DDMASTER(dd))
1694     {
1695         snew(grid_r, 2*dd->nnodes);
1696     }
1697
1698     dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : NULL);
1699
1700     if (DDMASTER(dd))
1701     {
1702         for (d = 0; d < DIM; d++)
1703         {
1704             for (i = 0; i < DIM; i++)
1705             {
1706                 if (d == i)
1707                 {
1708                     tric[d][i] = 1;
1709                 }
1710                 else
1711                 {
1712                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1713                     {
1714                         tric[d][i] = box[i][d]/box[i][i];
1715                     }
1716                     else
1717                     {
1718                         tric[d][i] = 0;
1719                     }
1720                 }
1721             }
1722         }
1723         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1724         out = gmx_fio_fopen(fname, "w");
1725         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1726         a = 1;
1727         for (i = 0; i < dd->nnodes; i++)
1728         {
1729             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1730             for (d = 0; d < DIM; d++)
1731             {
1732                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1733             }
1734             for (z = 0; z < 2; z++)
1735             {
1736                 for (y = 0; y < 2; y++)
1737                 {
1738                     for (x = 0; x < 2; x++)
1739                     {
1740                         cx[XX] = grid_r[i*2+x][XX];
1741                         cx[YY] = grid_r[i*2+y][YY];
1742                         cx[ZZ] = grid_r[i*2+z][ZZ];
1743                         mvmul(tric, cx, r);
1744                         gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1745                                                  10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
1746                     }
1747                 }
1748             }
1749             for (d = 0; d < DIM; d++)
1750             {
1751                 for (x = 0; x < 4; x++)
1752                 {
1753                     switch (d)
1754                     {
1755                         case 0: y = 1 + i*8 + 2*x; break;
1756                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1757                         case 2: y = 1 + i*8 + x; break;
1758                     }
1759                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
1760                 }
1761             }
1762         }
1763         gmx_fio_fclose(out);
1764         sfree(grid_r);
1765     }
1766 }
1767
1768 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
1769                   const gmx_mtop_t *mtop, t_commrec *cr,
1770                   int natoms, rvec x[], matrix box)
1771 {
1772     char          fname[STRLEN], buf[22];
1773     FILE         *out;
1774     int           i, ii, resnr, c;
1775     const char   *atomname, *resname;
1776     real          b;
1777     gmx_domdec_t *dd;
1778
1779     dd = cr->dd;
1780     if (natoms == -1)
1781     {
1782         natoms = dd->comm->nat[ddnatVSITE];
1783     }
1784
1785     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
1786
1787     out = gmx_fio_fopen(fname, "w");
1788
1789     fprintf(out, "TITLE     %s\n", title);
1790     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1791     int molb = 0;
1792     for (i = 0; i < natoms; i++)
1793     {
1794         ii = dd->gatindex[i];
1795         mtopGetAtomAndResidueName(mtop, ii, &molb, &atomname, &resnr, &resname, nullptr);
1796         if (i < dd->comm->nat[ddnatZONE])
1797         {
1798             c = 0;
1799             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1800             {
1801                 c++;
1802             }
1803             b = c;
1804         }
1805         else if (i < dd->comm->nat[ddnatVSITE])
1806         {
1807             b = dd->comm->zones.n;
1808         }
1809         else
1810         {
1811             b = dd->comm->zones.n + 1;
1812         }
1813         gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
1814                                  10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
1815     }
1816     fprintf(out, "TER\n");
1817
1818     gmx_fio_fclose(out);
1819 }
1820
1821 real dd_cutoff_multibody(const gmx_domdec_t *dd)
1822 {
1823     gmx_domdec_comm_t *comm;
1824     int                di;
1825     real               r;
1826
1827     comm = dd->comm;
1828
1829     r = -1;
1830     if (comm->bInterCGBondeds)
1831     {
1832         if (comm->cutoff_mbody > 0)
1833         {
1834             r = comm->cutoff_mbody;
1835         }
1836         else
1837         {
1838             /* cutoff_mbody=0 means we do not have DLB */
1839             r = comm->cellsize_min[dd->dim[0]];
1840             for (di = 1; di < dd->ndim; di++)
1841             {
1842                 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
1843             }
1844             if (comm->bBondComm)
1845             {
1846                 r = std::max(r, comm->cutoff_mbody);
1847             }
1848             else
1849             {
1850                 r = std::min(r, comm->cutoff);
1851             }
1852         }
1853     }
1854
1855     return r;
1856 }
1857
1858 real dd_cutoff_twobody(const gmx_domdec_t *dd)
1859 {
1860     real r_mb;
1861
1862     r_mb = dd_cutoff_multibody(dd);
1863
1864     return std::max(dd->comm->cutoff, r_mb);
1865 }
1866
1867
1868 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
1869                                    ivec coord_pme)
1870 {
1871     int nc, ntot;
1872
1873     nc   = dd->nc[dd->comm->cartpmedim];
1874     ntot = dd->comm->ntot[dd->comm->cartpmedim];
1875     copy_ivec(coord, coord_pme);
1876     coord_pme[dd->comm->cartpmedim] =
1877         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
1878 }
1879
1880 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
1881 {
1882     int npp, npme;
1883
1884     npp  = dd->nnodes;
1885     npme = dd->comm->npmenodes;
1886
1887     /* Here we assign a PME node to communicate with this DD node
1888      * by assuming that the major index of both is x.
1889      * We add cr->npmenodes/2 to obtain an even distribution.
1890      */
1891     return (ddindex*npme + npme/2)/npp;
1892 }
1893
1894 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
1895 {
1896     int *pme_rank;
1897     int  n, i, p0, p1;
1898
1899     snew(pme_rank, dd->comm->npmenodes);
1900     n = 0;
1901     for (i = 0; i < dd->nnodes; i++)
1902     {
1903         p0 = ddindex2pmeindex(dd, i);
1904         p1 = ddindex2pmeindex(dd, i+1);
1905         if (i+1 == dd->nnodes || p1 > p0)
1906         {
1907             if (debug)
1908             {
1909                 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
1910             }
1911             pme_rank[n] = i + 1 + n;
1912             n++;
1913         }
1914     }
1915
1916     return pme_rank;
1917 }
1918
1919 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
1920 {
1921     gmx_domdec_t *dd;
1922     ivec          coords;
1923     int           slab;
1924
1925     dd = cr->dd;
1926     /*
1927        if (dd->comm->bCartesian) {
1928        gmx_ddindex2xyz(dd->nc,ddindex,coords);
1929        dd_coords2pmecoords(dd,coords,coords_pme);
1930        copy_ivec(dd->ntot,nc);
1931        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
1932        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1933
1934        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
1935        } else {
1936        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
1937        }
1938      */
1939     coords[XX] = x;
1940     coords[YY] = y;
1941     coords[ZZ] = z;
1942     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
1943
1944     return slab;
1945 }
1946
1947 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
1948 {
1949     gmx_domdec_comm_t *comm;
1950     ivec               coords;
1951     int                ddindex, nodeid = -1;
1952
1953     comm = cr->dd->comm;
1954
1955     coords[XX] = x;
1956     coords[YY] = y;
1957     coords[ZZ] = z;
1958     if (comm->bCartesianPP_PME)
1959     {
1960 #if GMX_MPI
1961         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
1962 #endif
1963     }
1964     else
1965     {
1966         ddindex = dd_index(cr->dd->nc, coords);
1967         if (comm->bCartesianPP)
1968         {
1969             nodeid = comm->ddindex2simnodeid[ddindex];
1970         }
1971         else
1972         {
1973             if (comm->pmenodes)
1974             {
1975                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
1976             }
1977             else
1978             {
1979                 nodeid = ddindex;
1980             }
1981         }
1982     }
1983
1984     return nodeid;
1985 }
1986
1987 static int dd_simnode2pmenode(const gmx_domdec_t         *dd,
1988                               const t_commrec gmx_unused *cr,
1989                               int                         sim_nodeid)
1990 {
1991     int pmenode = -1;
1992
1993     const gmx_domdec_comm_t *comm = dd->comm;
1994
1995     /* This assumes a uniform x domain decomposition grid cell size */
1996     if (comm->bCartesianPP_PME)
1997     {
1998 #if GMX_MPI
1999         ivec coord, coord_pme;
2000         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2001         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2002         {
2003             /* This is a PP node */
2004             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2005             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2006         }
2007 #endif
2008     }
2009     else if (comm->bCartesianPP)
2010     {
2011         if (sim_nodeid < dd->nnodes)
2012         {
2013             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2014         }
2015     }
2016     else
2017     {
2018         /* This assumes DD cells with identical x coordinates
2019          * are numbered sequentially.
2020          */
2021         if (dd->comm->pmenodes == NULL)
2022         {
2023             if (sim_nodeid < dd->nnodes)
2024             {
2025                 /* The DD index equals the nodeid */
2026                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2027             }
2028         }
2029         else
2030         {
2031             int i = 0;
2032             while (sim_nodeid > dd->comm->pmenodes[i])
2033             {
2034                 i++;
2035             }
2036             if (sim_nodeid < dd->comm->pmenodes[i])
2037             {
2038                 pmenode = dd->comm->pmenodes[i];
2039             }
2040         }
2041     }
2042
2043     return pmenode;
2044 }
2045
2046 void get_pme_nnodes(const gmx_domdec_t *dd,
2047                     int *npmenodes_x, int *npmenodes_y)
2048 {
2049     if (dd != NULL)
2050     {
2051         *npmenodes_x = dd->comm->npmenodes_x;
2052         *npmenodes_y = dd->comm->npmenodes_y;
2053     }
2054     else
2055     {
2056         *npmenodes_x = 1;
2057         *npmenodes_y = 1;
2058     }
2059 }
2060
2061 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2062                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2063 {
2064     gmx_domdec_t *dd;
2065     int           x, y, z;
2066     ivec          coord, coord_pme;
2067
2068     dd = cr->dd;
2069
2070     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2071
2072     *nmy_ddnodes = 0;
2073     for (x = 0; x < dd->nc[XX]; x++)
2074     {
2075         for (y = 0; y < dd->nc[YY]; y++)
2076         {
2077             for (z = 0; z < dd->nc[ZZ]; z++)
2078             {
2079                 if (dd->comm->bCartesianPP_PME)
2080                 {
2081                     coord[XX] = x;
2082                     coord[YY] = y;
2083                     coord[ZZ] = z;
2084                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2085                     if (dd->ci[XX] == coord_pme[XX] &&
2086                         dd->ci[YY] == coord_pme[YY] &&
2087                         dd->ci[ZZ] == coord_pme[ZZ])
2088                     {
2089                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2090                     }
2091                 }
2092                 else
2093                 {
2094                     /* The slab corresponds to the nodeid in the PME group */
2095                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2096                     {
2097                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2098                     }
2099                 }
2100             }
2101         }
2102     }
2103
2104     /* The last PP-only node is the peer node */
2105     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2106
2107     if (debug)
2108     {
2109         fprintf(debug, "Receive coordinates from PP ranks:");
2110         for (x = 0; x < *nmy_ddnodes; x++)
2111         {
2112             fprintf(debug, " %d", (*my_ddnodes)[x]);
2113         }
2114         fprintf(debug, "\n");
2115     }
2116 }
2117
2118 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
2119 {
2120     gmx_bool bReceive = TRUE;
2121
2122     if (cr->npmenodes < dd->nnodes)
2123     {
2124         gmx_domdec_comm_t *comm = dd->comm;
2125         if (comm->bCartesianPP_PME)
2126         {
2127 #if GMX_MPI
2128             int  pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2129             ivec coords;
2130             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2131             coords[comm->cartpmedim]++;
2132             if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2133             {
2134                 int rank;
2135                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2136                 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
2137                 {
2138                     /* This is not the last PP node for pmenode */
2139                     bReceive = FALSE;
2140                 }
2141             }
2142 #else
2143             GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
2144 #endif
2145         }
2146         else
2147         {
2148             int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2149             if (cr->sim_nodeid+1 < cr->nnodes &&
2150                 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
2151             {
2152                 /* This is not the last PP node for pmenode */
2153                 bReceive = FALSE;
2154             }
2155         }
2156     }
2157
2158     return bReceive;
2159 }
2160
2161 static void set_zones_ncg_home(gmx_domdec_t *dd)
2162 {
2163     gmx_domdec_zones_t *zones;
2164     int                 i;
2165
2166     zones = &dd->comm->zones;
2167
2168     zones->cg_range[0] = 0;
2169     for (i = 1; i < zones->n+1; i++)
2170     {
2171         zones->cg_range[i] = dd->ncg_home;
2172     }
2173     /* zone_ncg1[0] should always be equal to ncg_home */
2174     dd->comm->zone_ncg1[0] = dd->ncg_home;
2175 }
2176
2177 static void rebuild_cgindex(gmx_domdec_t *dd,
2178                             const int *gcgs_index, const t_state *state)
2179 {
2180     int * gmx_restrict dd_cg_gl = dd->index_gl;
2181     int * gmx_restrict cgindex  = dd->cgindex;
2182     int                nat      = 0;
2183
2184     /* Copy back the global charge group indices from state
2185      * and rebuild the local charge group to atom index.
2186      */
2187     cgindex[0] = nat;
2188     for (unsigned int i = 0; i < state->cg_gl.size(); i++)
2189     {
2190         cgindex[i]  = nat;
2191         int cg_gl   = state->cg_gl[i];
2192         dd_cg_gl[i] = cg_gl;
2193         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2194     }
2195     cgindex[state->cg_gl.size()] = nat;
2196
2197     dd->ncg_home = state->cg_gl.size();
2198     dd->nat_home = nat;
2199
2200     set_zones_ncg_home(dd);
2201 }
2202
2203 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2204 {
2205     while (cg >= cginfo_mb->cg_end)
2206     {
2207         cginfo_mb++;
2208     }
2209
2210     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2211 }
2212
2213 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2214                           t_forcerec *fr, char *bLocalCG)
2215 {
2216     cginfo_mb_t *cginfo_mb;
2217     int         *cginfo;
2218     int          cg;
2219
2220     if (fr != NULL)
2221     {
2222         cginfo_mb = fr->cginfo_mb;
2223         cginfo    = fr->cginfo;
2224
2225         for (cg = cg0; cg < cg1; cg++)
2226         {
2227             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2228         }
2229     }
2230
2231     if (bLocalCG != NULL)
2232     {
2233         for (cg = cg0; cg < cg1; cg++)
2234         {
2235             bLocalCG[index_gl[cg]] = TRUE;
2236         }
2237     }
2238 }
2239
2240 static void make_dd_indices(gmx_domdec_t *dd,
2241                             const int *gcgs_index, int cg_start)
2242 {
2243     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2244     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2245     gmx_bool     bCGs;
2246
2247     if (dd->nat_tot > dd->gatindex_nalloc)
2248     {
2249         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2250         srenew(dd->gatindex, dd->gatindex_nalloc);
2251     }
2252
2253     nzone      = dd->comm->zones.n;
2254     zone2cg    = dd->comm->zones.cg_range;
2255     zone_ncg1  = dd->comm->zone_ncg1;
2256     index_gl   = dd->index_gl;
2257     gatindex   = dd->gatindex;
2258     bCGs       = dd->comm->bCGs;
2259
2260     if (zone2cg[1] != dd->ncg_home)
2261     {
2262         gmx_incons("dd->ncg_zone is not up to date");
2263     }
2264
2265     /* Make the local to global and global to local atom index */
2266     a = dd->cgindex[cg_start];
2267     for (zone = 0; zone < nzone; zone++)
2268     {
2269         if (zone == 0)
2270         {
2271             cg0 = cg_start;
2272         }
2273         else
2274         {
2275             cg0 = zone2cg[zone];
2276         }
2277         cg1    = zone2cg[zone+1];
2278         cg1_p1 = cg0 + zone_ncg1[zone];
2279
2280         for (cg = cg0; cg < cg1; cg++)
2281         {
2282             zone1 = zone;
2283             if (cg >= cg1_p1)
2284             {
2285                 /* Signal that this cg is from more than one pulse away */
2286                 zone1 += nzone;
2287             }
2288             cg_gl = index_gl[cg];
2289             if (bCGs)
2290             {
2291                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2292                 {
2293                     gatindex[a] = a_gl;
2294                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2295                     a++;
2296                 }
2297             }
2298             else
2299             {
2300                 gatindex[a] = cg_gl;
2301                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2302                 a++;
2303             }
2304         }
2305     }
2306 }
2307
2308 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2309                           const char *where)
2310 {
2311     int i, ngl, nerr;
2312
2313     nerr = 0;
2314     if (bLocalCG == NULL)
2315     {
2316         return nerr;
2317     }
2318     for (i = 0; i < dd->ncg_tot; i++)
2319     {
2320         if (!bLocalCG[dd->index_gl[i]])
2321         {
2322             fprintf(stderr,
2323                     "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2324             nerr++;
2325         }
2326     }
2327     ngl = 0;
2328     for (i = 0; i < ncg_sys; i++)
2329     {
2330         if (bLocalCG[i])
2331         {
2332             ngl++;
2333         }
2334     }
2335     if (ngl != dd->ncg_tot)
2336     {
2337         fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2338         nerr++;
2339     }
2340
2341     return nerr;
2342 }
2343
2344 static void check_index_consistency(gmx_domdec_t *dd,
2345                                     int natoms_sys, int ncg_sys,
2346                                     const char *where)
2347 {
2348     int   nerr, ngl, i, a, cell;
2349     int  *have;
2350
2351     nerr = 0;
2352
2353     if (dd->comm->DD_debug > 1)
2354     {
2355         snew(have, natoms_sys);
2356         for (a = 0; a < dd->nat_tot; a++)
2357         {
2358             if (have[dd->gatindex[a]] > 0)
2359             {
2360                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2361             }
2362             else
2363             {
2364                 have[dd->gatindex[a]] = a + 1;
2365             }
2366         }
2367         sfree(have);
2368     }
2369
2370     snew(have, dd->nat_tot);
2371
2372     ngl  = 0;
2373     for (i = 0; i < natoms_sys; i++)
2374     {
2375         if (ga2la_get(dd->ga2la, i, &a, &cell))
2376         {
2377             if (a >= dd->nat_tot)
2378             {
2379                 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2380                 nerr++;
2381             }
2382             else
2383             {
2384                 have[a] = 1;
2385                 if (dd->gatindex[a] != i)
2386                 {
2387                     fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2388                     nerr++;
2389                 }
2390             }
2391             ngl++;
2392         }
2393     }
2394     if (ngl != dd->nat_tot)
2395     {
2396         fprintf(stderr,
2397                 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2398                 dd->rank, where, ngl, dd->nat_tot);
2399     }
2400     for (a = 0; a < dd->nat_tot; a++)
2401     {
2402         if (have[a] == 0)
2403         {
2404             fprintf(stderr,
2405                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
2406                     dd->rank, where, a+1, dd->gatindex[a]+1);
2407         }
2408     }
2409     sfree(have);
2410
2411     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2412
2413     if (nerr > 0)
2414     {
2415         gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2416                   dd->rank, where, nerr);
2417     }
2418 }
2419
2420 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2421 {
2422     int   i;
2423     char *bLocalCG;
2424
2425     if (a_start == 0)
2426     {
2427         /* Clear the whole list without searching */
2428         ga2la_clear(dd->ga2la);
2429     }
2430     else
2431     {
2432         for (i = a_start; i < dd->nat_tot; i++)
2433         {
2434             ga2la_del(dd->ga2la, dd->gatindex[i]);
2435         }
2436     }
2437
2438     bLocalCG = dd->comm->bLocalCG;
2439     if (bLocalCG)
2440     {
2441         for (i = cg_start; i < dd->ncg_tot; i++)
2442         {
2443             bLocalCG[dd->index_gl[i]] = FALSE;
2444         }
2445     }
2446
2447     dd_clear_local_vsite_indices(dd);
2448
2449     if (dd->constraints)
2450     {
2451         dd_clear_local_constraint_indices(dd);
2452     }
2453 }
2454
2455 /* This function should be used for moving the domain boudaries during DLB,
2456  * for obtaining the minimum cell size. It checks the initially set limit
2457  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2458  * and, possibly, a longer cut-off limit set for PME load balancing.
2459  */
2460 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2461 {
2462     real cellsize_min;
2463
2464     cellsize_min = comm->cellsize_min[dim];
2465
2466     if (!comm->bVacDLBNoLimit)
2467     {
2468         /* The cut-off might have changed, e.g. by PME load balacning,
2469          * from the value used to set comm->cellsize_min, so check it.
2470          */
2471         cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2472
2473         if (comm->bPMELoadBalDLBLimits)
2474         {
2475             /* Check for the cut-off limit set by the PME load balancing */
2476             cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2477         }
2478     }
2479
2480     return cellsize_min;
2481 }
2482
2483 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2484                             int dim_ind)
2485 {
2486     real grid_jump_limit;
2487
2488     /* The distance between the boundaries of cells at distance
2489      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2490      * and by the fact that cells should not be shifted by more than
2491      * half their size, such that cg's only shift by one cell
2492      * at redecomposition.
2493      */
2494     grid_jump_limit = comm->cellsize_limit;
2495     if (!comm->bVacDLBNoLimit)
2496     {
2497         if (comm->bPMELoadBalDLBLimits)
2498         {
2499             cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
2500         }
2501         grid_jump_limit = std::max(grid_jump_limit,
2502                                    cutoff/comm->cd[dim_ind].np);
2503     }
2504
2505     return grid_jump_limit;
2506 }
2507
2508 static gmx_bool check_grid_jump(gmx_int64_t     step,
2509                                 gmx_domdec_t   *dd,
2510                                 real            cutoff,
2511                                 gmx_ddbox_t    *ddbox,
2512                                 gmx_bool        bFatal)
2513 {
2514     gmx_domdec_comm_t *comm;
2515     int                d, dim;
2516     real               limit, bfac;
2517     gmx_bool           bInvalid;
2518
2519     bInvalid = FALSE;
2520
2521     comm = dd->comm;
2522
2523     for (d = 1; d < dd->ndim; d++)
2524     {
2525         dim   = dd->dim[d];
2526         limit = grid_jump_limit(comm, cutoff, d);
2527         bfac  = ddbox->box_size[dim];
2528         if (ddbox->tric_dir[dim])
2529         {
2530             bfac *= ddbox->skew_fac[dim];
2531         }
2532         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2533                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2534         {
2535             bInvalid = TRUE;
2536
2537             if (bFatal)
2538             {
2539                 char buf[22];
2540
2541                 /* This error should never be triggered under normal
2542                  * circumstances, but you never know ...
2543                  */
2544                 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2545                           gmx_step_str(step, buf),
2546                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2547             }
2548         }
2549     }
2550
2551     return bInvalid;
2552 }
2553
2554 static int dd_load_count(gmx_domdec_comm_t *comm)
2555 {
2556     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2557 }
2558
2559 static float dd_force_load(gmx_domdec_comm_t *comm)
2560 {
2561     float load;
2562
2563     if (comm->eFlop)
2564     {
2565         load = comm->flop;
2566         if (comm->eFlop > 1)
2567         {
2568             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2569         }
2570     }
2571     else
2572     {
2573         load = comm->cycl[ddCyclF];
2574         if (comm->cycl_n[ddCyclF] > 1)
2575         {
2576             /* Subtract the maximum of the last n cycle counts
2577              * to get rid of possible high counts due to other sources,
2578              * for instance system activity, that would otherwise
2579              * affect the dynamic load balancing.
2580              */
2581             load -= comm->cycl_max[ddCyclF];
2582         }
2583
2584 #if GMX_MPI
2585         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2586         {
2587             float gpu_wait, gpu_wait_sum;
2588
2589             gpu_wait = comm->cycl[ddCyclWaitGPU];
2590             if (comm->cycl_n[ddCyclF] > 1)
2591             {
2592                 /* We should remove the WaitGPU time of the same MD step
2593                  * as the one with the maximum F time, since the F time
2594                  * and the wait time are not independent.
2595                  * Furthermore, the step for the max F time should be chosen
2596                  * the same on all ranks that share the same GPU.
2597                  * But to keep the code simple, we remove the average instead.
2598                  * The main reason for artificially long times at some steps
2599                  * is spurious CPU activity or MPI time, so we don't expect
2600                  * that changes in the GPU wait time matter a lot here.
2601                  */
2602                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2603             }
2604             /* Sum the wait times over the ranks that share the same GPU */
2605             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2606                           comm->mpi_comm_gpu_shared);
2607             /* Replace the wait time by the average over the ranks */
2608             load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2609         }
2610 #endif
2611     }
2612
2613     return load;
2614 }
2615
2616 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2617 {
2618     gmx_domdec_comm_t *comm;
2619     int                i;
2620
2621     comm = dd->comm;
2622
2623     snew(*dim_f, dd->nc[dim]+1);
2624     (*dim_f)[0] = 0;
2625     for (i = 1; i < dd->nc[dim]; i++)
2626     {
2627         if (comm->slb_frac[dim])
2628         {
2629             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2630         }
2631         else
2632         {
2633             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2634         }
2635     }
2636     (*dim_f)[dd->nc[dim]] = 1;
2637 }
2638
2639 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2640 {
2641     int  pmeindex, slab, nso, i;
2642     ivec xyz;
2643
2644     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2645     {
2646         ddpme->dim = YY;
2647     }
2648     else
2649     {
2650         ddpme->dim = dimind;
2651     }
2652     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2653
2654     ddpme->nslab = (ddpme->dim == 0 ?
2655                     dd->comm->npmenodes_x :
2656                     dd->comm->npmenodes_y);
2657
2658     if (ddpme->nslab <= 1)
2659     {
2660         return;
2661     }
2662
2663     nso = dd->comm->npmenodes/ddpme->nslab;
2664     /* Determine for each PME slab the PP location range for dimension dim */
2665     snew(ddpme->pp_min, ddpme->nslab);
2666     snew(ddpme->pp_max, ddpme->nslab);
2667     for (slab = 0; slab < ddpme->nslab; slab++)
2668     {
2669         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2670         ddpme->pp_max[slab] = 0;
2671     }
2672     for (i = 0; i < dd->nnodes; i++)
2673     {
2674         ddindex2xyz(dd->nc, i, xyz);
2675         /* For y only use our y/z slab.
2676          * This assumes that the PME x grid size matches the DD grid size.
2677          */
2678         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2679         {
2680             pmeindex = ddindex2pmeindex(dd, i);
2681             if (dimind == 0)
2682             {
2683                 slab = pmeindex/nso;
2684             }
2685             else
2686             {
2687                 slab = pmeindex % ddpme->nslab;
2688             }
2689             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
2690             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
2691         }
2692     }
2693
2694     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2695 }
2696
2697 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
2698 {
2699     if (dd->comm->ddpme[0].dim == XX)
2700     {
2701         return dd->comm->ddpme[0].maxshift;
2702     }
2703     else
2704     {
2705         return 0;
2706     }
2707 }
2708
2709 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
2710 {
2711     if (dd->comm->ddpme[0].dim == YY)
2712     {
2713         return dd->comm->ddpme[0].maxshift;
2714     }
2715     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2716     {
2717         return dd->comm->ddpme[1].maxshift;
2718     }
2719     else
2720     {
2721         return 0;
2722     }
2723 }
2724
2725 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2726                              gmx_bool bUniform, const gmx_ddbox_t *ddbox,
2727                              const real *cell_f)
2728 {
2729     gmx_domdec_comm_t *comm;
2730     int                nc, ns, s;
2731     int               *xmin, *xmax;
2732     real               range, pme_boundary;
2733     int                sh;
2734
2735     comm = dd->comm;
2736     nc   = dd->nc[ddpme->dim];
2737     ns   = ddpme->nslab;
2738
2739     if (!ddpme->dim_match)
2740     {
2741         /* PP decomposition is not along dim: the worst situation */
2742         sh = ns/2;
2743     }
2744     else if (ns <= 3 || (bUniform && ns == nc))
2745     {
2746         /* The optimal situation */
2747         sh = 1;
2748     }
2749     else
2750     {
2751         /* We need to check for all pme nodes which nodes they
2752          * could possibly need to communicate with.
2753          */
2754         xmin = ddpme->pp_min;
2755         xmax = ddpme->pp_max;
2756         /* Allow for atoms to be maximally 2/3 times the cut-off
2757          * out of their DD cell. This is a reasonable balance between
2758          * between performance and support for most charge-group/cut-off
2759          * combinations.
2760          */
2761         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2762         /* Avoid extra communication when we are exactly at a boundary */
2763         range *= 0.999;
2764
2765         sh = 1;
2766         for (s = 0; s < ns; s++)
2767         {
2768             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2769             pme_boundary = (real)s/ns;
2770             while (sh+1 < ns &&
2771                    ((s-(sh+1) >= 0 &&
2772                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2773                     (s-(sh+1) <  0 &&
2774                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2775             {
2776                 sh++;
2777             }
2778             pme_boundary = (real)(s+1)/ns;
2779             while (sh+1 < ns &&
2780                    ((s+(sh+1) <  ns &&
2781                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2782                     (s+(sh+1) >= ns &&
2783                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2784             {
2785                 sh++;
2786             }
2787         }
2788     }
2789
2790     ddpme->maxshift = sh;
2791
2792     if (debug)
2793     {
2794         fprintf(debug, "PME slab communication range for dim %d is %d\n",
2795                 ddpme->dim, ddpme->maxshift);
2796     }
2797 }
2798
2799 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
2800 {
2801     int d, dim;
2802
2803     for (d = 0; d < dd->ndim; d++)
2804     {
2805         dim = dd->dim[d];
2806         if (dim < ddbox->nboundeddim &&
2807             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2808             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2809         {
2810             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2811                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2812                       dd->nc[dim], dd->comm->cellsize_limit);
2813         }
2814     }
2815 }
2816
2817 enum {
2818     setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
2819 };
2820
2821 /* Set the domain boundaries. Use for static (or no) load balancing,
2822  * and also for the starting state for dynamic load balancing.
2823  * setmode determine if and where the boundaries are stored, use enum above.
2824  * Returns the number communication pulses in npulse.
2825  */
2826 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, const gmx_ddbox_t *ddbox,
2827                                   int setmode, ivec npulse)
2828 {
2829     gmx_domdec_comm_t *comm;
2830     int                d, j;
2831     rvec               cellsize_min;
2832     real              *cell_x, cell_dx, cellsize;
2833
2834     comm = dd->comm;
2835
2836     for (d = 0; d < DIM; d++)
2837     {
2838         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2839         npulse[d]       = 1;
2840         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2841         {
2842             /* Uniform grid */
2843             cell_dx = ddbox->box_size[d]/dd->nc[d];
2844             switch (setmode)
2845             {
2846                 case setcellsizeslbMASTER:
2847                     for (j = 0; j < dd->nc[d]+1; j++)
2848                     {
2849                         dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2850                     }
2851                     break;
2852                 case setcellsizeslbLOCAL:
2853                     comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2854                     comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2855                     break;
2856                 default:
2857                     break;
2858             }
2859             cellsize = cell_dx*ddbox->skew_fac[d];
2860             while (cellsize*npulse[d] < comm->cutoff)
2861             {
2862                 npulse[d]++;
2863             }
2864             cellsize_min[d] = cellsize;
2865         }
2866         else
2867         {
2868             /* Statically load balanced grid */
2869             /* Also when we are not doing a master distribution we determine
2870              * all cell borders in a loop to obtain identical values
2871              * to the master distribution case and to determine npulse.
2872              */
2873             if (setmode == setcellsizeslbMASTER)
2874             {
2875                 cell_x = dd->ma->cell_x[d];
2876             }
2877             else
2878             {
2879                 snew(cell_x, dd->nc[d]+1);
2880             }
2881             cell_x[0] = ddbox->box0[d];
2882             for (j = 0; j < dd->nc[d]; j++)
2883             {
2884                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
2885                 cell_x[j+1] = cell_x[j] + cell_dx;
2886                 cellsize    = cell_dx*ddbox->skew_fac[d];
2887                 while (cellsize*npulse[d] < comm->cutoff &&
2888                        npulse[d] < dd->nc[d]-1)
2889                 {
2890                     npulse[d]++;
2891                 }
2892                 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
2893             }
2894             if (setmode == setcellsizeslbLOCAL)
2895             {
2896                 comm->cell_x0[d] = cell_x[dd->ci[d]];
2897                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2898             }
2899             if (setmode != setcellsizeslbMASTER)
2900             {
2901                 sfree(cell_x);
2902             }
2903         }
2904         /* The following limitation is to avoid that a cell would receive
2905          * some of its own home charge groups back over the periodic boundary.
2906          * Double charge groups cause trouble with the global indices.
2907          */
2908         if (d < ddbox->npbcdim &&
2909             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2910         {
2911             char error_string[STRLEN];
2912
2913             sprintf(error_string,
2914                     "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2915                     dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
2916                     comm->cutoff,
2917                     dd->nc[d], dd->nc[d],
2918                     dd->nnodes > dd->nc[d] ? "cells" : "ranks");
2919
2920             if (setmode == setcellsizeslbLOCAL)
2921             {
2922                 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2923                                      error_string);
2924             }
2925             else
2926             {
2927                 gmx_fatal(FARGS, error_string);
2928             }
2929         }
2930     }
2931
2932     if (!dlbIsOn(comm))
2933     {
2934         copy_rvec(cellsize_min, comm->cellsize_min);
2935     }
2936
2937     for (d = 0; d < comm->npmedecompdim; d++)
2938     {
2939         set_pme_maxshift(dd, &comm->ddpme[d],
2940                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
2941                          comm->ddpme[d].slb_dim_f);
2942     }
2943 }
2944
2945
2946 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2947                                                   int d, int dim, domdec_root_t *root,
2948                                                   const gmx_ddbox_t *ddbox,
2949                                                   gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
2950 {
2951     gmx_domdec_comm_t *comm;
2952     int                ncd, i, j, nmin, nmin_old;
2953     gmx_bool           bLimLo, bLimHi;
2954     real              *cell_size;
2955     real               fac, halfway, cellsize_limit_f_i, region_size;
2956     gmx_bool           bPBC, bLastHi = FALSE;
2957     int                nrange[] = {range[0], range[1]};
2958
2959     region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
2960
2961     comm = dd->comm;
2962
2963     ncd = dd->nc[dim];
2964
2965     bPBC = (dim < ddbox->npbcdim);
2966
2967     cell_size = root->buf_ncd;
2968
2969     if (debug)
2970     {
2971         fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
2972     }
2973
2974     /* First we need to check if the scaling does not make cells
2975      * smaller than the smallest allowed size.
2976      * We need to do this iteratively, since if a cell is too small,
2977      * it needs to be enlarged, which makes all the other cells smaller,
2978      * which could in turn make another cell smaller than allowed.
2979      */
2980     for (i = range[0]; i < range[1]; i++)
2981     {
2982         root->bCellMin[i] = FALSE;
2983     }
2984     nmin = 0;
2985     do
2986     {
2987         nmin_old = nmin;
2988         /* We need the total for normalization */
2989         fac = 0;
2990         for (i = range[0]; i < range[1]; i++)
2991         {
2992             if (root->bCellMin[i] == FALSE)
2993             {
2994                 fac += cell_size[i];
2995             }
2996         }
2997         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
2998         /* Determine the cell boundaries */
2999         for (i = range[0]; i < range[1]; i++)
3000         {
3001             if (root->bCellMin[i] == FALSE)
3002             {
3003                 cell_size[i] *= fac;
3004                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3005                 {
3006                     cellsize_limit_f_i = 0;
3007                 }
3008                 else
3009                 {
3010                     cellsize_limit_f_i = cellsize_limit_f;
3011                 }
3012                 if (cell_size[i] < cellsize_limit_f_i)
3013                 {
3014                     root->bCellMin[i] = TRUE;
3015                     cell_size[i]      = cellsize_limit_f_i;
3016                     nmin++;
3017                 }
3018             }
3019             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3020         }
3021     }
3022     while (nmin > nmin_old);
3023
3024     i            = range[1]-1;
3025     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3026     /* For this check we should not use DD_CELL_MARGIN,
3027      * but a slightly smaller factor,
3028      * since rounding could get use below the limit.
3029      */
3030     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3031     {
3032         char buf[22];
3033         gmx_fatal(FARGS, "step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3034                   gmx_step_str(step, buf),
3035                   dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3036                   ncd, comm->cellsize_min[dim]);
3037     }
3038
3039     root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3040
3041     if (!bUniform)
3042     {
3043         /* Check if the boundary did not displace more than halfway
3044          * each of the cells it bounds, as this could cause problems,
3045          * especially when the differences between cell sizes are large.
3046          * If changes are applied, they will not make cells smaller
3047          * than the cut-off, as we check all the boundaries which
3048          * might be affected by a change and if the old state was ok,
3049          * the cells will at most be shrunk back to their old size.
3050          */
3051         for (i = range[0]+1; i < range[1]; i++)
3052         {
3053             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3054             if (root->cell_f[i] < halfway)
3055             {
3056                 root->cell_f[i] = halfway;
3057                 /* Check if the change also causes shifts of the next boundaries */
3058                 for (j = i+1; j < range[1]; j++)
3059                 {
3060                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3061                     {
3062                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3063                     }
3064                 }
3065             }
3066             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3067             if (root->cell_f[i] > halfway)
3068             {
3069                 root->cell_f[i] = halfway;
3070                 /* Check if the change also causes shifts of the next boundaries */
3071                 for (j = i-1; j >= range[0]+1; j--)
3072                 {
3073                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3074                     {
3075                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3076                     }
3077                 }
3078             }
3079         }
3080     }
3081
3082     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3083     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3084      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3085      * for a and b nrange is used */
3086     if (d > 0)
3087     {
3088         /* Take care of the staggering of the cell boundaries */
3089         if (bUniform)
3090         {
3091             for (i = range[0]; i < range[1]; i++)
3092             {
3093                 root->cell_f_max0[i] = root->cell_f[i];
3094                 root->cell_f_min1[i] = root->cell_f[i+1];
3095             }
3096         }
3097         else
3098         {
3099             for (i = range[0]+1; i < range[1]; i++)
3100             {
3101                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3102                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3103                 if (bLimLo && bLimHi)
3104                 {
3105                     /* Both limits violated, try the best we can */
3106                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3107                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3108                     nrange[0]       = range[0];
3109                     nrange[1]       = i;
3110                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3111
3112                     nrange[0] = i;
3113                     nrange[1] = range[1];
3114                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3115
3116                     return;
3117                 }
3118                 else if (bLimLo)
3119                 {
3120                     /* root->cell_f[i] = root->bound_min[i]; */
3121                     nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3122                     bLastHi   = FALSE;
3123                 }
3124                 else if (bLimHi && !bLastHi)
3125                 {
3126                     bLastHi = TRUE;
3127                     if (nrange[1] < range[1])   /* found a LimLo before */
3128                     {
3129                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3130                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3131                         nrange[0] = nrange[1];
3132                     }
3133                     root->cell_f[i] = root->bound_max[i];
3134                     nrange[1]       = i;
3135                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3136                     nrange[0] = i;
3137                     nrange[1] = range[1];
3138                 }
3139             }
3140             if (nrange[1] < range[1])   /* found last a LimLo */
3141             {
3142                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3143                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3144                 nrange[0] = nrange[1];
3145                 nrange[1] = range[1];
3146                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3147             }
3148             else if (nrange[0] > range[0]) /* found at least one LimHi */
3149             {
3150                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3151             }
3152         }
3153     }
3154 }
3155
3156
3157 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3158                                        int d, int dim, domdec_root_t *root,
3159                                        const gmx_ddbox_t *ddbox,
3160                                        gmx_bool bDynamicBox,
3161                                        gmx_bool bUniform, gmx_int64_t step)
3162 {
3163     gmx_domdec_comm_t *comm;
3164     int                ncd, d1, i, pos;
3165     real              *cell_size;
3166     real               load_aver, load_i, imbalance, change, change_max, sc;
3167     real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3168     real               change_limit;
3169     real               relax = 0.5;
3170     gmx_bool           bPBC;
3171     int                range[] = { 0, 0 };
3172
3173     comm = dd->comm;
3174
3175     /* Convert the maximum change from the input percentage to a fraction */
3176     change_limit = comm->dlb_scale_lim*0.01;
3177
3178     ncd = dd->nc[dim];
3179
3180     bPBC = (dim < ddbox->npbcdim);
3181
3182     cell_size = root->buf_ncd;
3183
3184     /* Store the original boundaries */
3185     for (i = 0; i < ncd+1; i++)
3186     {
3187         root->old_cell_f[i] = root->cell_f[i];
3188     }
3189     if (bUniform)
3190     {
3191         for (i = 0; i < ncd; i++)
3192         {
3193             cell_size[i] = 1.0/ncd;
3194         }
3195     }
3196     else if (dd_load_count(comm) > 0)
3197     {
3198         load_aver  = comm->load[d].sum_m/ncd;
3199         change_max = 0;
3200         for (i = 0; i < ncd; i++)
3201         {
3202             /* Determine the relative imbalance of cell i */
3203             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3204             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3205             /* Determine the change of the cell size using underrelaxation */
3206             change     = -relax*imbalance;
3207             change_max = std::max(change_max, std::max(change, -change));
3208         }
3209         /* Limit the amount of scaling.
3210          * We need to use the same rescaling for all cells in one row,
3211          * otherwise the load balancing might not converge.
3212          */
3213         sc = relax;
3214         if (change_max > change_limit)
3215         {
3216             sc *= change_limit/change_max;
3217         }
3218         for (i = 0; i < ncd; i++)
3219         {
3220             /* Determine the relative imbalance of cell i */
3221             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3222             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3223             /* Determine the change of the cell size using underrelaxation */
3224             change       = -sc*imbalance;
3225             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3226         }
3227     }
3228
3229     cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3230     cellsize_limit_f *= DD_CELL_MARGIN;
3231     dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3232     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3233     if (ddbox->tric_dir[dim])
3234     {
3235         cellsize_limit_f /= ddbox->skew_fac[dim];
3236         dist_min_f       /= ddbox->skew_fac[dim];
3237     }
3238     if (bDynamicBox && d > 0)
3239     {
3240         dist_min_f *= DD_PRES_SCALE_MARGIN;
3241     }
3242     if (d > 0 && !bUniform)
3243     {
3244         /* Make sure that the grid is not shifted too much */
3245         for (i = 1; i < ncd; i++)
3246         {
3247             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3248             {
3249                 gmx_incons("Inconsistent DD boundary staggering limits!");
3250             }
3251             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3252             space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3253             if (space > 0)
3254             {
3255                 root->bound_min[i] += 0.5*space;
3256             }
3257             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3258             space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3259             if (space < 0)
3260             {
3261                 root->bound_max[i] += 0.5*space;
3262             }
3263             if (debug)
3264             {
3265                 fprintf(debug,
3266                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3267                         d, i,
3268                         root->cell_f_max0[i-1] + dist_min_f,
3269                         root->bound_min[i], root->cell_f[i], root->bound_max[i],
3270                         root->cell_f_min1[i] - dist_min_f);
3271             }
3272         }
3273     }
3274     range[1]          = ncd;
3275     root->cell_f[0]   = 0;
3276     root->cell_f[ncd] = 1;
3277     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3278
3279
3280     /* After the checks above, the cells should obey the cut-off
3281      * restrictions, but it does not hurt to check.
3282      */
3283     for (i = 0; i < ncd; i++)
3284     {
3285         if (debug)
3286         {
3287             fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
3288                     dim, i, root->cell_f[i], root->cell_f[i+1]);
3289         }
3290
3291         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3292             root->cell_f[i+1] - root->cell_f[i] <
3293             cellsize_limit_f/DD_CELL_MARGIN)
3294         {
3295             char buf[22];
3296             fprintf(stderr,
3297                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3298                     gmx_step_str(step, buf), dim2char(dim), i,
3299                     (root->cell_f[i+1] - root->cell_f[i])
3300                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3301         }
3302     }
3303
3304     pos = ncd + 1;
3305     /* Store the cell boundaries of the lower dimensions at the end */
3306     for (d1 = 0; d1 < d; d1++)
3307     {
3308         root->cell_f[pos++] = comm->cell_f0[d1];
3309         root->cell_f[pos++] = comm->cell_f1[d1];
3310     }
3311
3312     if (d < comm->npmedecompdim)
3313     {
3314         /* The master determines the maximum shift for
3315          * the coordinate communication between separate PME nodes.
3316          */
3317         set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3318     }
3319     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3320     if (d >= 1)
3321     {
3322         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3323     }
3324 }
3325
3326 static void relative_to_absolute_cell_bounds(gmx_domdec_t      *dd,
3327                                              const gmx_ddbox_t *ddbox,
3328                                              int                dimind)
3329 {
3330     gmx_domdec_comm_t *comm;
3331     int                dim;
3332
3333     comm = dd->comm;
3334
3335     /* Set the cell dimensions */
3336     dim                = dd->dim[dimind];
3337     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3338     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3339     if (dim >= ddbox->nboundeddim)
3340     {
3341         comm->cell_x0[dim] += ddbox->box0[dim];
3342         comm->cell_x1[dim] += ddbox->box0[dim];
3343     }
3344 }
3345
3346 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3347                                          int d, int dim, real *cell_f_row,
3348                                          const gmx_ddbox_t *ddbox)
3349 {
3350     gmx_domdec_comm_t *comm;
3351     int                d1, pos;
3352
3353     comm = dd->comm;
3354
3355 #if GMX_MPI
3356     /* Each node would only need to know two fractions,
3357      * but it is probably cheaper to broadcast the whole array.
3358      */
3359     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3360               0, comm->mpi_comm_load[d]);
3361 #endif
3362     /* Copy the fractions for this dimension from the buffer */
3363     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3364     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3365     /* The whole array was communicated, so set the buffer position */
3366     pos = dd->nc[dim] + 1;
3367     for (d1 = 0; d1 <= d; d1++)
3368     {
3369         if (d1 < d)
3370         {
3371             /* Copy the cell fractions of the lower dimensions */
3372             comm->cell_f0[d1] = cell_f_row[pos++];
3373             comm->cell_f1[d1] = cell_f_row[pos++];
3374         }
3375         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3376     }
3377     /* Convert the communicated shift from float to int */
3378     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3379     if (d >= 1)
3380     {
3381         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3382     }
3383 }
3384
3385 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3386                                          const gmx_ddbox_t *ddbox,
3387                                          gmx_bool bDynamicBox,
3388                                          gmx_bool bUniform, gmx_int64_t step)
3389 {
3390     gmx_domdec_comm_t *comm;
3391     int                d, dim, d1;
3392     gmx_bool           bRowMember, bRowRoot;
3393     real              *cell_f_row;
3394
3395     comm = dd->comm;
3396
3397     for (d = 0; d < dd->ndim; d++)
3398     {
3399         dim        = dd->dim[d];
3400         bRowMember = TRUE;
3401         bRowRoot   = TRUE;
3402         for (d1 = d; d1 < dd->ndim; d1++)
3403         {
3404             if (dd->ci[dd->dim[d1]] > 0)
3405             {
3406                 if (d1 != d)
3407                 {
3408                     bRowMember = FALSE;
3409                 }
3410                 bRowRoot = FALSE;
3411             }
3412         }
3413         if (bRowMember)
3414         {
3415             if (bRowRoot)
3416             {
3417                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3418                                            ddbox, bDynamicBox, bUniform, step);
3419                 cell_f_row = comm->root[d]->cell_f;
3420             }
3421             else
3422             {
3423                 cell_f_row = comm->cell_f_row;
3424             }
3425             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3426         }
3427     }
3428 }
3429
3430 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t      *dd,
3431                                            const gmx_ddbox_t *ddbox)
3432 {
3433     int d;
3434
3435     /* This function assumes the box is static and should therefore
3436      * not be called when the box has changed since the last
3437      * call to dd_partition_system.
3438      */
3439     for (d = 0; d < dd->ndim; d++)
3440     {
3441         relative_to_absolute_cell_bounds(dd, ddbox, d);
3442     }
3443 }
3444
3445
3446
3447 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3448                                   const gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3449                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3450                                   gmx_wallcycle_t wcycle)
3451 {
3452     gmx_domdec_comm_t *comm;
3453     int                dim;
3454
3455     comm = dd->comm;
3456
3457     if (bDoDLB)
3458     {
3459         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3460         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3461         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3462     }
3463     else if (bDynamicBox)
3464     {
3465         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3466     }
3467
3468     /* Set the dimensions for which no DD is used */
3469     for (dim = 0; dim < DIM; dim++)
3470     {
3471         if (dd->nc[dim] == 1)
3472         {
3473             comm->cell_x0[dim] = 0;
3474             comm->cell_x1[dim] = ddbox->box_size[dim];
3475             if (dim >= ddbox->nboundeddim)
3476             {
3477                 comm->cell_x0[dim] += ddbox->box0[dim];
3478                 comm->cell_x1[dim] += ddbox->box0[dim];
3479             }
3480         }
3481     }
3482 }
3483
3484 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3485 {
3486     int                    d, np, i;
3487     gmx_domdec_comm_dim_t *cd;
3488
3489     for (d = 0; d < dd->ndim; d++)
3490     {
3491         cd = &dd->comm->cd[d];
3492         np = npulse[dd->dim[d]];
3493         if (np > cd->np_nalloc)
3494         {
3495             if (debug)
3496             {
3497                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3498                         dim2char(dd->dim[d]), np);
3499             }
3500             if (DDMASTER(dd) && cd->np_nalloc > 0)
3501             {
3502                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3503             }
3504             srenew(cd->ind, np);
3505             for (i = cd->np_nalloc; i < np; i++)
3506             {
3507                 cd->ind[i].index  = NULL;
3508                 cd->ind[i].nalloc = 0;
3509             }
3510             cd->np_nalloc = np;
3511         }
3512         cd->np = np;
3513     }
3514 }
3515
3516
3517 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3518                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3519                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3520                               gmx_wallcycle_t wcycle)
3521 {
3522     gmx_domdec_comm_t *comm;
3523     int                d;
3524     ivec               npulse;
3525
3526     comm = dd->comm;
3527
3528     /* Copy the old cell boundaries for the cg displacement check */
3529     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3530     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3531
3532     if (dlbIsOn(comm))
3533     {
3534         if (DDMASTER(dd))
3535         {
3536             check_box_size(dd, ddbox);
3537         }
3538         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3539     }
3540     else
3541     {
3542         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3543         realloc_comm_ind(dd, npulse);
3544     }
3545
3546     if (debug)
3547     {
3548         for (d = 0; d < DIM; d++)
3549         {
3550             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3551                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3552         }
3553     }
3554 }
3555
3556 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3557                                   gmx_ddbox_t *ddbox,
3558                                   rvec cell_ns_x0, rvec cell_ns_x1,
3559                                   gmx_int64_t step)
3560 {
3561     gmx_domdec_comm_t *comm;
3562     int                dim_ind, dim;
3563
3564     comm = dd->comm;
3565
3566     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3567     {
3568         dim = dd->dim[dim_ind];
3569
3570         /* Without PBC we don't have restrictions on the outer cells */
3571         if (!(dim >= ddbox->npbcdim &&
3572               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3573             dlbIsOn(comm) &&
3574             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3575             comm->cellsize_min[dim])
3576         {
3577             char buf[22];
3578             gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3579                       gmx_step_str(step, buf), dim2char(dim),
3580                       comm->cell_x1[dim] - comm->cell_x0[dim],
3581                       ddbox->skew_fac[dim],
3582                       dd->comm->cellsize_min[dim],
3583                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3584         }
3585     }
3586
3587     if ((dlbIsOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3588     {
3589         /* Communicate the boundaries and update cell_ns_x0/1 */
3590         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3591         if (dlbIsOn(dd->comm) && dd->ndim > 1)
3592         {
3593             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3594         }
3595     }
3596 }
3597
3598 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3599 {
3600     if (YY < npbcdim)
3601     {
3602         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3603     }
3604     else
3605     {
3606         tcm[YY][XX] = 0;
3607     }
3608     if (ZZ < npbcdim)
3609     {
3610         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3611         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3612     }
3613     else
3614     {
3615         tcm[ZZ][XX] = 0;
3616         tcm[ZZ][YY] = 0;
3617     }
3618 }
3619
3620 static void check_screw_box(matrix box)
3621 {
3622     /* Mathematical limitation */
3623     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3624     {
3625         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3626     }
3627
3628     /* Limitation due to the asymmetry of the eighth shell method */
3629     if (box[ZZ][YY] != 0)
3630     {
3631         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3632     }
3633 }
3634
3635 static void distribute_cg(FILE *fplog,
3636                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3637                           gmx_domdec_t *dd)
3638 {
3639     gmx_domdec_master_t *ma;
3640     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3641     int                  i, icg, j, k, k0, k1, d;
3642     matrix               tcm;
3643     rvec                 cg_cm;
3644     ivec                 ind;
3645     real                 nrcg, inv_ncg, pos_d;
3646     int                 *cgindex;
3647     gmx_bool             bScrew;
3648
3649     ma = dd->ma;
3650
3651     snew(tmp_nalloc, dd->nnodes);
3652     snew(tmp_ind, dd->nnodes);
3653     for (i = 0; i < dd->nnodes; i++)
3654     {
3655         tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3656         snew(tmp_ind[i], tmp_nalloc[i]);
3657     }
3658
3659     /* Clear the count */
3660     for (i = 0; i < dd->nnodes; i++)
3661     {
3662         ma->ncg[i] = 0;
3663         ma->nat[i] = 0;
3664     }
3665
3666     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3667
3668     cgindex = cgs->index;
3669
3670     /* Compute the center of geometry for all charge groups */
3671     for (icg = 0; icg < cgs->nr; icg++)
3672     {
3673         k0      = cgindex[icg];
3674         k1      = cgindex[icg+1];
3675         nrcg    = k1 - k0;
3676         if (nrcg == 1)
3677         {
3678             copy_rvec(pos[k0], cg_cm);
3679         }
3680         else
3681         {
3682             inv_ncg = 1.0/nrcg;
3683
3684             clear_rvec(cg_cm);
3685             for (k = k0; (k < k1); k++)
3686             {
3687                 rvec_inc(cg_cm, pos[k]);
3688             }
3689             for (d = 0; (d < DIM); d++)
3690             {
3691                 cg_cm[d] *= inv_ncg;
3692             }
3693         }
3694         /* Put the charge group in the box and determine the cell index */
3695         for (d = DIM-1; d >= 0; d--)
3696         {
3697             pos_d = cg_cm[d];
3698             if (d < dd->npbcdim)
3699             {
3700                 bScrew = (dd->bScrewPBC && d == XX);
3701                 if (tric_dir[d] && dd->nc[d] > 1)
3702                 {
3703                     /* Use triclinic coordintates for this dimension */
3704                     for (j = d+1; j < DIM; j++)
3705                     {
3706                         pos_d += cg_cm[j]*tcm[j][d];
3707                     }
3708                 }
3709                 while (pos_d >= box[d][d])
3710                 {
3711                     pos_d -= box[d][d];
3712                     rvec_dec(cg_cm, box[d]);
3713                     if (bScrew)
3714                     {
3715                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3716                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3717                     }
3718                     for (k = k0; (k < k1); k++)
3719                     {
3720                         rvec_dec(pos[k], box[d]);
3721                         if (bScrew)
3722                         {
3723                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3724                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3725                         }
3726                     }
3727                 }
3728                 while (pos_d < 0)
3729                 {
3730                     pos_d += box[d][d];
3731                     rvec_inc(cg_cm, box[d]);
3732                     if (bScrew)
3733                     {
3734                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3735                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3736                     }
3737                     for (k = k0; (k < k1); k++)
3738                     {
3739                         rvec_inc(pos[k], box[d]);
3740                         if (bScrew)
3741                         {
3742                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3743                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3744                         }
3745                     }
3746                 }
3747             }
3748             /* This could be done more efficiently */
3749             ind[d] = 0;
3750             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3751             {
3752                 ind[d]++;
3753             }
3754         }
3755         i = dd_index(dd->nc, ind);
3756         if (ma->ncg[i] == tmp_nalloc[i])
3757         {
3758             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3759             srenew(tmp_ind[i], tmp_nalloc[i]);
3760         }
3761         tmp_ind[i][ma->ncg[i]] = icg;
3762         ma->ncg[i]++;
3763         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3764     }
3765
3766     k1 = 0;
3767     for (i = 0; i < dd->nnodes; i++)
3768     {
3769         ma->index[i] = k1;
3770         for (k = 0; k < ma->ncg[i]; k++)
3771         {
3772             ma->cg[k1++] = tmp_ind[i][k];
3773         }
3774     }
3775     ma->index[dd->nnodes] = k1;
3776
3777     for (i = 0; i < dd->nnodes; i++)
3778     {
3779         sfree(tmp_ind[i]);
3780     }
3781     sfree(tmp_ind);
3782     sfree(tmp_nalloc);
3783
3784     if (fplog)
3785     {
3786         // Use double for the sums to avoid natoms^2 overflowing
3787         // (65537^2 > 2^32)
3788         int    nat_sum, nat_min, nat_max;
3789         double nat2_sum;
3790
3791         nat_sum  = 0;
3792         nat2_sum = 0;
3793         nat_min  = ma->nat[0];
3794         nat_max  = ma->nat[0];
3795         for (i = 0; i < dd->nnodes; i++)
3796         {
3797             nat_sum  += ma->nat[i];
3798             // cast to double to avoid integer overflows when squaring
3799             nat2_sum += gmx::square(static_cast<double>(ma->nat[i]));
3800             nat_min   = std::min(nat_min, ma->nat[i]);
3801             nat_max   = std::max(nat_max, ma->nat[i]);
3802         }
3803         nat_sum  /= dd->nnodes;
3804         nat2_sum /= dd->nnodes;
3805
3806         fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
3807                 dd->nnodes,
3808                 nat_sum,
3809                 static_cast<int>(std::sqrt(nat2_sum - gmx::square(static_cast<double>(nat_sum)) + 0.5)),
3810                 nat_min, nat_max);
3811     }
3812 }
3813
3814 static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
3815                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
3816                                 rvec pos[])
3817 {
3818     gmx_domdec_master_t *ma = NULL;
3819     ivec                 npulse;
3820     int                  i, cg_gl;
3821     int                 *ibuf, buf2[2] = { 0, 0 };
3822     gmx_bool             bMaster = DDMASTER(dd);
3823
3824     if (bMaster)
3825     {
3826         ma = dd->ma;
3827
3828         if (dd->bScrewPBC)
3829         {
3830             check_screw_box(box);
3831         }
3832
3833         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
3834
3835         distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
3836         for (i = 0; i < dd->nnodes; i++)
3837         {
3838             ma->ibuf[2*i]   = ma->ncg[i];
3839             ma->ibuf[2*i+1] = ma->nat[i];
3840         }
3841         ibuf = ma->ibuf;
3842     }
3843     else
3844     {
3845         ibuf = NULL;
3846     }
3847     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
3848
3849     dd->ncg_home = buf2[0];
3850     dd->nat_home = buf2[1];
3851     dd->ncg_tot  = dd->ncg_home;
3852     dd->nat_tot  = dd->nat_home;
3853     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3854     {
3855         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3856         srenew(dd->index_gl, dd->cg_nalloc);
3857         srenew(dd->cgindex, dd->cg_nalloc+1);
3858     }
3859     if (bMaster)
3860     {
3861         for (i = 0; i < dd->nnodes; i++)
3862         {
3863             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
3864             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3865         }
3866     }
3867
3868     dd_scatterv(dd,
3869                 bMaster ? ma->ibuf : NULL,
3870                 bMaster ? ma->ibuf+dd->nnodes : NULL,
3871                 bMaster ? ma->cg : NULL,
3872                 dd->ncg_home*sizeof(int), dd->index_gl);
3873
3874     /* Determine the home charge group sizes */
3875     dd->cgindex[0] = 0;
3876     for (i = 0; i < dd->ncg_home; i++)
3877     {
3878         cg_gl            = dd->index_gl[i];
3879         dd->cgindex[i+1] =
3880             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3881     }
3882
3883     if (debug)
3884     {
3885         fprintf(debug, "Home charge groups:\n");
3886         for (i = 0; i < dd->ncg_home; i++)
3887         {
3888             fprintf(debug, " %d", dd->index_gl[i]);
3889             if (i % 10 == 9)
3890             {
3891                 fprintf(debug, "\n");
3892             }
3893         }
3894         fprintf(debug, "\n");
3895     }
3896 }
3897
3898 static int compact_and_copy_vec_at(int ncg, int *move,
3899                                    int *cgindex,
3900                                    int nvec, int vec,
3901                                    rvec *src, gmx_domdec_comm_t *comm,
3902                                    gmx_bool bCompact)
3903 {
3904     int m, icg, i, i0, i1, nrcg;
3905     int home_pos;
3906     int pos_vec[DIM*2];
3907
3908     home_pos = 0;
3909
3910     for (m = 0; m < DIM*2; m++)
3911     {
3912         pos_vec[m] = 0;
3913     }
3914
3915     i0 = 0;
3916     for (icg = 0; icg < ncg; icg++)
3917     {
3918         i1 = cgindex[icg+1];
3919         m  = move[icg];
3920         if (m == -1)
3921         {
3922             if (bCompact)
3923             {
3924                 /* Compact the home array in place */
3925                 for (i = i0; i < i1; i++)
3926                 {
3927                     copy_rvec(src[i], src[home_pos++]);
3928                 }
3929             }
3930         }
3931         else
3932         {
3933             /* Copy to the communication buffer */
3934             nrcg        = i1 - i0;
3935             pos_vec[m] += 1 + vec*nrcg;
3936             for (i = i0; i < i1; i++)
3937             {
3938                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
3939             }
3940             pos_vec[m] += (nvec - vec - 1)*nrcg;
3941         }
3942         if (!bCompact)
3943         {
3944             home_pos += i1 - i0;
3945         }
3946         i0 = i1;
3947     }
3948
3949     return home_pos;
3950 }
3951
3952 static int compact_and_copy_vec_cg(int ncg, int *move,
3953                                    int *cgindex,
3954                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
3955                                    gmx_bool bCompact)
3956 {
3957     int m, icg, i0, i1, nrcg;
3958     int home_pos;
3959     int pos_vec[DIM*2];
3960
3961     home_pos = 0;
3962
3963     for (m = 0; m < DIM*2; m++)
3964     {
3965         pos_vec[m] = 0;
3966     }
3967
3968     i0 = 0;
3969     for (icg = 0; icg < ncg; icg++)
3970     {
3971         i1 = cgindex[icg+1];
3972         m  = move[icg];
3973         if (m == -1)
3974         {
3975             if (bCompact)
3976             {
3977                 /* Compact the home array in place */
3978                 copy_rvec(src[icg], src[home_pos++]);
3979             }
3980         }
3981         else
3982         {
3983             nrcg = i1 - i0;
3984             /* Copy to the communication buffer */
3985             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
3986             pos_vec[m] += 1 + nrcg*nvec;
3987         }
3988         i0 = i1;
3989     }
3990     if (!bCompact)
3991     {
3992         home_pos = ncg;
3993     }
3994
3995     return home_pos;
3996 }
3997
3998 static int compact_ind(int ncg, int *move,
3999                        int *index_gl, int *cgindex,
4000                        int *gatindex,
4001                        gmx_ga2la_t *ga2la, char *bLocalCG,
4002                        int *cginfo)
4003 {
4004     int cg, nat, a0, a1, a, a_gl;
4005     int home_pos;
4006
4007     home_pos = 0;
4008     nat      = 0;
4009     for (cg = 0; cg < ncg; cg++)
4010     {
4011         a0 = cgindex[cg];
4012         a1 = cgindex[cg+1];
4013         if (move[cg] == -1)
4014         {
4015             /* Compact the home arrays in place.
4016              * Anything that can be done here avoids access to global arrays.
4017              */
4018             cgindex[home_pos] = nat;
4019             for (a = a0; a < a1; a++)
4020             {
4021                 a_gl          = gatindex[a];
4022                 gatindex[nat] = a_gl;
4023                 /* The cell number stays 0, so we don't need to set it */
4024                 ga2la_change_la(ga2la, a_gl, nat);
4025                 nat++;
4026             }
4027             index_gl[home_pos] = index_gl[cg];
4028             cginfo[home_pos]   = cginfo[cg];
4029             /* The charge group remains local, so bLocalCG does not change */
4030             home_pos++;
4031         }
4032         else
4033         {
4034             /* Clear the global indices */
4035             for (a = a0; a < a1; a++)
4036             {
4037                 ga2la_del(ga2la, gatindex[a]);
4038             }
4039             if (bLocalCG)
4040             {
4041                 bLocalCG[index_gl[cg]] = FALSE;
4042             }
4043         }
4044     }
4045     cgindex[home_pos] = nat;
4046
4047     return home_pos;
4048 }
4049
4050 static void clear_and_mark_ind(int ncg, int *move,
4051                                int *index_gl, int *cgindex, int *gatindex,
4052                                gmx_ga2la_t *ga2la, char *bLocalCG,
4053                                int *cell_index)
4054 {
4055     int cg, a0, a1, a;
4056
4057     for (cg = 0; cg < ncg; cg++)
4058     {
4059         if (move[cg] >= 0)
4060         {
4061             a0 = cgindex[cg];
4062             a1 = cgindex[cg+1];
4063             /* Clear the global indices */
4064             for (a = a0; a < a1; a++)
4065             {
4066                 ga2la_del(ga2la, gatindex[a]);
4067             }
4068             if (bLocalCG)
4069             {
4070                 bLocalCG[index_gl[cg]] = FALSE;
4071             }
4072             /* Signal that this cg has moved using the ns cell index.
4073              * Here we set it to -1. fill_grid will change it
4074              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4075              */
4076             cell_index[cg] = -1;
4077         }
4078     }
4079 }
4080
4081 static void print_cg_move(FILE *fplog,
4082                           gmx_domdec_t *dd,
4083                           gmx_int64_t step, int cg, int dim, int dir,
4084                           gmx_bool bHaveCgcmOld, real limitd,
4085                           rvec cm_old, rvec cm_new, real pos_d)
4086 {
4087     gmx_domdec_comm_t *comm;
4088     char               buf[22];
4089
4090     comm = dd->comm;
4091
4092     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4093     if (limitd > 0)
4094     {
4095         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4096                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4097                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4098     }
4099     else
4100     {
4101         /* We don't have a limiting distance available: don't print it */
4102         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4103                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4104                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4105     }
4106     fprintf(fplog, "distance out of cell %f\n",
4107             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4108     if (bHaveCgcmOld)
4109     {
4110         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4111                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4112     }
4113     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4114             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4115     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4116             dim2char(dim),
4117             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4118     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4119             dim2char(dim),
4120             comm->cell_x0[dim], comm->cell_x1[dim]);
4121 }
4122
4123 static void cg_move_error(FILE *fplog,
4124                           gmx_domdec_t *dd,
4125                           gmx_int64_t step, int cg, int dim, int dir,
4126                           gmx_bool bHaveCgcmOld, real limitd,
4127                           rvec cm_old, rvec cm_new, real pos_d)
4128 {
4129     if (fplog)
4130     {
4131         print_cg_move(fplog, dd, step, cg, dim, dir,
4132                       bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4133     }
4134     print_cg_move(stderr, dd, step, cg, dim, dir,
4135                   bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4136     gmx_fatal(FARGS,
4137               "%s moved too far between two domain decomposition steps\n"
4138               "This usually means that your system is not well equilibrated",
4139               dd->comm->bCGs ? "A charge group" : "An atom");
4140 }
4141
4142 static void rotate_state_atom(t_state *state, int a)
4143 {
4144     int est;
4145
4146     for (est = 0; est < estNR; est++)
4147     {
4148         if (EST_DISTR(est) && (state->flags & (1<<est)))
4149         {
4150             switch (est)
4151             {
4152                 case estX:
4153                     /* Rotate the complete state; for a rectangular box only */
4154                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4155                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4156                     break;
4157                 case estV:
4158                     state->v[a][YY] = -state->v[a][YY];
4159                     state->v[a][ZZ] = -state->v[a][ZZ];
4160                     break;
4161                 case est_SDX_NOTSUPPORTED:
4162                     break;
4163                 case estCGP:
4164                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4165                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4166                     break;
4167                 case estDISRE_INITF:
4168                 case estDISRE_RM3TAV:
4169                 case estORIRE_INITF:
4170                 case estORIRE_DTAV:
4171                     /* These are distances, so not affected by rotation */
4172                     break;
4173                 default:
4174                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4175             }
4176         }
4177     }
4178 }
4179
4180 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4181 {
4182     if (natoms > comm->moved_nalloc)
4183     {
4184         /* Contents should be preserved here */
4185         comm->moved_nalloc = over_alloc_dd(natoms);
4186         srenew(comm->moved, comm->moved_nalloc);
4187     }
4188
4189     return comm->moved;
4190 }
4191
4192 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4193                          gmx_domdec_t *dd,
4194                          t_state *state,
4195                          ivec tric_dir, matrix tcm,
4196                          rvec cell_x0, rvec cell_x1,
4197                          rvec limitd, rvec limit0, rvec limit1,
4198                          const int *cgindex,
4199                          int cg_start, int cg_end,
4200                          rvec *cg_cm,
4201                          int *move)
4202 {
4203     int      npbcdim;
4204     int      cg, k, k0, k1, d, dim, d2;
4205     int      mc, nrcg;
4206     int      flag;
4207     gmx_bool bScrew;
4208     ivec     dev;
4209     real     inv_ncg, pos_d;
4210     rvec     cm_new;
4211
4212     npbcdim = dd->npbcdim;
4213
4214     for (cg = cg_start; cg < cg_end; cg++)
4215     {
4216         k0   = cgindex[cg];
4217         k1   = cgindex[cg+1];
4218         nrcg = k1 - k0;
4219         if (nrcg == 1)
4220         {
4221             copy_rvec(state->x[k0], cm_new);
4222         }
4223         else
4224         {
4225             inv_ncg = 1.0/nrcg;
4226
4227             clear_rvec(cm_new);
4228             for (k = k0; (k < k1); k++)
4229             {
4230                 rvec_inc(cm_new, state->x[k]);
4231             }
4232             for (d = 0; (d < DIM); d++)
4233             {
4234                 cm_new[d] = inv_ncg*cm_new[d];
4235             }
4236         }
4237
4238         clear_ivec(dev);
4239         /* Do pbc and check DD cell boundary crossings */
4240         for (d = DIM-1; d >= 0; d--)
4241         {
4242             if (dd->nc[d] > 1)
4243             {
4244                 bScrew = (dd->bScrewPBC && d == XX);
4245                 /* Determine the location of this cg in lattice coordinates */
4246                 pos_d = cm_new[d];
4247                 if (tric_dir[d])
4248                 {
4249                     for (d2 = d+1; d2 < DIM; d2++)
4250                     {
4251                         pos_d += cm_new[d2]*tcm[d2][d];
4252                     }
4253                 }
4254                 /* Put the charge group in the triclinic unit-cell */
4255                 if (pos_d >= cell_x1[d])
4256                 {
4257                     if (pos_d >= limit1[d])
4258                     {
4259                         cg_move_error(fplog, dd, step, cg, d, 1,
4260                                       cg_cm != as_rvec_array(state->x.data()), limitd[d],
4261                                       cg_cm[cg], cm_new, pos_d);
4262                     }
4263                     dev[d] = 1;
4264                     if (dd->ci[d] == dd->nc[d] - 1)
4265                     {
4266                         rvec_dec(cm_new, state->box[d]);
4267                         if (bScrew)
4268                         {
4269                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4270                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4271                         }
4272                         for (k = k0; (k < k1); k++)
4273                         {
4274                             rvec_dec(state->x[k], state->box[d]);
4275                             if (bScrew)
4276                             {
4277                                 rotate_state_atom(state, k);
4278                             }
4279                         }
4280                     }
4281                 }
4282                 else if (pos_d < cell_x0[d])
4283                 {
4284                     if (pos_d < limit0[d])
4285                     {
4286                         cg_move_error(fplog, dd, step, cg, d, -1,
4287                                       cg_cm != as_rvec_array(state->x.data()), limitd[d],
4288                                       cg_cm[cg], cm_new, pos_d);
4289                     }
4290                     dev[d] = -1;
4291                     if (dd->ci[d] == 0)
4292                     {
4293                         rvec_inc(cm_new, state->box[d]);
4294                         if (bScrew)
4295                         {
4296                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4297                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4298                         }
4299                         for (k = k0; (k < k1); k++)
4300                         {
4301                             rvec_inc(state->x[k], state->box[d]);
4302                             if (bScrew)
4303                             {
4304                                 rotate_state_atom(state, k);
4305                             }
4306                         }
4307                     }
4308                 }
4309             }
4310             else if (d < npbcdim)
4311             {
4312                 /* Put the charge group in the rectangular unit-cell */
4313                 while (cm_new[d] >= state->box[d][d])
4314                 {
4315                     rvec_dec(cm_new, state->box[d]);
4316                     for (k = k0; (k < k1); k++)
4317                     {
4318                         rvec_dec(state->x[k], state->box[d]);
4319                     }
4320                 }
4321                 while (cm_new[d] < 0)
4322                 {
4323                     rvec_inc(cm_new, state->box[d]);
4324                     for (k = k0; (k < k1); k++)
4325                     {
4326                         rvec_inc(state->x[k], state->box[d]);
4327                     }
4328                 }
4329             }
4330         }
4331
4332         copy_rvec(cm_new, cg_cm[cg]);
4333
4334         /* Determine where this cg should go */
4335         flag = 0;
4336         mc   = -1;
4337         for (d = 0; d < dd->ndim; d++)
4338         {
4339             dim = dd->dim[d];
4340             if (dev[dim] == 1)
4341             {
4342                 flag |= DD_FLAG_FW(d);
4343                 if (mc == -1)
4344                 {
4345                     mc = d*2;
4346                 }
4347             }
4348             else if (dev[dim] == -1)
4349             {
4350                 flag |= DD_FLAG_BW(d);
4351                 if (mc == -1)
4352                 {
4353                     if (dd->nc[dim] > 2)
4354                     {
4355                         mc = d*2 + 1;
4356                     }
4357                     else
4358                     {
4359                         mc = d*2;
4360                     }
4361                 }
4362             }
4363         }
4364         /* Temporarily store the flag in move */
4365         move[cg] = mc + flag;
4366     }
4367 }
4368
4369 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4370                                gmx_domdec_t *dd, ivec tric_dir,
4371                                t_state *state, PaddedRVecVector *f,
4372                                t_forcerec *fr,
4373                                gmx_bool bCompact,
4374                                t_nrnb *nrnb,
4375                                int *ncg_stay_home,
4376                                int *ncg_moved)
4377 {
4378     int               *move;
4379     int                npbcdim;
4380     int                ncg[DIM*2], nat[DIM*2];
4381     int                c, i, cg, k, d, dim, dim2, dir, d2, d3;
4382     int                mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4383     int                sbuf[2], rbuf[2];
4384     int                home_pos_cg, home_pos_at, buf_pos;
4385     int                flag;
4386     gmx_bool           bV = FALSE, bCGP = FALSE;
4387     real               pos_d;
4388     matrix             tcm;
4389     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1;
4390     int               *cgindex;
4391     cginfo_mb_t       *cginfo_mb;
4392     gmx_domdec_comm_t *comm;
4393     int               *moved;
4394     int                nthread, thread;
4395
4396     if (dd->bScrewPBC)
4397     {
4398         check_screw_box(state->box);
4399     }
4400
4401     comm  = dd->comm;
4402     if (fr->cutoff_scheme == ecutsGROUP)
4403     {
4404         cg_cm = fr->cg_cm;
4405     }
4406
4407     for (i = 0; i < estNR; i++)
4408     {
4409         if (EST_DISTR(i))
4410         {
4411             switch (i)
4412             {
4413                 case estX: /* Always present */ break;
4414                 case estV:   bV   = (state->flags & (1<<i)); break;
4415                 case est_SDX_NOTSUPPORTED: break;
4416                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4417                 case estLD_RNG:
4418                 case estLD_RNGI:
4419                 case estDISRE_INITF:
4420                 case estDISRE_RM3TAV:
4421                 case estORIRE_INITF:
4422                 case estORIRE_DTAV:
4423                     /* No processing required */
4424                     break;
4425                 default:
4426                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4427             }
4428         }
4429     }
4430
4431     if (dd->ncg_tot > comm->nalloc_int)
4432     {
4433         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4434         srenew(comm->buf_int, comm->nalloc_int);
4435     }
4436     move = comm->buf_int;
4437
4438     /* Clear the count */
4439     for (c = 0; c < dd->ndim*2; c++)
4440     {
4441         ncg[c] = 0;
4442         nat[c] = 0;
4443     }
4444
4445     npbcdim = dd->npbcdim;
4446
4447     for (d = 0; (d < DIM); d++)
4448     {
4449         limitd[d] = dd->comm->cellsize_min[d];
4450         if (d >= npbcdim && dd->ci[d] == 0)
4451         {
4452             cell_x0[d] = -GMX_FLOAT_MAX;
4453         }
4454         else
4455         {
4456             cell_x0[d] = comm->cell_x0[d];
4457         }
4458         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4459         {
4460             cell_x1[d] = GMX_FLOAT_MAX;
4461         }
4462         else
4463         {
4464             cell_x1[d] = comm->cell_x1[d];
4465         }
4466         if (d < npbcdim)
4467         {
4468             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4469             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4470         }
4471         else
4472         {
4473             /* We check after communication if a charge group moved
4474              * more than one cell. Set the pre-comm check limit to float_max.
4475              */
4476             limit0[d] = -GMX_FLOAT_MAX;
4477             limit1[d] =  GMX_FLOAT_MAX;
4478         }
4479     }
4480
4481     make_tric_corr_matrix(npbcdim, state->box, tcm);
4482
4483     cgindex = dd->cgindex;
4484
4485     nthread = gmx_omp_nthreads_get(emntDomdec);
4486
4487     /* Compute the center of geometry for all home charge groups
4488      * and put them in the box and determine where they should go.
4489      */
4490 #pragma omp parallel for num_threads(nthread) schedule(static)
4491     for (thread = 0; thread < nthread; thread++)
4492     {
4493         try
4494         {
4495             calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4496                          cell_x0, cell_x1, limitd, limit0, limit1,
4497                          cgindex,
4498                          ( thread   *dd->ncg_home)/nthread,
4499                          ((thread+1)*dd->ncg_home)/nthread,
4500                          fr->cutoff_scheme == ecutsGROUP ? cg_cm : as_rvec_array(state->x.data()),
4501                          move);
4502         }
4503         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4504     }
4505
4506     for (cg = 0; cg < dd->ncg_home; cg++)
4507     {
4508         if (move[cg] >= 0)
4509         {
4510             mc       = move[cg];
4511             flag     = mc & ~DD_FLAG_NRCG;
4512             mc       = mc & DD_FLAG_NRCG;
4513             move[cg] = mc;
4514
4515             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4516             {
4517                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4518                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4519             }
4520             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4521             /* We store the cg size in the lower 16 bits
4522              * and the place where the charge group should go
4523              * in the next 6 bits. This saves some communication volume.
4524              */
4525             nrcg = cgindex[cg+1] - cgindex[cg];
4526             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4527             ncg[mc] += 1;
4528             nat[mc] += nrcg;
4529         }
4530     }
4531
4532     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4533     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4534
4535     *ncg_moved = 0;
4536     for (i = 0; i < dd->ndim*2; i++)
4537     {
4538         *ncg_moved += ncg[i];
4539     }
4540
4541     nvec = 1;
4542     if (bV)
4543     {
4544         nvec++;
4545     }
4546     if (bCGP)
4547     {
4548         nvec++;
4549     }
4550
4551     /* Make sure the communication buffers are large enough */
4552     for (mc = 0; mc < dd->ndim*2; mc++)
4553     {
4554         nvr = ncg[mc] + nat[mc]*nvec;
4555         if (nvr > comm->cgcm_state_nalloc[mc])
4556         {
4557             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4558             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4559         }
4560     }
4561
4562     switch (fr->cutoff_scheme)
4563     {
4564         case ecutsGROUP:
4565             /* Recalculating cg_cm might be cheaper than communicating,
4566              * but that could give rise to rounding issues.
4567              */
4568             home_pos_cg =
4569                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4570                                         nvec, cg_cm, comm, bCompact);
4571             break;
4572         case ecutsVERLET:
4573             /* Without charge groups we send the moved atom coordinates
4574              * over twice. This is so the code below can be used without
4575              * many conditionals for both for with and without charge groups.
4576              */
4577             home_pos_cg =
4578                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4579                                         nvec, as_rvec_array(state->x.data()), comm, FALSE);
4580             if (bCompact)
4581             {
4582                 home_pos_cg -= *ncg_moved;
4583             }
4584             break;
4585         default:
4586             gmx_incons("unimplemented");
4587             home_pos_cg = 0;
4588     }
4589
4590     vec         = 0;
4591     home_pos_at =
4592         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4593                                 nvec, vec++, as_rvec_array(state->x.data()),
4594                                 comm, bCompact);
4595     if (bV)
4596     {
4597         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4598                                 nvec, vec++, as_rvec_array(state->v.data()),
4599                                 comm, bCompact);
4600     }
4601     if (bCGP)
4602     {
4603         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4604                                 nvec, vec++, as_rvec_array(state->cg_p.data()),
4605                                 comm, bCompact);
4606     }
4607
4608     if (bCompact)
4609     {
4610         compact_ind(dd->ncg_home, move,
4611                     dd->index_gl, dd->cgindex, dd->gatindex,
4612                     dd->ga2la, comm->bLocalCG,
4613                     fr->cginfo);
4614     }
4615     else
4616     {
4617         if (fr->cutoff_scheme == ecutsVERLET)
4618         {
4619             moved = get_moved(comm, dd->ncg_home);
4620
4621             for (k = 0; k < dd->ncg_home; k++)
4622             {
4623                 moved[k] = 0;
4624             }
4625         }
4626         else
4627         {
4628             moved = fr->ns->grid->cell_index;
4629         }
4630
4631         clear_and_mark_ind(dd->ncg_home, move,
4632                            dd->index_gl, dd->cgindex, dd->gatindex,
4633                            dd->ga2la, comm->bLocalCG,
4634                            moved);
4635     }
4636
4637     cginfo_mb = fr->cginfo_mb;
4638
4639     *ncg_stay_home = home_pos_cg;
4640     for (d = 0; d < dd->ndim; d++)
4641     {
4642         dim      = dd->dim[d];
4643         ncg_recv = 0;
4644         nvr      = 0;
4645         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4646         {
4647             cdd = d*2 + dir;
4648             /* Communicate the cg and atom counts */
4649             sbuf[0] = ncg[cdd];
4650             sbuf[1] = nat[cdd];
4651             if (debug)
4652             {
4653                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4654                         d, dir, sbuf[0], sbuf[1]);
4655             }
4656             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4657
4658             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4659             {
4660                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4661                 srenew(comm->buf_int, comm->nalloc_int);
4662             }
4663
4664             /* Communicate the charge group indices, sizes and flags */
4665             dd_sendrecv_int(dd, d, dir,
4666                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4667                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4668
4669             nvs = ncg[cdd] + nat[cdd]*nvec;
4670             i   = rbuf[0]  + rbuf[1] *nvec;
4671             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4672
4673             /* Communicate cgcm and state */
4674             dd_sendrecv_rvec(dd, d, dir,
4675                              comm->cgcm_state[cdd], nvs,
4676                              comm->vbuf.v+nvr, i);
4677             ncg_recv += rbuf[0];
4678             nvr      += i;
4679         }
4680
4681         dd_check_alloc_ncg(fr, state, f, home_pos_cg + ncg_recv);
4682         if (fr->cutoff_scheme == ecutsGROUP)
4683         {
4684             /* Here we resize to more than necessary and shrink later */
4685             dd_resize_state(state, f, home_pos_at + ncg_recv*MAX_CGCGSIZE);
4686         }
4687
4688         /* Process the received charge groups */
4689         buf_pos = 0;
4690         for (cg = 0; cg < ncg_recv; cg++)
4691         {
4692             flag = comm->buf_int[cg*DD_CGIBS+1];
4693
4694             if (dim >= npbcdim && dd->nc[dim] > 2)
4695             {
4696                 /* No pbc in this dim and more than one domain boundary.
4697                  * We do a separate check if a charge group didn't move too far.
4698                  */
4699                 if (((flag & DD_FLAG_FW(d)) &&
4700                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4701                     ((flag & DD_FLAG_BW(d)) &&
4702                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4703                 {
4704                     cg_move_error(fplog, dd, step, cg, dim,
4705                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4706                                   fr->cutoff_scheme == ecutsGROUP, 0,
4707                                   comm->vbuf.v[buf_pos],
4708                                   comm->vbuf.v[buf_pos],
4709                                   comm->vbuf.v[buf_pos][dim]);
4710                 }
4711             }
4712
4713             mc = -1;
4714             if (d < dd->ndim-1)
4715             {
4716                 /* Check which direction this cg should go */
4717                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4718                 {
4719                     if (dlbIsOn(dd->comm))
4720                     {
4721                         /* The cell boundaries for dimension d2 are not equal
4722                          * for each cell row of the lower dimension(s),
4723                          * therefore we might need to redetermine where
4724                          * this cg should go.
4725                          */
4726                         dim2 = dd->dim[d2];
4727                         /* If this cg crosses the box boundary in dimension d2
4728                          * we can use the communicated flag, so we do not
4729                          * have to worry about pbc.
4730                          */
4731                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4732                                (flag & DD_FLAG_FW(d2))) ||
4733                               (dd->ci[dim2] == 0 &&
4734                                (flag & DD_FLAG_BW(d2)))))
4735                         {
4736                             /* Clear the two flags for this dimension */
4737                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4738                             /* Determine the location of this cg
4739                              * in lattice coordinates
4740                              */
4741                             pos_d = comm->vbuf.v[buf_pos][dim2];
4742                             if (tric_dir[dim2])
4743                             {
4744                                 for (d3 = dim2+1; d3 < DIM; d3++)
4745                                 {
4746                                     pos_d +=
4747                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4748                                 }
4749                             }
4750                             /* Check of we are not at the box edge.
4751                              * pbc is only handled in the first step above,
4752                              * but this check could move over pbc while
4753                              * the first step did not due to different rounding.
4754                              */
4755                             if (pos_d >= cell_x1[dim2] &&
4756                                 dd->ci[dim2] != dd->nc[dim2]-1)
4757                             {
4758                                 flag |= DD_FLAG_FW(d2);
4759                             }
4760                             else if (pos_d < cell_x0[dim2] &&
4761                                      dd->ci[dim2] != 0)
4762                             {
4763                                 flag |= DD_FLAG_BW(d2);
4764                             }
4765                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4766                         }
4767                     }
4768                     /* Set to which neighboring cell this cg should go */
4769                     if (flag & DD_FLAG_FW(d2))
4770                     {
4771                         mc = d2*2;
4772                     }
4773                     else if (flag & DD_FLAG_BW(d2))
4774                     {
4775                         if (dd->nc[dd->dim[d2]] > 2)
4776                         {
4777                             mc = d2*2+1;
4778                         }
4779                         else
4780                         {
4781                             mc = d2*2;
4782                         }
4783                     }
4784                 }
4785             }
4786
4787             nrcg = flag & DD_FLAG_NRCG;
4788             if (mc == -1)
4789             {
4790                 if (home_pos_cg+1 > dd->cg_nalloc)
4791                 {
4792                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4793                     srenew(dd->index_gl, dd->cg_nalloc);
4794                     srenew(dd->cgindex, dd->cg_nalloc+1);
4795                 }
4796                 /* Set the global charge group index and size */
4797                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
4798                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4799                 /* Copy the state from the buffer */
4800                 if (fr->cutoff_scheme == ecutsGROUP)
4801                 {
4802                     cg_cm = fr->cg_cm;
4803                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4804                 }
4805                 buf_pos++;
4806
4807                 /* Set the cginfo */
4808                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4809                                                    dd->index_gl[home_pos_cg]);
4810                 if (comm->bLocalCG)
4811                 {
4812                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4813                 }
4814
4815                 for (i = 0; i < nrcg; i++)
4816                 {
4817                     copy_rvec(comm->vbuf.v[buf_pos++],
4818                               state->x[home_pos_at+i]);
4819                 }
4820                 if (bV)
4821                 {
4822                     for (i = 0; i < nrcg; i++)
4823                     {
4824                         copy_rvec(comm->vbuf.v[buf_pos++],
4825                                   state->v[home_pos_at+i]);
4826                     }
4827                 }
4828                 if (bCGP)
4829                 {
4830                     for (i = 0; i < nrcg; i++)
4831                     {
4832                         copy_rvec(comm->vbuf.v[buf_pos++],
4833                                   state->cg_p[home_pos_at+i]);
4834                     }
4835                 }
4836                 home_pos_cg += 1;
4837                 home_pos_at += nrcg;
4838             }
4839             else
4840             {
4841                 /* Reallocate the buffers if necessary  */
4842                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4843                 {
4844                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4845                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4846                 }
4847                 nvr = ncg[mc] + nat[mc]*nvec;
4848                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4849                 {
4850                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4851                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4852                 }
4853                 /* Copy from the receive to the send buffers */
4854                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4855                        comm->buf_int + cg*DD_CGIBS,
4856                        DD_CGIBS*sizeof(int));
4857                 memcpy(comm->cgcm_state[mc][nvr],
4858                        comm->vbuf.v[buf_pos],
4859                        (1+nrcg*nvec)*sizeof(rvec));
4860                 buf_pos += 1 + nrcg*nvec;
4861                 ncg[mc] += 1;
4862                 nat[mc] += nrcg;
4863             }
4864         }
4865     }
4866
4867     /* With sorting (!bCompact) the indices are now only partially up to date
4868      * and ncg_home and nat_home are not the real count, since there are
4869      * "holes" in the arrays for the charge groups that moved to neighbors.
4870      */
4871     if (fr->cutoff_scheme == ecutsVERLET)
4872     {
4873         moved = get_moved(comm, home_pos_cg);
4874
4875         for (i = dd->ncg_home; i < home_pos_cg; i++)
4876         {
4877             moved[i] = 0;
4878         }
4879     }
4880     dd->ncg_home = home_pos_cg;
4881     dd->nat_home = home_pos_at;
4882
4883     if (fr->cutoff_scheme == ecutsGROUP && !bCompact)
4884     {
4885         /* We overallocated before, we need to set the right size here */
4886         dd_resize_state(state, f, dd->nat_home);
4887     }
4888
4889     if (debug)
4890     {
4891         fprintf(debug,
4892                 "Finished repartitioning: cgs moved out %d, new home %d\n",
4893                 *ncg_moved, dd->ncg_home-*ncg_moved);
4894
4895     }
4896 }
4897
4898 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
4899 {
4900     /* Note that the cycles value can be incorrect, either 0 or some
4901      * extremely large value, when our thread migrated to another core
4902      * with an unsynchronized cycle counter. If this happens less often
4903      * that once per nstlist steps, this will not cause issues, since
4904      * we later subtract the maximum value from the sum over nstlist steps.
4905      * A zero count will slightly lower the total, but that's a small effect.
4906      * Note that the main purpose of the subtraction of the maximum value
4907      * is to avoid throwing off the load balancing when stalls occur due
4908      * e.g. system activity or network congestion.
4909      */
4910     dd->comm->cycl[ddCycl] += cycles;
4911     dd->comm->cycl_n[ddCycl]++;
4912     if (cycles > dd->comm->cycl_max[ddCycl])
4913     {
4914         dd->comm->cycl_max[ddCycl] = cycles;
4915     }
4916 }
4917
4918 static double force_flop_count(t_nrnb *nrnb)
4919 {
4920     int         i;
4921     double      sum;
4922     const char *name;
4923
4924     sum = 0;
4925     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
4926     {
4927         /* To get closer to the real timings, we half the count
4928          * for the normal loops and again half it for water loops.
4929          */
4930         name = nrnb_str(i);
4931         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
4932         {
4933             sum += nrnb->n[i]*0.25*cost_nrnb(i);
4934         }
4935         else
4936         {
4937             sum += nrnb->n[i]*0.50*cost_nrnb(i);
4938         }
4939     }
4940     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
4941     {
4942         name = nrnb_str(i);
4943         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
4944         {
4945             sum += nrnb->n[i]*cost_nrnb(i);
4946         }
4947     }
4948     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
4949     {
4950         sum += nrnb->n[i]*cost_nrnb(i);
4951     }
4952
4953     return sum;
4954 }
4955
4956 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
4957 {
4958     if (dd->comm->eFlop)
4959     {
4960         dd->comm->flop -= force_flop_count(nrnb);
4961     }
4962 }
4963 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
4964 {
4965     if (dd->comm->eFlop)
4966     {
4967         dd->comm->flop += force_flop_count(nrnb);
4968         dd->comm->flop_n++;
4969     }
4970 }
4971
4972 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4973 {
4974     int i;
4975
4976     for (i = 0; i < ddCyclNr; i++)
4977     {
4978         dd->comm->cycl[i]     = 0;
4979         dd->comm->cycl_n[i]   = 0;
4980         dd->comm->cycl_max[i] = 0;
4981     }
4982     dd->comm->flop   = 0;
4983     dd->comm->flop_n = 0;
4984 }
4985
4986 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
4987 {
4988     gmx_domdec_comm_t *comm;
4989     domdec_load_t     *load;
4990     domdec_root_t     *root = NULL;
4991     int                d, dim, i, pos;
4992     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
4993     gmx_bool           bSepPME;
4994
4995     if (debug)
4996     {
4997         fprintf(debug, "get_load_distribution start\n");
4998     }
4999
5000     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5001
5002     comm = dd->comm;
5003
5004     bSepPME = (dd->pme_nodeid >= 0);
5005
5006     if (dd->ndim == 0 && bSepPME)
5007     {
5008         /* Without decomposition, but with PME nodes, we need the load */
5009         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
5010         comm->load[0].pme = comm->cycl[ddCyclPME];
5011     }
5012
5013     for (d = dd->ndim-1; d >= 0; d--)
5014     {
5015         dim = dd->dim[d];
5016         /* Check if we participate in the communication in this dimension */
5017         if (d == dd->ndim-1 ||
5018             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5019         {
5020             load = &comm->load[d];
5021             if (dlbIsOn(dd->comm))
5022             {
5023                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5024             }
5025             pos = 0;
5026             if (d == dd->ndim-1)
5027             {
5028                 sbuf[pos++] = dd_force_load(comm);
5029                 sbuf[pos++] = sbuf[0];
5030                 if (dlbIsOn(dd->comm))
5031                 {
5032                     sbuf[pos++] = sbuf[0];
5033                     sbuf[pos++] = cell_frac;
5034                     if (d > 0)
5035                     {
5036                         sbuf[pos++] = comm->cell_f_max0[d];
5037                         sbuf[pos++] = comm->cell_f_min1[d];
5038                     }
5039                 }
5040                 if (bSepPME)
5041                 {
5042                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5043                     sbuf[pos++] = comm->cycl[ddCyclPME];
5044                 }
5045             }
5046             else
5047             {
5048                 sbuf[pos++] = comm->load[d+1].sum;
5049                 sbuf[pos++] = comm->load[d+1].max;
5050                 if (dlbIsOn(dd->comm))
5051                 {
5052                     sbuf[pos++] = comm->load[d+1].sum_m;
5053                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5054                     sbuf[pos++] = comm->load[d+1].flags;
5055                     if (d > 0)
5056                     {
5057                         sbuf[pos++] = comm->cell_f_max0[d];
5058                         sbuf[pos++] = comm->cell_f_min1[d];
5059                     }
5060                 }
5061                 if (bSepPME)
5062                 {
5063                     sbuf[pos++] = comm->load[d+1].mdf;
5064                     sbuf[pos++] = comm->load[d+1].pme;
5065                 }
5066             }
5067             load->nload = pos;
5068             /* Communicate a row in DD direction d.
5069              * The communicators are setup such that the root always has rank 0.
5070              */
5071 #if GMX_MPI
5072             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5073                        load->load, load->nload*sizeof(float), MPI_BYTE,
5074                        0, comm->mpi_comm_load[d]);
5075 #endif
5076             if (dd->ci[dim] == dd->master_ci[dim])
5077             {
5078                 /* We are the root, process this row */
5079                 if (dlbIsOn(comm))
5080                 {
5081                     root = comm->root[d];
5082                 }
5083                 load->sum      = 0;
5084                 load->max      = 0;
5085                 load->sum_m    = 0;
5086                 load->cvol_min = 1;
5087                 load->flags    = 0;
5088                 load->mdf      = 0;
5089                 load->pme      = 0;
5090                 pos            = 0;
5091                 for (i = 0; i < dd->nc[dim]; i++)
5092                 {
5093                     load->sum += load->load[pos++];
5094                     load->max  = std::max(load->max, load->load[pos]);
5095                     pos++;
5096                     if (dlbIsOn(dd->comm))
5097                     {
5098                         if (root->bLimited)
5099                         {
5100                             /* This direction could not be load balanced properly,
5101                              * therefore we need to use the maximum iso the average load.
5102                              */
5103                             load->sum_m = std::max(load->sum_m, load->load[pos]);
5104                         }
5105                         else
5106                         {
5107                             load->sum_m += load->load[pos];
5108                         }
5109                         pos++;
5110                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
5111                         pos++;
5112                         if (d < dd->ndim-1)
5113                         {
5114                             load->flags = (int)(load->load[pos++] + 0.5);
5115                         }
5116                         if (d > 0)
5117                         {
5118                             root->cell_f_max0[i] = load->load[pos++];
5119                             root->cell_f_min1[i] = load->load[pos++];
5120                         }
5121                     }
5122                     if (bSepPME)
5123                     {
5124                         load->mdf = std::max(load->mdf, load->load[pos]);
5125                         pos++;
5126                         load->pme = std::max(load->pme, load->load[pos]);
5127                         pos++;
5128                     }
5129                 }
5130                 if (dlbIsOn(comm) && root->bLimited)
5131                 {
5132                     load->sum_m *= dd->nc[dim];
5133                     load->flags |= (1<<d);
5134                 }
5135             }
5136         }
5137     }
5138
5139     if (DDMASTER(dd))
5140     {
5141         comm->nload      += dd_load_count(comm);
5142         comm->load_step  += comm->cycl[ddCyclStep];
5143         comm->load_sum   += comm->load[0].sum;
5144         comm->load_max   += comm->load[0].max;
5145         if (dlbIsOn(comm))
5146         {
5147             for (d = 0; d < dd->ndim; d++)
5148             {
5149                 if (comm->load[0].flags & (1<<d))
5150                 {
5151                     comm->load_lim[d]++;
5152                 }
5153             }
5154         }
5155         if (bSepPME)
5156         {
5157             comm->load_mdf += comm->load[0].mdf;
5158             comm->load_pme += comm->load[0].pme;
5159         }
5160     }
5161
5162     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5163
5164     if (debug)
5165     {
5166         fprintf(debug, "get_load_distribution finished\n");
5167     }
5168 }
5169
5170 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5171 {
5172     /* Return the relative performance loss on the total run time
5173      * due to the force calculation load imbalance.
5174      */
5175     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5176     {
5177         return
5178             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5179             (dd->comm->load_step*dd->nnodes);
5180     }
5181     else
5182     {
5183         return 0;
5184     }
5185 }
5186
5187 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5188 {
5189     char               buf[STRLEN];
5190     int                npp, npme, nnodes, d, limp;
5191     float              imbal, pme_f_ratio, lossf = 0, lossp = 0;
5192     gmx_bool           bLim;
5193     gmx_domdec_comm_t *comm;
5194
5195     comm = dd->comm;
5196     if (DDMASTER(dd) && comm->nload > 0)
5197     {
5198         npp    = dd->nnodes;
5199         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5200         nnodes = npp + npme;
5201         if (dd->nnodes > 1 && comm->load_sum > 0)
5202         {
5203             imbal  = comm->load_max*npp/comm->load_sum - 1;
5204             lossf  = dd_force_imb_perf_loss(dd);
5205             sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5206             fprintf(fplog, "%s", buf);
5207             fprintf(stderr, "\n");
5208             fprintf(stderr, "%s", buf);
5209             sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5210             fprintf(fplog, "%s", buf);
5211             fprintf(stderr, "%s", buf);
5212         }
5213         bLim = FALSE;
5214         if (dlbIsOn(comm))
5215         {
5216             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5217             for (d = 0; d < dd->ndim; d++)
5218             {
5219                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5220                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5221                 if (limp >= 50)
5222                 {
5223                     bLim = TRUE;
5224                 }
5225             }
5226             sprintf(buf+strlen(buf), "\n");
5227             fprintf(fplog, "%s", buf);
5228             fprintf(stderr, "%s", buf);
5229         }
5230         if (npme > 0 && comm->load_mdf > 0 && comm->load_step > 0)
5231         {
5232             pme_f_ratio = comm->load_pme/comm->load_mdf;
5233             lossp       = (comm->load_pme - comm->load_mdf)/comm->load_step;
5234             if (lossp <= 0)
5235             {
5236                 lossp *= (float)npme/(float)nnodes;
5237             }
5238             else
5239             {
5240                 lossp *= (float)npp/(float)nnodes;
5241             }
5242             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5243             fprintf(fplog, "%s", buf);
5244             fprintf(stderr, "%s", buf);
5245             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5246             fprintf(fplog, "%s", buf);
5247             fprintf(stderr, "%s", buf);
5248         }
5249         fprintf(fplog, "\n");
5250         fprintf(stderr, "\n");
5251
5252         if (lossf >= DD_PERF_LOSS_WARN)
5253         {
5254             sprintf(buf,
5255                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5256                     "      in the domain decomposition.\n", lossf*100);
5257             if (!dlbIsOn(comm))
5258             {
5259                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5260             }
5261             else if (bLim)
5262             {
5263                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5264             }
5265             fprintf(fplog, "%s\n", buf);
5266             fprintf(stderr, "%s\n", buf);
5267         }
5268         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
5269         {
5270             sprintf(buf,
5271                     "NOTE: %.1f %% performance was lost because the PME ranks\n"
5272                     "      had %s work to do than the PP ranks.\n"
5273                     "      You might want to %s the number of PME ranks\n"
5274                     "      or %s the cut-off and the grid spacing.\n",
5275                     fabs(lossp*100),
5276                     (lossp < 0) ? "less"     : "more",
5277                     (lossp < 0) ? "decrease" : "increase",
5278                     (lossp < 0) ? "decrease" : "increase");
5279             fprintf(fplog, "%s\n", buf);
5280             fprintf(stderr, "%s\n", buf);
5281         }
5282     }
5283 }
5284
5285 static float dd_vol_min(gmx_domdec_t *dd)
5286 {
5287     return dd->comm->load[0].cvol_min*dd->nnodes;
5288 }
5289
5290 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5291 {
5292     return dd->comm->load[0].flags;
5293 }
5294
5295 static float dd_f_imbal(gmx_domdec_t *dd)
5296 {
5297     if (dd->comm->load[0].sum > 0)
5298     {
5299         return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
5300     }
5301     else
5302     {
5303         /* Something is wrong in the cycle counting, report no load imbalance */
5304         return 0.0f;
5305     }
5306 }
5307
5308 float dd_pme_f_ratio(gmx_domdec_t *dd)
5309 {
5310     /* Should only be called on the DD master rank */
5311     assert(DDMASTER(dd));
5312
5313     if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
5314     {
5315         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5316     }
5317     else
5318     {
5319         return -1.0;
5320     }
5321 }
5322
5323 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5324 {
5325     int  flags, d;
5326     char buf[22];
5327
5328     flags = dd_load_flags(dd);
5329     if (flags)
5330     {
5331         fprintf(fplog,
5332                 "DD  load balancing is limited by minimum cell size in dimension");
5333         for (d = 0; d < dd->ndim; d++)
5334         {
5335             if (flags & (1<<d))
5336             {
5337                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5338             }
5339         }
5340         fprintf(fplog, "\n");
5341     }
5342     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5343     if (dlbIsOn(dd->comm))
5344     {
5345         fprintf(fplog, "  vol min/aver %5.3f%c",
5346                 dd_vol_min(dd), flags ? '!' : ' ');
5347     }
5348     if (dd->nnodes > 1)
5349     {
5350         fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5351     }
5352     if (dd->comm->cycl_n[ddCyclPME])
5353     {
5354         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5355     }
5356     fprintf(fplog, "\n\n");
5357 }
5358
5359 static void dd_print_load_verbose(gmx_domdec_t *dd)
5360 {
5361     if (dlbIsOn(dd->comm))
5362     {
5363         fprintf(stderr, "vol %4.2f%c ",
5364                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5365     }
5366     if (dd->nnodes > 1)
5367     {
5368         fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5369     }
5370     if (dd->comm->cycl_n[ddCyclPME])
5371     {
5372         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5373     }
5374 }
5375
5376 #if GMX_MPI
5377 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5378 {
5379     MPI_Comm           c_row;
5380     int                dim, i, rank;
5381     ivec               loc_c;
5382     domdec_root_t     *root;
5383     gmx_bool           bPartOfGroup = FALSE;
5384
5385     dim = dd->dim[dim_ind];
5386     copy_ivec(loc, loc_c);
5387     for (i = 0; i < dd->nc[dim]; i++)
5388     {
5389         loc_c[dim] = i;
5390         rank       = dd_index(dd->nc, loc_c);
5391         if (rank == dd->rank)
5392         {
5393             /* This process is part of the group */
5394             bPartOfGroup = TRUE;
5395         }
5396     }
5397     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5398                    &c_row);
5399     if (bPartOfGroup)
5400     {
5401         dd->comm->mpi_comm_load[dim_ind] = c_row;
5402         if (dd->comm->dlbState != edlbsOffForever)
5403         {
5404             if (dd->ci[dim] == dd->master_ci[dim])
5405             {
5406                 /* This is the root process of this row */
5407                 snew(dd->comm->root[dim_ind], 1);
5408                 root = dd->comm->root[dim_ind];
5409                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5410                 snew(root->old_cell_f, dd->nc[dim]+1);
5411                 snew(root->bCellMin, dd->nc[dim]);
5412                 if (dim_ind > 0)
5413                 {
5414                     snew(root->cell_f_max0, dd->nc[dim]);
5415                     snew(root->cell_f_min1, dd->nc[dim]);
5416                     snew(root->bound_min, dd->nc[dim]);
5417                     snew(root->bound_max, dd->nc[dim]);
5418                 }
5419                 snew(root->buf_ncd, dd->nc[dim]);
5420             }
5421             else
5422             {
5423                 /* This is not a root process, we only need to receive cell_f */
5424                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5425             }
5426         }
5427         if (dd->ci[dim] == dd->master_ci[dim])
5428         {
5429             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5430         }
5431     }
5432 }
5433 #endif
5434
5435 void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
5436                                    const gmx_hw_info_t gmx_unused *hwinfo,
5437                                    const gmx_hw_opt_t  gmx_unused *hw_opt)
5438 {
5439 #if GMX_MPI
5440     int           physicalnode_id_hash;
5441     int           gpu_id;
5442     gmx_domdec_t *dd;
5443     MPI_Comm      mpi_comm_pp_physicalnode;
5444
5445     if (!(cr->duty & DUTY_PP) || hw_opt->gpu_opt.n_dev_use == 0)
5446     {
5447         /* Only PP nodes (currently) use GPUs.
5448          * If we don't have GPUs, there are no resources to share.
5449          */
5450         return;
5451     }
5452
5453     physicalnode_id_hash = gmx_physicalnode_id_hash();
5454
5455     gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5456
5457     dd = cr->dd;
5458
5459     if (debug)
5460     {
5461         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5462         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5463                 dd->rank, physicalnode_id_hash, gpu_id);
5464     }
5465     /* Split the PP communicator over the physical nodes */
5466     /* TODO: See if we should store this (before), as it's also used for
5467      * for the nodecomm summution.
5468      */
5469     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5470                    &mpi_comm_pp_physicalnode);
5471     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5472                    &dd->comm->mpi_comm_gpu_shared);
5473     MPI_Comm_free(&mpi_comm_pp_physicalnode);
5474     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5475
5476     if (debug)
5477     {
5478         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5479     }
5480
5481     /* Note that some ranks could share a GPU, while others don't */
5482
5483     if (dd->comm->nrank_gpu_shared == 1)
5484     {
5485         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5486     }
5487 #endif
5488 }
5489
5490 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5491 {
5492 #if GMX_MPI
5493     int  dim0, dim1, i, j;
5494     ivec loc;
5495
5496     if (debug)
5497     {
5498         fprintf(debug, "Making load communicators\n");
5499     }
5500
5501     snew(dd->comm->load,          std::max(dd->ndim, 1));
5502     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
5503
5504     if (dd->ndim == 0)
5505     {
5506         return;
5507     }
5508
5509     clear_ivec(loc);
5510     make_load_communicator(dd, 0, loc);
5511     if (dd->ndim > 1)
5512     {
5513         dim0 = dd->dim[0];
5514         for (i = 0; i < dd->nc[dim0]; i++)
5515         {
5516             loc[dim0] = i;
5517             make_load_communicator(dd, 1, loc);
5518         }
5519     }
5520     if (dd->ndim > 2)
5521     {
5522         dim0 = dd->dim[0];
5523         for (i = 0; i < dd->nc[dim0]; i++)
5524         {
5525             loc[dim0] = i;
5526             dim1      = dd->dim[1];
5527             for (j = 0; j < dd->nc[dim1]; j++)
5528             {
5529                 loc[dim1] = j;
5530                 make_load_communicator(dd, 2, loc);
5531             }
5532         }
5533     }
5534
5535     if (debug)
5536     {
5537         fprintf(debug, "Finished making load communicators\n");
5538     }
5539 #endif
5540 }
5541
5542 /*! \brief Sets up the relation between neighboring domains and zones */
5543 static void setup_neighbor_relations(gmx_domdec_t *dd)
5544 {
5545     int                     d, dim, i, j, m;
5546     ivec                    tmp, s;
5547     gmx_domdec_zones_t     *zones;
5548     gmx_domdec_ns_ranges_t *izone;
5549
5550     for (d = 0; d < dd->ndim; d++)
5551     {
5552         dim = dd->dim[d];
5553         copy_ivec(dd->ci, tmp);
5554         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5555         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5556         copy_ivec(dd->ci, tmp);
5557         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5558         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5559         if (debug)
5560         {
5561             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5562                     dd->rank, dim,
5563                     dd->neighbor[d][0],
5564                     dd->neighbor[d][1]);
5565         }
5566     }
5567
5568     int nzone  = (1 << dd->ndim);
5569     int nizone = (1 << std::max(dd->ndim - 1, 0));
5570     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
5571
5572     zones = &dd->comm->zones;
5573
5574     for (i = 0; i < nzone; i++)
5575     {
5576         m = 0;
5577         clear_ivec(zones->shift[i]);
5578         for (d = 0; d < dd->ndim; d++)
5579         {
5580             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5581         }
5582     }
5583
5584     zones->n = nzone;
5585     for (i = 0; i < nzone; i++)
5586     {
5587         for (d = 0; d < DIM; d++)
5588         {
5589             s[d] = dd->ci[d] - zones->shift[i][d];
5590             if (s[d] < 0)
5591             {
5592                 s[d] += dd->nc[d];
5593             }
5594             else if (s[d] >= dd->nc[d])
5595             {
5596                 s[d] -= dd->nc[d];
5597             }
5598         }
5599     }
5600     zones->nizone = nizone;
5601     for (i = 0; i < zones->nizone; i++)
5602     {
5603         assert(ddNonbondedZonePairRanges[i][0] == i);
5604
5605         izone     = &zones->izone[i];
5606         /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
5607          * j-zones up to nzone.
5608          */
5609         izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
5610         izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
5611         for (dim = 0; dim < DIM; dim++)
5612         {
5613             if (dd->nc[dim] == 1)
5614             {
5615                 /* All shifts should be allowed */
5616                 izone->shift0[dim] = -1;
5617                 izone->shift1[dim] = 1;
5618             }
5619             else
5620             {
5621                 /* Determine the min/max j-zone shift wrt the i-zone */
5622                 izone->shift0[dim] = 1;
5623                 izone->shift1[dim] = -1;
5624                 for (j = izone->j0; j < izone->j1; j++)
5625                 {
5626                     int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5627                     if (shift_diff < izone->shift0[dim])
5628                     {
5629                         izone->shift0[dim] = shift_diff;
5630                     }
5631                     if (shift_diff > izone->shift1[dim])
5632                     {
5633                         izone->shift1[dim] = shift_diff;
5634                     }
5635                 }
5636             }
5637         }
5638     }
5639
5640     if (dd->comm->dlbState != edlbsOffForever)
5641     {
5642         snew(dd->comm->root, dd->ndim);
5643     }
5644
5645     if (dd->comm->bRecordLoad)
5646     {
5647         make_load_communicators(dd);
5648     }
5649 }
5650
5651 static void make_pp_communicator(FILE                 *fplog,
5652                                  gmx_domdec_t         *dd,
5653                                  t_commrec gmx_unused *cr,
5654                                  int gmx_unused        reorder)
5655 {
5656 #if GMX_MPI
5657     gmx_domdec_comm_t *comm;
5658     int                rank, *buf;
5659     ivec               periods;
5660     MPI_Comm           comm_cart;
5661
5662     comm = dd->comm;
5663
5664     if (comm->bCartesianPP)
5665     {
5666         /* Set up cartesian communication for the particle-particle part */
5667         if (fplog)
5668         {
5669             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5670                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5671         }
5672
5673         for (int i = 0; i < DIM; i++)
5674         {
5675             periods[i] = TRUE;
5676         }
5677         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5678                         &comm_cart);
5679         /* We overwrite the old communicator with the new cartesian one */
5680         cr->mpi_comm_mygroup = comm_cart;
5681     }
5682
5683     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5684     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5685
5686     if (comm->bCartesianPP_PME)
5687     {
5688         /* Since we want to use the original cartesian setup for sim,
5689          * and not the one after split, we need to make an index.
5690          */
5691         snew(comm->ddindex2ddnodeid, dd->nnodes);
5692         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5693         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5694         /* Get the rank of the DD master,
5695          * above we made sure that the master node is a PP node.
5696          */
5697         if (MASTER(cr))
5698         {
5699             rank = dd->rank;
5700         }
5701         else
5702         {
5703             rank = 0;
5704         }
5705         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5706     }
5707     else if (comm->bCartesianPP)
5708     {
5709         if (cr->npmenodes == 0)
5710         {
5711             /* The PP communicator is also
5712              * the communicator for this simulation
5713              */
5714             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5715         }
5716         cr->nodeid = dd->rank;
5717
5718         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5719
5720         /* We need to make an index to go from the coordinates
5721          * to the nodeid of this simulation.
5722          */
5723         snew(comm->ddindex2simnodeid, dd->nnodes);
5724         snew(buf, dd->nnodes);
5725         if (cr->duty & DUTY_PP)
5726         {
5727             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5728         }
5729         /* Communicate the ddindex to simulation nodeid index */
5730         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5731                       cr->mpi_comm_mysim);
5732         sfree(buf);
5733
5734         /* Determine the master coordinates and rank.
5735          * The DD master should be the same node as the master of this sim.
5736          */
5737         for (int i = 0; i < dd->nnodes; i++)
5738         {
5739             if (comm->ddindex2simnodeid[i] == 0)
5740             {
5741                 ddindex2xyz(dd->nc, i, dd->master_ci);
5742                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5743             }
5744         }
5745         if (debug)
5746         {
5747             fprintf(debug, "The master rank is %d\n", dd->masterrank);
5748         }
5749     }
5750     else
5751     {
5752         /* No Cartesian communicators */
5753         /* We use the rank in dd->comm->all as DD index */
5754         ddindex2xyz(dd->nc, dd->rank, dd->ci);
5755         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5756         dd->masterrank = 0;
5757         clear_ivec(dd->master_ci);
5758     }
5759 #endif
5760
5761     if (fplog)
5762     {
5763         fprintf(fplog,
5764                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5765                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5766     }
5767     if (debug)
5768     {
5769         fprintf(debug,
5770                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5771                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5772     }
5773 }
5774
5775 static void receive_ddindex2simnodeid(gmx_domdec_t         *dd,
5776                                       t_commrec            *cr)
5777 {
5778 #if GMX_MPI
5779     gmx_domdec_comm_t *comm = dd->comm;
5780
5781     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5782     {
5783         int *buf;
5784         snew(comm->ddindex2simnodeid, dd->nnodes);
5785         snew(buf, dd->nnodes);
5786         if (cr->duty & DUTY_PP)
5787         {
5788             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5789         }
5790         /* Communicate the ddindex to simulation nodeid index */
5791         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5792                       cr->mpi_comm_mysim);
5793         sfree(buf);
5794     }
5795 #else
5796     GMX_UNUSED_VALUE(dd);
5797     GMX_UNUSED_VALUE(cr);
5798 #endif
5799 }
5800
5801 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5802                                                      int ncg, int natoms)
5803 {
5804     gmx_domdec_master_t *ma;
5805     int                  i;
5806
5807     snew(ma, 1);
5808
5809     snew(ma->ncg, dd->nnodes);
5810     snew(ma->index, dd->nnodes+1);
5811     snew(ma->cg, ncg);
5812     snew(ma->nat, dd->nnodes);
5813     snew(ma->ibuf, dd->nnodes*2);
5814     snew(ma->cell_x, DIM);
5815     for (i = 0; i < DIM; i++)
5816     {
5817         snew(ma->cell_x[i], dd->nc[i]+1);
5818     }
5819
5820     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5821     {
5822         ma->vbuf = NULL;
5823     }
5824     else
5825     {
5826         snew(ma->vbuf, natoms);
5827     }
5828
5829     return ma;
5830 }
5831
5832 static void split_communicator(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
5833                                int gmx_unused dd_rank_order,
5834                                int gmx_unused reorder)
5835 {
5836     gmx_domdec_comm_t *comm;
5837     int                i;
5838     gmx_bool           bDiv[DIM];
5839 #if GMX_MPI
5840     MPI_Comm           comm_cart;
5841 #endif
5842
5843     comm = dd->comm;
5844
5845     if (comm->bCartesianPP)
5846     {
5847         for (i = 1; i < DIM; i++)
5848         {
5849             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5850         }
5851         if (bDiv[YY] || bDiv[ZZ])
5852         {
5853             comm->bCartesianPP_PME = TRUE;
5854             /* If we have 2D PME decomposition, which is always in x+y,
5855              * we stack the PME only nodes in z.
5856              * Otherwise we choose the direction that provides the thinnest slab
5857              * of PME only nodes as this will have the least effect
5858              * on the PP communication.
5859              * But for the PME communication the opposite might be better.
5860              */
5861             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5862                              !bDiv[YY] ||
5863                              dd->nc[YY] > dd->nc[ZZ]))
5864             {
5865                 comm->cartpmedim = ZZ;
5866             }
5867             else
5868             {
5869                 comm->cartpmedim = YY;
5870             }
5871             comm->ntot[comm->cartpmedim]
5872                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5873         }
5874         else if (fplog)
5875         {
5876             fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
5877             fprintf(fplog,
5878                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5879         }
5880     }
5881
5882 #if GMX_MPI
5883     if (comm->bCartesianPP_PME)
5884     {
5885         int  rank;
5886         ivec periods;
5887
5888         if (fplog)
5889         {
5890             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
5891         }
5892
5893         for (i = 0; i < DIM; i++)
5894         {
5895             periods[i] = TRUE;
5896         }
5897         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
5898                         &comm_cart);
5899         MPI_Comm_rank(comm_cart, &rank);
5900         if (MASTER(cr) && rank != 0)
5901         {
5902             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5903         }
5904
5905         /* With this assigment we loose the link to the original communicator
5906          * which will usually be MPI_COMM_WORLD, unless have multisim.
5907          */
5908         cr->mpi_comm_mysim = comm_cart;
5909         cr->sim_nodeid     = rank;
5910
5911         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
5912
5913         if (fplog)
5914         {
5915             fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
5916                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5917         }
5918
5919         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5920         {
5921             cr->duty = DUTY_PP;
5922         }
5923         if (cr->npmenodes == 0 ||
5924             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5925         {
5926             cr->duty = DUTY_PME;
5927         }
5928
5929         /* Split the sim communicator into PP and PME only nodes */
5930         MPI_Comm_split(cr->mpi_comm_mysim,
5931                        cr->duty,
5932                        dd_index(comm->ntot, dd->ci),
5933                        &cr->mpi_comm_mygroup);
5934     }
5935     else
5936     {
5937         switch (dd_rank_order)
5938         {
5939             case ddrankorderPP_PME:
5940                 if (fplog)
5941                 {
5942                     fprintf(fplog, "Order of the ranks: PP first, PME last\n");
5943                 }
5944                 break;
5945             case ddrankorderINTERLEAVE:
5946                 /* Interleave the PP-only and PME-only ranks */
5947                 if (fplog)
5948                 {
5949                     fprintf(fplog, "Interleaving PP and PME ranks\n");
5950                 }
5951                 comm->pmenodes = dd_interleaved_pme_ranks(dd);
5952                 break;
5953             case ddrankorderCARTESIAN:
5954                 break;
5955             default:
5956                 gmx_fatal(FARGS, "Unknown dd_rank_order=%d", dd_rank_order);
5957         }
5958
5959         if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
5960         {
5961             cr->duty = DUTY_PME;
5962         }
5963         else
5964         {
5965             cr->duty = DUTY_PP;
5966         }
5967
5968         /* Split the sim communicator into PP and PME only nodes */
5969         MPI_Comm_split(cr->mpi_comm_mysim,
5970                        cr->duty,
5971                        cr->nodeid,
5972                        &cr->mpi_comm_mygroup);
5973         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
5974     }
5975 #endif
5976
5977     if (fplog)
5978     {
5979         fprintf(fplog, "This rank does only %s work.\n\n",
5980                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5981     }
5982 }
5983
5984 /*! \brief Generates the MPI communicators for domain decomposition */
5985 static void make_dd_communicators(FILE *fplog, t_commrec *cr,
5986                                   gmx_domdec_t *dd, int dd_rank_order)
5987 {
5988     gmx_domdec_comm_t *comm;
5989     int                CartReorder;
5990
5991     comm = dd->comm;
5992
5993     copy_ivec(dd->nc, comm->ntot);
5994
5995     comm->bCartesianPP     = (dd_rank_order == ddrankorderCARTESIAN);
5996     comm->bCartesianPP_PME = FALSE;
5997
5998     /* Reorder the nodes by default. This might change the MPI ranks.
5999      * Real reordering is only supported on very few architectures,
6000      * Blue Gene is one of them.
6001      */
6002     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6003
6004     if (cr->npmenodes > 0)
6005     {
6006         /* Split the communicator into a PP and PME part */
6007         split_communicator(fplog, cr, dd, dd_rank_order, CartReorder);
6008         if (comm->bCartesianPP_PME)
6009         {
6010             /* We (possibly) reordered the nodes in split_communicator,
6011              * so it is no longer required in make_pp_communicator.
6012              */
6013             CartReorder = FALSE;
6014         }
6015     }
6016     else
6017     {
6018         /* All nodes do PP and PME */
6019 #if GMX_MPI
6020         /* We do not require separate communicators */
6021         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6022 #endif
6023     }
6024
6025     if (cr->duty & DUTY_PP)
6026     {
6027         /* Copy or make a new PP communicator */
6028         make_pp_communicator(fplog, dd, cr, CartReorder);
6029     }
6030     else
6031     {
6032         receive_ddindex2simnodeid(dd, cr);
6033     }
6034
6035     if (!(cr->duty & DUTY_PME))
6036     {
6037         /* Set up the commnuication to our PME node */
6038         dd->pme_nodeid           = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
6039         dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
6040         if (debug)
6041         {
6042             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6043                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6044         }
6045     }
6046     else
6047     {
6048         dd->pme_nodeid = -1;
6049     }
6050
6051     if (DDMASTER(dd))
6052     {
6053         dd->ma = init_gmx_domdec_master_t(dd,
6054                                           comm->cgs_gl.nr,
6055                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6056     }
6057 }
6058
6059 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6060 {
6061     real  *slb_frac, tot;
6062     int    i, n;
6063     double dbl;
6064
6065     slb_frac = NULL;
6066     if (nc > 1 && size_string != NULL)
6067     {
6068         if (fplog)
6069         {
6070             fprintf(fplog, "Using static load balancing for the %s direction\n",
6071                     dir);
6072         }
6073         snew(slb_frac, nc);
6074         tot = 0;
6075         for (i = 0; i < nc; i++)
6076         {
6077             dbl = 0;
6078             sscanf(size_string, "%20lf%n", &dbl, &n);
6079             if (dbl == 0)
6080             {
6081                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6082             }
6083             slb_frac[i]  = dbl;
6084             size_string += n;
6085             tot         += slb_frac[i];
6086         }
6087         /* Normalize */
6088         if (fplog)
6089         {
6090             fprintf(fplog, "Relative cell sizes:");
6091         }
6092         for (i = 0; i < nc; i++)
6093         {
6094             slb_frac[i] /= tot;
6095             if (fplog)
6096             {
6097                 fprintf(fplog, " %5.3f", slb_frac[i]);
6098             }
6099         }
6100         if (fplog)
6101         {
6102             fprintf(fplog, "\n");
6103         }
6104     }
6105
6106     return slb_frac;
6107 }
6108
6109 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
6110 {
6111     int                  n, nmol, ftype;
6112     gmx_mtop_ilistloop_t iloop;
6113     t_ilist             *il;
6114
6115     n     = 0;
6116     iloop = gmx_mtop_ilistloop_init(mtop);
6117     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6118     {
6119         for (ftype = 0; ftype < F_NRE; ftype++)
6120         {
6121             if ((interaction_function[ftype].flags & IF_BOND) &&
6122                 NRAL(ftype) >  2)
6123             {
6124                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6125             }
6126         }
6127     }
6128
6129     return n;
6130 }
6131
6132 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6133 {
6134     char *val;
6135     int   nst;
6136
6137     nst = def;
6138     val = getenv(env_var);
6139     if (val)
6140     {
6141         if (sscanf(val, "%20d", &nst) <= 0)
6142         {
6143             nst = 1;
6144         }
6145         if (fplog)
6146         {
6147             fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6148                     env_var, val, nst);
6149         }
6150     }
6151
6152     return nst;
6153 }
6154
6155 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6156 {
6157     if (MASTER(cr))
6158     {
6159         fprintf(stderr, "\n%s\n", warn_string);
6160     }
6161     if (fplog)
6162     {
6163         fprintf(fplog, "\n%s\n", warn_string);
6164     }
6165 }
6166
6167 static void check_dd_restrictions(t_commrec *cr, const gmx_domdec_t *dd,
6168                                   const t_inputrec *ir, FILE *fplog)
6169 {
6170     if (ir->ePBC == epbcSCREW &&
6171         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6172     {
6173         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6174     }
6175
6176     if (ir->ns_type == ensSIMPLE)
6177     {
6178         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6179     }
6180
6181     if (ir->nstlist == 0)
6182     {
6183         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6184     }
6185
6186     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6187     {
6188         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6189     }
6190 }
6191
6192 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6193 {
6194     int  di, d;
6195     real r;
6196
6197     r = ddbox->box_size[XX];
6198     for (di = 0; di < dd->ndim; di++)
6199     {
6200         d = dd->dim[di];
6201         /* Check using the initial average cell size */
6202         r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6203     }
6204
6205     return r;
6206 }
6207
6208 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6209                              const char *dlb_opt, gmx_bool bRecordLoad,
6210                              unsigned long Flags, const t_inputrec *ir)
6211 {
6212     int           dlbState = -1;
6213     char          buf[STRLEN];
6214
6215     switch (dlb_opt[0])
6216     {
6217         case 'a': dlbState = edlbsOffCanTurnOn; break;
6218         case 'n': dlbState = edlbsOffForever;   break;
6219         case 'y': dlbState = edlbsOnForever;    break;
6220         default: gmx_incons("Unknown dlb_opt");
6221     }
6222
6223     if (Flags & MD_RERUN)
6224     {
6225         return edlbsOffForever;
6226     }
6227
6228     if (!EI_DYNAMICS(ir->eI))
6229     {
6230         if (dlbState == edlbsOnForever)
6231         {
6232             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6233             dd_warning(cr, fplog, buf);
6234         }
6235
6236         return edlbsOffForever;
6237     }
6238
6239     if (!bRecordLoad)
6240     {
6241         dd_warning(cr, fplog, "NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use dynamic load balancing.\n");
6242         return edlbsOffForever;
6243     }
6244
6245     if (Flags & MD_REPRODUCIBLE)
6246     {
6247         switch (dlbState)
6248         {
6249             case edlbsOffForever:
6250                 break;
6251             case edlbsOffCanTurnOn:
6252             case edlbsOnCanTurnOff:
6253                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6254                 dlbState = edlbsOffForever;
6255                 break;
6256             case edlbsOnForever:
6257                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6258                 break;
6259             default:
6260                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
6261                 break;
6262         }
6263     }
6264
6265     return dlbState;
6266 }
6267
6268 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6269 {
6270     int dim;
6271
6272     dd->ndim = 0;
6273     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6274     {
6275         /* Decomposition order z,y,x */
6276         if (fplog)
6277         {
6278             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6279         }
6280         for (dim = DIM-1; dim >= 0; dim--)
6281         {
6282             if (dd->nc[dim] > 1)
6283             {
6284                 dd->dim[dd->ndim++] = dim;
6285             }
6286         }
6287     }
6288     else
6289     {
6290         /* Decomposition order x,y,z */
6291         for (dim = 0; dim < DIM; dim++)
6292         {
6293             if (dd->nc[dim] > 1)
6294             {
6295                 dd->dim[dd->ndim++] = dim;
6296             }
6297         }
6298     }
6299 }
6300
6301 static gmx_domdec_comm_t *init_dd_comm()
6302 {
6303     gmx_domdec_comm_t *comm;
6304     int                i;
6305
6306     snew(comm, 1);
6307     snew(comm->cggl_flag, DIM*2);
6308     snew(comm->cgcm_state, DIM*2);
6309     for (i = 0; i < DIM*2; i++)
6310     {
6311         comm->cggl_flag_nalloc[i]  = 0;
6312         comm->cgcm_state_nalloc[i] = 0;
6313     }
6314
6315     comm->nalloc_int = 0;
6316     comm->buf_int    = NULL;
6317
6318     vec_rvec_init(&comm->vbuf);
6319
6320     comm->n_load_have    = 0;
6321     comm->n_load_collect = 0;
6322
6323     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6324     {
6325         comm->sum_nat[i] = 0;
6326     }
6327     comm->ndecomp   = 0;
6328     comm->nload     = 0;
6329     comm->load_step = 0;
6330     comm->load_sum  = 0;
6331     comm->load_max  = 0;
6332     clear_ivec(comm->load_lim);
6333     comm->load_mdf  = 0;
6334     comm->load_pme  = 0;
6335
6336     return comm;
6337 }
6338
6339 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
6340 static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
6341                                    unsigned long Flags,
6342                                    ivec nc, int nPmeRanks,
6343                                    real comm_distance_min, real rconstr,
6344                                    const char *dlb_opt, real dlb_scale,
6345                                    const char *sizex, const char *sizey, const char *sizez,
6346                                    const gmx_mtop_t *mtop,
6347                                    const t_inputrec *ir,
6348                                    matrix box, const rvec *x,
6349                                    gmx_ddbox_t *ddbox,
6350                                    int *npme_x, int *npme_y)
6351 {
6352     real               r_bonded         = -1;
6353     real               r_bonded_limit   = -1;
6354     const real         tenPercentMargin = 1.1;
6355     gmx_domdec_comm_t *comm             = dd->comm;
6356
6357     snew(comm->cggl_flag, DIM*2);
6358     snew(comm->cgcm_state, DIM*2);
6359
6360     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6361     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6362
6363     dd->pme_recv_f_alloc = 0;
6364     dd->pme_recv_f_buf   = NULL;
6365
6366     /* Initialize to GPU share count to 0, might change later */
6367     comm->nrank_gpu_shared = 0;
6368
6369     comm->dlbState         = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6370     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
6371     /* To consider turning DLB on after 2*nstlist steps we need to check
6372      * at partitioning count 3. Thus we need to increase the first count by 2.
6373      */
6374     comm->ddPartioningCountFirstDlbOff += 2;
6375
6376     if (fplog)
6377     {
6378         fprintf(fplog, "Dynamic load balancing: %s\n",
6379                 edlbs_names[comm->dlbState]);
6380     }
6381     comm->bPMELoadBalDLBLimits = FALSE;
6382
6383     /* Allocate the charge group/atom sorting struct */
6384     snew(comm->sort, 1);
6385
6386     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6387
6388     comm->bInterCGBondeds = ((ncg_mtop(mtop) > mtop->mols.nr) ||
6389                              mtop->bIntermolecularInteractions);
6390     if (comm->bInterCGBondeds)
6391     {
6392         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6393     }
6394     else
6395     {
6396         comm->bInterCGMultiBody = FALSE;
6397     }
6398
6399     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6400     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6401
6402     if (ir->rlist == 0)
6403     {
6404         /* Set the cut-off to some very large value,
6405          * so we don't need if statements everywhere in the code.
6406          * We use sqrt, since the cut-off is squared in some places.
6407          */
6408         comm->cutoff   = GMX_CUTOFF_INF;
6409     }
6410     else
6411     {
6412         comm->cutoff   = ir->rlist;
6413     }
6414     comm->cutoff_mbody = 0;
6415
6416     comm->cellsize_limit = 0;
6417     comm->bBondComm      = FALSE;
6418
6419     /* Atoms should be able to move by up to half the list buffer size (if > 0)
6420      * within nstlist steps. Since boundaries are allowed to displace by half
6421      * a cell size, DD cells should be at least the size of the list buffer.
6422      */
6423     comm->cellsize_limit = std::max(comm->cellsize_limit,
6424                                     ir->rlist - std::max(ir->rvdw, ir->rcoulomb));
6425
6426     if (comm->bInterCGBondeds)
6427     {
6428         if (comm_distance_min > 0)
6429         {
6430             comm->cutoff_mbody = comm_distance_min;
6431             if (Flags & MD_DDBONDCOMM)
6432             {
6433                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6434             }
6435             else
6436             {
6437                 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6438             }
6439             r_bonded_limit = comm->cutoff_mbody;
6440         }
6441         else if (ir->bPeriodicMols)
6442         {
6443             /* Can not easily determine the required cut-off */
6444             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6445             comm->cutoff_mbody = comm->cutoff/2;
6446             r_bonded_limit     = comm->cutoff_mbody;
6447         }
6448         else
6449         {
6450             real r_2b, r_mb;
6451
6452             if (MASTER(cr))
6453             {
6454                 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6455                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6456             }
6457             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6458             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6459
6460             /* We use an initial margin of 10% for the minimum cell size,
6461              * except when we are just below the non-bonded cut-off.
6462              */
6463             if (Flags & MD_DDBONDCOMM)
6464             {
6465                 if (std::max(r_2b, r_mb) > comm->cutoff)
6466                 {
6467                     r_bonded        = std::max(r_2b, r_mb);
6468                     r_bonded_limit  = tenPercentMargin*r_bonded;
6469                     comm->bBondComm = TRUE;
6470                 }
6471                 else
6472                 {
6473                     r_bonded       = r_mb;
6474                     r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6475                 }
6476                 /* We determine cutoff_mbody later */
6477             }
6478             else
6479             {
6480                 /* No special bonded communication,
6481                  * simply increase the DD cut-off.
6482                  */
6483                 r_bonded_limit     = tenPercentMargin*std::max(r_2b, r_mb);
6484                 comm->cutoff_mbody = r_bonded_limit;
6485                 comm->cutoff       = std::max(comm->cutoff, comm->cutoff_mbody);
6486             }
6487         }
6488         if (fplog)
6489         {
6490             fprintf(fplog,
6491                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6492                     r_bonded_limit);
6493         }
6494         comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6495     }
6496
6497     if (dd->bInterCGcons && rconstr <= 0)
6498     {
6499         /* There is a cell size limit due to the constraints (P-LINCS) */
6500         rconstr = constr_r_max(fplog, mtop, ir);
6501         if (fplog)
6502         {
6503             fprintf(fplog,
6504                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6505                     rconstr);
6506             if (rconstr > comm->cellsize_limit)
6507             {
6508                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6509             }
6510         }
6511     }
6512     else if (rconstr > 0 && fplog)
6513     {
6514         /* Here we do not check for dd->bInterCGcons,
6515          * because one can also set a cell size limit for virtual sites only
6516          * and at this point we don't know yet if there are intercg v-sites.
6517          */
6518         fprintf(fplog,
6519                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6520                 rconstr);
6521     }
6522     comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6523
6524     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6525
6526     if (nc[XX] > 0)
6527     {
6528         copy_ivec(nc, dd->nc);
6529         set_dd_dim(fplog, dd);
6530         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6531
6532         if (nPmeRanks >= 0)
6533         {
6534             cr->npmenodes = nPmeRanks;
6535         }
6536         else
6537         {
6538             /* When the DD grid is set explicitly and -npme is set to auto,
6539              * don't use PME ranks. We check later if the DD grid is
6540              * compatible with the total number of ranks.
6541              */
6542             cr->npmenodes = 0;
6543         }
6544
6545         real acs = average_cellsize_min(dd, ddbox);
6546         if (acs < comm->cellsize_limit)
6547         {
6548             if (fplog)
6549             {
6550                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6551             }
6552             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6553                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6554                                  acs, comm->cellsize_limit);
6555         }
6556     }
6557     else
6558     {
6559         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6560
6561         /* We need to choose the optimal DD grid and possibly PME nodes */
6562         real limit =
6563             dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6564                            nPmeRanks,
6565                            comm->dlbState != edlbsOffForever, dlb_scale,
6566                            comm->cellsize_limit, comm->cutoff,
6567                            comm->bInterCGBondeds);
6568
6569         if (dd->nc[XX] == 0)
6570         {
6571             char     buf[STRLEN];
6572             gmx_bool bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6573             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6574                     !bC ? "-rdd" : "-rcon",
6575                     comm->dlbState != edlbsOffForever ? " or -dds" : "",
6576                     bC ? " or your LINCS settings" : "");
6577
6578             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6579                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6580                                  "%s\n"
6581                                  "Look in the log file for details on the domain decomposition",
6582                                  cr->nnodes-cr->npmenodes, limit, buf);
6583         }
6584         set_dd_dim(fplog, dd);
6585     }
6586
6587     if (fplog)
6588     {
6589         fprintf(fplog,
6590                 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6591                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6592     }
6593
6594     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6595     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6596     {
6597         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6598                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6599                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6600     }
6601     if (cr->npmenodes > dd->nnodes)
6602     {
6603         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6604                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6605     }
6606     if (cr->npmenodes > 0)
6607     {
6608         comm->npmenodes = cr->npmenodes;
6609     }
6610     else
6611     {
6612         comm->npmenodes = dd->nnodes;
6613     }
6614
6615     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6616     {
6617         /* The following choices should match those
6618          * in comm_cost_est in domdec_setup.c.
6619          * Note that here the checks have to take into account
6620          * that the decomposition might occur in a different order than xyz
6621          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6622          * in which case they will not match those in comm_cost_est,
6623          * but since that is mainly for testing purposes that's fine.
6624          */
6625         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6626             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6627             getenv("GMX_PMEONEDD") == NULL)
6628         {
6629             comm->npmedecompdim = 2;
6630             comm->npmenodes_x   = dd->nc[XX];
6631             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6632         }
6633         else
6634         {
6635             /* In case nc is 1 in both x and y we could still choose to
6636              * decompose pme in y instead of x, but we use x for simplicity.
6637              */
6638             comm->npmedecompdim = 1;
6639             if (dd->dim[0] == YY)
6640             {
6641                 comm->npmenodes_x = 1;
6642                 comm->npmenodes_y = comm->npmenodes;
6643             }
6644             else
6645             {
6646                 comm->npmenodes_x = comm->npmenodes;
6647                 comm->npmenodes_y = 1;
6648             }
6649         }
6650         if (fplog)
6651         {
6652             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6653                     comm->npmenodes_x, comm->npmenodes_y, 1);
6654         }
6655     }
6656     else
6657     {
6658         comm->npmedecompdim = 0;
6659         comm->npmenodes_x   = 0;
6660         comm->npmenodes_y   = 0;
6661     }
6662
6663     /* Technically we don't need both of these,
6664      * but it simplifies code not having to recalculate it.
6665      */
6666     *npme_x = comm->npmenodes_x;
6667     *npme_y = comm->npmenodes_y;
6668
6669     snew(comm->slb_frac, DIM);
6670     if (comm->dlbState == edlbsOffForever)
6671     {
6672         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6673         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6674         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6675     }
6676
6677     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6678     {
6679         if (comm->bBondComm || comm->dlbState != edlbsOffForever)
6680         {
6681             /* Set the bonded communication distance to halfway
6682              * the minimum and the maximum,
6683              * since the extra communication cost is nearly zero.
6684              */
6685             real acs           = average_cellsize_min(dd, ddbox);
6686             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6687             if (comm->dlbState != edlbsOffForever)
6688             {
6689                 /* Check if this does not limit the scaling */
6690                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
6691             }
6692             if (!comm->bBondComm)
6693             {
6694                 /* Without bBondComm do not go beyond the n.b. cut-off */
6695                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
6696                 if (comm->cellsize_limit >= comm->cutoff)
6697                 {
6698                     /* We don't loose a lot of efficieny
6699                      * when increasing it to the n.b. cut-off.
6700                      * It can even be slightly faster, because we need
6701                      * less checks for the communication setup.
6702                      */
6703                     comm->cutoff_mbody = comm->cutoff;
6704                 }
6705             }
6706             /* Check if we did not end up below our original limit */
6707             comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
6708
6709             if (comm->cutoff_mbody > comm->cellsize_limit)
6710             {
6711                 comm->cellsize_limit = comm->cutoff_mbody;
6712             }
6713         }
6714         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6715     }
6716
6717     if (debug)
6718     {
6719         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6720                 "cellsize limit %f\n",
6721                 comm->bBondComm, comm->cellsize_limit);
6722     }
6723
6724     if (MASTER(cr))
6725     {
6726         check_dd_restrictions(cr, dd, ir, fplog);
6727     }
6728 }
6729
6730 static void set_dlb_limits(gmx_domdec_t *dd)
6731
6732 {
6733     int d;
6734
6735     for (d = 0; d < dd->ndim; d++)
6736     {
6737         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
6738         dd->comm->cellsize_min[dd->dim[d]] =
6739             dd->comm->cellsize_min_dlb[dd->dim[d]];
6740     }
6741 }
6742
6743
6744 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6745 {
6746     gmx_domdec_t      *dd;
6747     gmx_domdec_comm_t *comm;
6748     real               cellsize_min;
6749     int                d, nc, i;
6750
6751     dd   = cr->dd;
6752     comm = dd->comm;
6753
6754     cellsize_min = comm->cellsize_min[dd->dim[0]];
6755     for (d = 1; d < dd->ndim; d++)
6756     {
6757         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6758     }
6759
6760     if (cellsize_min < comm->cellsize_limit*1.05)
6761     {
6762         char buf[STRLEN];
6763         sprintf(buf, "step %" GMX_PRId64 " Measured %.1f %% performance load due to load imbalance, but the minimum cell size is smaller than 1.05 times the cell size limit. Will no longer try dynamic load balancing.\n", step, dd_force_imb_perf_loss(dd)*100);
6764
6765         /* Change DLB from "auto" to "no". */
6766         comm->dlbState = edlbsOffForever;
6767
6768         return;
6769     }
6770
6771     char buf[STRLEN];
6772     sprintf(buf, "step %" GMX_PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
6773     dd_warning(cr, fplog, buf);
6774     comm->dlbState = edlbsOnCanTurnOff;
6775
6776     /* Store the non-DLB performance, so we can check if DLB actually
6777      * improves performance.
6778      */
6779     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
6780     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
6781
6782     set_dlb_limits(dd);
6783
6784     /* We can set the required cell size info here,
6785      * so we do not need to communicate this.
6786      * The grid is completely uniform.
6787      */
6788     for (d = 0; d < dd->ndim; d++)
6789     {
6790         if (comm->root[d])
6791         {
6792             comm->load[d].sum_m = comm->load[d].sum;
6793
6794             nc = dd->nc[dd->dim[d]];
6795             for (i = 0; i < nc; i++)
6796             {
6797                 comm->root[d]->cell_f[i]    = i/(real)nc;
6798                 if (d > 0)
6799                 {
6800                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6801                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6802                 }
6803             }
6804             comm->root[d]->cell_f[nc] = 1.0;
6805         }
6806     }
6807 }
6808
6809 static void turn_off_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6810 {
6811     gmx_domdec_t *dd = cr->dd;
6812
6813     char          buf[STRLEN];
6814     sprintf(buf, "step %" GMX_PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
6815     dd_warning(cr, fplog, buf);
6816     dd->comm->dlbState                     = edlbsOffCanTurnOn;
6817     dd->comm->haveTurnedOffDlb             = true;
6818     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
6819 }
6820
6821 static void turn_off_dlb_forever(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6822 {
6823     GMX_RELEASE_ASSERT(cr->dd->comm->dlbState == edlbsOffCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
6824     char buf[STRLEN];
6825     sprintf(buf, "step %" GMX_PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
6826     dd_warning(cr, fplog, buf);
6827     cr->dd->comm->dlbState = edlbsOffForever;
6828 }
6829
6830 static char *init_bLocalCG(const gmx_mtop_t *mtop)
6831 {
6832     int   ncg, cg;
6833     char *bLocalCG;
6834
6835     ncg = ncg_mtop(mtop);
6836     snew(bLocalCG, ncg);
6837     for (cg = 0; cg < ncg; cg++)
6838     {
6839         bLocalCG[cg] = FALSE;
6840     }
6841
6842     return bLocalCG;
6843 }
6844
6845 void dd_init_bondeds(FILE *fplog,
6846                      gmx_domdec_t *dd,
6847                      const gmx_mtop_t *mtop,
6848                      const gmx_vsite_t *vsite,
6849                      const t_inputrec *ir,
6850                      gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
6851 {
6852     gmx_domdec_comm_t *comm;
6853
6854     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
6855
6856     comm = dd->comm;
6857
6858     if (comm->bBondComm)
6859     {
6860         /* Communicate atoms beyond the cut-off for bonded interactions */
6861         comm = dd->comm;
6862
6863         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
6864
6865         comm->bLocalCG = init_bLocalCG(mtop);
6866     }
6867     else
6868     {
6869         /* Only communicate atoms based on cut-off */
6870         comm->cglink   = NULL;
6871         comm->bLocalCG = NULL;
6872     }
6873 }
6874
6875 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
6876                               const gmx_mtop_t *mtop, const t_inputrec *ir,
6877                               gmx_bool bDynLoadBal, real dlb_scale,
6878                               const gmx_ddbox_t *ddbox)
6879 {
6880     gmx_domdec_comm_t *comm;
6881     int                d;
6882     ivec               np;
6883     real               limit, shrink;
6884     char               buf[64];
6885
6886     if (fplog == NULL)
6887     {
6888         return;
6889     }
6890
6891     comm = dd->comm;
6892
6893     if (bDynLoadBal)
6894     {
6895         fprintf(fplog, "The maximum number of communication pulses is:");
6896         for (d = 0; d < dd->ndim; d++)
6897         {
6898             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
6899         }
6900         fprintf(fplog, "\n");
6901         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
6902         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
6903         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
6904         for (d = 0; d < DIM; d++)
6905         {
6906             if (dd->nc[d] > 1)
6907             {
6908                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6909                 {
6910                     shrink = 0;
6911                 }
6912                 else
6913                 {
6914                     shrink =
6915                         comm->cellsize_min_dlb[d]/
6916                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6917                 }
6918                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
6919             }
6920         }
6921         fprintf(fplog, "\n");
6922     }
6923     else
6924     {
6925         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
6926         fprintf(fplog, "The initial number of communication pulses is:");
6927         for (d = 0; d < dd->ndim; d++)
6928         {
6929             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
6930         }
6931         fprintf(fplog, "\n");
6932         fprintf(fplog, "The initial domain decomposition cell size is:");
6933         for (d = 0; d < DIM; d++)
6934         {
6935             if (dd->nc[d] > 1)
6936             {
6937                 fprintf(fplog, " %c %.2f nm",
6938                         dim2char(d), dd->comm->cellsize_min[d]);
6939             }
6940         }
6941         fprintf(fplog, "\n\n");
6942     }
6943
6944     gmx_bool bInterCGVsites = count_intercg_vsites(mtop);
6945
6946     if (comm->bInterCGBondeds ||
6947         bInterCGVsites ||
6948         dd->bInterCGcons || dd->bInterCGsettles)
6949     {
6950         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
6951         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
6952                 "non-bonded interactions", "", comm->cutoff);
6953
6954         if (bDynLoadBal)
6955         {
6956             limit = dd->comm->cellsize_limit;
6957         }
6958         else
6959         {
6960             if (dynamic_dd_box(ddbox, ir))
6961             {
6962                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
6963             }
6964             limit = dd->comm->cellsize_min[XX];
6965             for (d = 1; d < DIM; d++)
6966             {
6967                 limit = std::min(limit, dd->comm->cellsize_min[d]);
6968             }
6969         }
6970
6971         if (comm->bInterCGBondeds)
6972         {
6973             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
6974                     "two-body bonded interactions", "(-rdd)",
6975                     std::max(comm->cutoff, comm->cutoff_mbody));
6976             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
6977                     "multi-body bonded interactions", "(-rdd)",
6978                     (comm->bBondComm || dlbIsOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
6979         }
6980         if (bInterCGVsites)
6981         {
6982             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
6983                     "virtual site constructions", "(-rcon)", limit);
6984         }
6985         if (dd->bInterCGcons || dd->bInterCGsettles)
6986         {
6987             sprintf(buf, "atoms separated by up to %d constraints",
6988                     1+ir->nProjOrder);
6989             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
6990                     buf, "(-rcon)", limit);
6991         }
6992         fprintf(fplog, "\n");
6993     }
6994
6995     fflush(fplog);
6996 }
6997
6998 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
6999                                 real               dlb_scale,
7000                                 const t_inputrec  *ir,
7001                                 const gmx_ddbox_t *ddbox)
7002 {
7003     gmx_domdec_comm_t *comm;
7004     int                d, dim, npulse, npulse_d_max, npulse_d;
7005     gmx_bool           bNoCutOff;
7006
7007     comm = dd->comm;
7008
7009     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7010
7011     /* Determine the maximum number of comm. pulses in one dimension */
7012
7013     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7014
7015     /* Determine the maximum required number of grid pulses */
7016     if (comm->cellsize_limit >= comm->cutoff)
7017     {
7018         /* Only a single pulse is required */
7019         npulse = 1;
7020     }
7021     else if (!bNoCutOff && comm->cellsize_limit > 0)
7022     {
7023         /* We round down slightly here to avoid overhead due to the latency
7024          * of extra communication calls when the cut-off
7025          * would be only slightly longer than the cell size.
7026          * Later cellsize_limit is redetermined,
7027          * so we can not miss interactions due to this rounding.
7028          */
7029         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7030     }
7031     else
7032     {
7033         /* There is no cell size limit */
7034         npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7035     }
7036
7037     if (!bNoCutOff && npulse > 1)
7038     {
7039         /* See if we can do with less pulses, based on dlb_scale */
7040         npulse_d_max = 0;
7041         for (d = 0; d < dd->ndim; d++)
7042         {
7043             dim      = dd->dim[d];
7044             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7045                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7046             npulse_d_max = std::max(npulse_d_max, npulse_d);
7047         }
7048         npulse = std::min(npulse, npulse_d_max);
7049     }
7050
7051     /* This env var can override npulse */
7052     d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7053     if (d > 0)
7054     {
7055         npulse = d;
7056     }
7057
7058     comm->maxpulse       = 1;
7059     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7060     for (d = 0; d < dd->ndim; d++)
7061     {
7062         comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
7063         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7064         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7065         comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
7066         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7067         {
7068             comm->bVacDLBNoLimit = FALSE;
7069         }
7070     }
7071
7072     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7073     if (!comm->bVacDLBNoLimit)
7074     {
7075         comm->cellsize_limit = std::max(comm->cellsize_limit,
7076                                         comm->cutoff/comm->maxpulse);
7077     }
7078     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7079     /* Set the minimum cell size for each DD dimension */
7080     for (d = 0; d < dd->ndim; d++)
7081     {
7082         if (comm->bVacDLBNoLimit ||
7083             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7084         {
7085             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7086         }
7087         else
7088         {
7089             comm->cellsize_min_dlb[dd->dim[d]] =
7090                 comm->cutoff/comm->cd[d].np_dlb;
7091         }
7092     }
7093     if (comm->cutoff_mbody <= 0)
7094     {
7095         comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
7096     }
7097     if (dlbIsOn(comm))
7098     {
7099         set_dlb_limits(dd);
7100     }
7101 }
7102
7103 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
7104 {
7105     /* If each molecule is a single charge group
7106      * or we use domain decomposition for each periodic dimension,
7107      * we do not need to take pbc into account for the bonded interactions.
7108      */
7109     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7110             !(dd->nc[XX] > 1 &&
7111               dd->nc[YY] > 1 &&
7112               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7113 }
7114
7115 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
7116 static void set_ddgrid_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7117                                   const gmx_mtop_t *mtop, const t_inputrec *ir,
7118                                   const gmx_ddbox_t *ddbox)
7119 {
7120     gmx_domdec_comm_t *comm;
7121     int                natoms_tot;
7122     real               vol_frac;
7123
7124     comm = dd->comm;
7125
7126     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7127     {
7128         init_ddpme(dd, &comm->ddpme[0], 0);
7129         if (comm->npmedecompdim >= 2)
7130         {
7131             init_ddpme(dd, &comm->ddpme[1], 1);
7132         }
7133     }
7134     else
7135     {
7136         comm->npmenodes = 0;
7137         if (dd->pme_nodeid >= 0)
7138         {
7139             gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
7140                                  "Can not have separate PME ranks without PME electrostatics");
7141         }
7142     }
7143
7144     if (debug)
7145     {
7146         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7147     }
7148     if (comm->dlbState != edlbsOffForever)
7149     {
7150         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7151     }
7152
7153     print_dd_settings(fplog, dd, mtop, ir, dlbIsOn(comm), dlb_scale, ddbox);
7154     if (comm->dlbState == edlbsOffCanTurnOn)
7155     {
7156         if (fplog)
7157         {
7158             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7159         }
7160         print_dd_settings(fplog, dd, mtop, ir, TRUE, dlb_scale, ddbox);
7161     }
7162
7163     if (ir->ePBC == epbcNONE)
7164     {
7165         vol_frac = 1 - 1/(double)dd->nnodes;
7166     }
7167     else
7168     {
7169         vol_frac =
7170             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7171     }
7172     if (debug)
7173     {
7174         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7175     }
7176     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7177
7178     dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
7179 }
7180
7181 /*! \brief Set some important DD parameters that can be modified by env.vars */
7182 static void set_dd_envvar_options(FILE *fplog, gmx_domdec_t *dd, int rank_mysim)
7183 {
7184     gmx_domdec_comm_t *comm = dd->comm;
7185
7186     dd->bSendRecv2      = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
7187     comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
7188     comm->eFlop         = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
7189     int recload         = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
7190     comm->nstDDDump     = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
7191     comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
7192     comm->DD_debug      = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
7193
7194     if (dd->bSendRecv2 && fplog)
7195     {
7196         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
7197     }
7198
7199     if (comm->eFlop)
7200     {
7201         if (fplog)
7202         {
7203             fprintf(fplog, "Will load balance based on FLOP count\n");
7204         }
7205         if (comm->eFlop > 1)
7206         {
7207             srand(1 + rank_mysim);
7208         }
7209         comm->bRecordLoad = TRUE;
7210     }
7211     else
7212     {
7213         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
7214     }
7215 }
7216
7217 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
7218                                         unsigned long Flags,
7219                                         ivec nc, int nPmeRanks,
7220                                         int dd_rank_order,
7221                                         real comm_distance_min, real rconstr,
7222                                         const char *dlb_opt, real dlb_scale,
7223                                         const char *sizex, const char *sizey, const char *sizez,
7224                                         const gmx_mtop_t *mtop,
7225                                         const t_inputrec *ir,
7226                                         matrix box, rvec *x,
7227                                         gmx_ddbox_t *ddbox,
7228                                         int *npme_x, int *npme_y)
7229 {
7230     gmx_domdec_t      *dd;
7231
7232     if (fplog)
7233     {
7234         fprintf(fplog,
7235                 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
7236     }
7237
7238     snew(dd, 1);
7239
7240     dd->comm = init_dd_comm();
7241
7242     set_dd_envvar_options(fplog, dd, cr->nodeid);
7243
7244     set_dd_limits_and_grid(fplog, cr, dd, Flags,
7245                            nc, nPmeRanks,
7246                            comm_distance_min, rconstr,
7247                            dlb_opt, dlb_scale,
7248                            sizex, sizey, sizez,
7249                            mtop, ir,
7250                            box, x,
7251                            ddbox,
7252                            npme_x, npme_y);
7253
7254     make_dd_communicators(fplog, cr, dd, dd_rank_order);
7255
7256     if (cr->duty & DUTY_PP)
7257     {
7258         set_ddgrid_parameters(fplog, dd, dlb_scale, mtop, ir, ddbox);
7259
7260         setup_neighbor_relations(dd);
7261     }
7262
7263     /* Set overallocation to avoid frequent reallocation of arrays */
7264     set_over_alloc_dd(TRUE);
7265
7266     /* Initialize DD paritioning counters */
7267     dd->comm->partition_step = INT_MIN;
7268     dd->ddp_count            = 0;
7269
7270     /* We currently don't know the number of threads yet, we set this later */
7271     dd->comm->nth = 0;
7272
7273     clear_dd_cycle_counts(dd);
7274
7275     return dd;
7276 }
7277
7278 static gmx_bool test_dd_cutoff(t_commrec *cr,
7279                                t_state *state, const t_inputrec *ir,
7280                                real cutoff_req)
7281 {
7282     gmx_domdec_t *dd;
7283     gmx_ddbox_t   ddbox;
7284     int           d, dim, np;
7285     real          inv_cell_size;
7286     int           LocallyLimited;
7287
7288     dd = cr->dd;
7289
7290     set_ddbox(dd, FALSE, cr, ir, state->box,
7291               TRUE, &dd->comm->cgs_gl, as_rvec_array(state->x.data()), &ddbox);
7292
7293     LocallyLimited = 0;
7294
7295     for (d = 0; d < dd->ndim; d++)
7296     {
7297         dim = dd->dim[d];
7298
7299         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7300         if (dynamic_dd_box(&ddbox, ir))
7301         {
7302             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7303         }
7304
7305         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7306
7307         if (dd->comm->dlbState != edlbsOffForever && dim < ddbox.npbcdim &&
7308             dd->comm->cd[d].np_dlb > 0)
7309         {
7310             if (np > dd->comm->cd[d].np_dlb)
7311             {
7312                 return FALSE;
7313             }
7314
7315             /* If a current local cell size is smaller than the requested
7316              * cut-off, we could still fix it, but this gets very complicated.
7317              * Without fixing here, we might actually need more checks.
7318              */
7319             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7320             {
7321                 LocallyLimited = 1;
7322             }
7323         }
7324     }
7325
7326     if (dd->comm->dlbState != edlbsOffForever)
7327     {
7328         /* If DLB is not active yet, we don't need to check the grid jumps.
7329          * Actually we shouldn't, because then the grid jump data is not set.
7330          */
7331         if (dlbIsOn(dd->comm) &&
7332             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7333         {
7334             LocallyLimited = 1;
7335         }
7336
7337         gmx_sumi(1, &LocallyLimited, cr);
7338
7339         if (LocallyLimited > 0)
7340         {
7341             return FALSE;
7342         }
7343     }
7344
7345     return TRUE;
7346 }
7347
7348 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, const t_inputrec *ir,
7349                           real cutoff_req)
7350 {
7351     gmx_bool bCutoffAllowed;
7352
7353     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7354
7355     if (bCutoffAllowed)
7356     {
7357         cr->dd->comm->cutoff = cutoff_req;
7358     }
7359
7360     return bCutoffAllowed;
7361 }
7362
7363 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
7364 {
7365     gmx_domdec_comm_t *comm;
7366
7367     comm = cr->dd->comm;
7368
7369     /* Turn on the DLB limiting (might have been on already) */
7370     comm->bPMELoadBalDLBLimits = TRUE;
7371
7372     /* Change the cut-off limit */
7373     comm->PMELoadBal_max_cutoff = cutoff;
7374
7375     if (debug)
7376     {
7377         fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
7378                 comm->PMELoadBal_max_cutoff);
7379     }
7380 }
7381
7382 /* Sets whether we should later check the load imbalance data, so that
7383  * we can trigger dynamic load balancing if enough imbalance has
7384  * arisen.
7385  *
7386  * Used after PME load balancing unlocks DLB, so that the check
7387  * whether DLB will be useful can happen immediately.
7388  */
7389 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
7390 {
7391     if (dd->comm->dlbState == edlbsOffCanTurnOn)
7392     {
7393         dd->comm->bCheckWhetherToTurnDlbOn = bValue;
7394
7395         if (bValue == TRUE)
7396         {
7397             /* Store the DD partitioning count, so we can ignore cycle counts
7398              * over the next nstlist steps, which are often slower.
7399              */
7400             dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
7401         }
7402     }
7403 }
7404
7405 /* Returns if we should check whether there has been enough load
7406  * imbalance to trigger dynamic load balancing.
7407  */
7408 static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
7409 {
7410     if (dd->comm->dlbState != edlbsOffCanTurnOn)
7411     {
7412         return FALSE;
7413     }
7414
7415     if (dd->ddp_count <= dd->comm->ddPartioningCountFirstDlbOff)
7416     {
7417         /* We ignore the first nstlist steps at the start of the run
7418          * or after PME load balancing or after turning DLB off, since
7419          * these often have extra allocation or cache miss overhead.
7420          */
7421         return FALSE;
7422     }
7423
7424     /* We should check whether we should use DLB directly after
7425      * unlocking DLB. */
7426     if (dd->comm->bCheckWhetherToTurnDlbOn)
7427     {
7428         /* This flag was set when the PME load-balancing routines
7429            unlocked DLB, and should now be cleared. */
7430         dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
7431         return TRUE;
7432     }
7433     /* We check whether we should use DLB every c_checkTurnDlbOnInterval
7434      * partitionings (we do not do this every partioning, so that we
7435      * avoid excessive communication). */
7436     if (dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1)
7437     {
7438         return TRUE;
7439     }
7440
7441     return FALSE;
7442 }
7443
7444 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
7445 {
7446     return dlbIsOn(dd->comm);
7447 }
7448
7449 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7450 {
7451     return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
7452 }
7453
7454 void dd_dlb_lock(gmx_domdec_t *dd)
7455 {
7456     /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7457     if (dd->comm->dlbState == edlbsOffCanTurnOn)
7458     {
7459         dd->comm->dlbState = edlbsOffTemporarilyLocked;
7460     }
7461 }
7462
7463 void dd_dlb_unlock(gmx_domdec_t *dd)
7464 {
7465     /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7466     if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
7467     {
7468         dd->comm->dlbState = edlbsOffCanTurnOn;
7469         dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
7470     }
7471 }
7472
7473 static void merge_cg_buffers(int ncell,
7474                              gmx_domdec_comm_dim_t *cd, int pulse,
7475                              int  *ncg_cell,
7476                              int  *index_gl, int  *recv_i,
7477                              rvec *cg_cm,    rvec *recv_vr,
7478                              int *cgindex,
7479                              cginfo_mb_t *cginfo_mb, int *cginfo)
7480 {
7481     gmx_domdec_ind_t *ind, *ind_p;
7482     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
7483     int               shift, shift_at;
7484
7485     ind = &cd->ind[pulse];
7486
7487     /* First correct the already stored data */
7488     shift = ind->nrecv[ncell];
7489     for (cell = ncell-1; cell >= 0; cell--)
7490     {
7491         shift -= ind->nrecv[cell];
7492         if (shift > 0)
7493         {
7494             /* Move the cg's present from previous grid pulses */
7495             cg0                = ncg_cell[ncell+cell];
7496             cg1                = ncg_cell[ncell+cell+1];
7497             cgindex[cg1+shift] = cgindex[cg1];
7498             for (cg = cg1-1; cg >= cg0; cg--)
7499             {
7500                 index_gl[cg+shift] = index_gl[cg];
7501                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7502                 cgindex[cg+shift] = cgindex[cg];
7503                 cginfo[cg+shift]  = cginfo[cg];
7504             }
7505             /* Correct the already stored send indices for the shift */
7506             for (p = 1; p <= pulse; p++)
7507             {
7508                 ind_p = &cd->ind[p];
7509                 cg0   = 0;
7510                 for (c = 0; c < cell; c++)
7511                 {
7512                     cg0 += ind_p->nsend[c];
7513                 }
7514                 cg1 = cg0 + ind_p->nsend[cell];
7515                 for (cg = cg0; cg < cg1; cg++)
7516                 {
7517                     ind_p->index[cg] += shift;
7518                 }
7519             }
7520         }
7521     }
7522
7523     /* Merge in the communicated buffers */
7524     shift    = 0;
7525     shift_at = 0;
7526     cg0      = 0;
7527     for (cell = 0; cell < ncell; cell++)
7528     {
7529         cg1 = ncg_cell[ncell+cell+1] + shift;
7530         if (shift_at > 0)
7531         {
7532             /* Correct the old cg indices */
7533             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7534             {
7535                 cgindex[cg+1] += shift_at;
7536             }
7537         }
7538         for (cg = 0; cg < ind->nrecv[cell]; cg++)
7539         {
7540             /* Copy this charge group from the buffer */
7541             index_gl[cg1] = recv_i[cg0];
7542             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7543             /* Add it to the cgindex */
7544             cg_gl          = index_gl[cg1];
7545             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
7546             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
7547             cgindex[cg1+1] = cgindex[cg1] + nat;
7548             cg0++;
7549             cg1++;
7550             shift_at += nat;
7551         }
7552         shift                 += ind->nrecv[cell];
7553         ncg_cell[ncell+cell+1] = cg1;
7554     }
7555 }
7556
7557 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7558                                int nzone, int cg0, const int *cgindex)
7559 {
7560     int cg, zone, p;
7561
7562     /* Store the atom block boundaries for easy copying of communication buffers
7563      */
7564     cg = cg0;
7565     for (zone = 0; zone < nzone; zone++)
7566     {
7567         for (p = 0; p < cd->np; p++)
7568         {
7569             cd->ind[p].cell2at0[zone] = cgindex[cg];
7570             cg += cd->ind[p].nrecv[zone];
7571             cd->ind[p].cell2at1[zone] = cgindex[cg];
7572         }
7573     }
7574 }
7575
7576 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7577 {
7578     int      i;
7579     gmx_bool bMiss;
7580
7581     bMiss = FALSE;
7582     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7583     {
7584         if (!bLocalCG[link->a[i]])
7585         {
7586             bMiss = TRUE;
7587         }
7588     }
7589
7590     return bMiss;
7591 }
7592
7593 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7594 typedef struct {
7595     real c[DIM][4]; /* the corners for the non-bonded communication */
7596     real cr0;       /* corner for rounding */
7597     real cr1[4];    /* corners for rounding */
7598     real bc[DIM];   /* corners for bounded communication */
7599     real bcr1;      /* corner for rounding for bonded communication */
7600 } dd_corners_t;
7601
7602 /* Determine the corners of the domain(s) we are communicating with */
7603 static void
7604 set_dd_corners(const gmx_domdec_t *dd,
7605                int dim0, int dim1, int dim2,
7606                gmx_bool bDistMB,
7607                dd_corners_t *c)
7608 {
7609     const gmx_domdec_comm_t  *comm;
7610     const gmx_domdec_zones_t *zones;
7611     int i, j;
7612
7613     comm = dd->comm;
7614
7615     zones = &comm->zones;
7616
7617     /* Keep the compiler happy */
7618     c->cr0  = 0;
7619     c->bcr1 = 0;
7620
7621     /* The first dimension is equal for all cells */
7622     c->c[0][0] = comm->cell_x0[dim0];
7623     if (bDistMB)
7624     {
7625         c->bc[0] = c->c[0][0];
7626     }
7627     if (dd->ndim >= 2)
7628     {
7629         dim1 = dd->dim[1];
7630         /* This cell row is only seen from the first row */
7631         c->c[1][0] = comm->cell_x0[dim1];
7632         /* All rows can see this row */
7633         c->c[1][1] = comm->cell_x0[dim1];
7634         if (dlbIsOn(dd->comm))
7635         {
7636             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7637             if (bDistMB)
7638             {
7639                 /* For the multi-body distance we need the maximum */
7640                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7641             }
7642         }
7643         /* Set the upper-right corner for rounding */
7644         c->cr0 = comm->cell_x1[dim0];
7645
7646         if (dd->ndim >= 3)
7647         {
7648             dim2 = dd->dim[2];
7649             for (j = 0; j < 4; j++)
7650             {
7651                 c->c[2][j] = comm->cell_x0[dim2];
7652             }
7653             if (dlbIsOn(dd->comm))
7654             {
7655                 /* Use the maximum of the i-cells that see a j-cell */
7656                 for (i = 0; i < zones->nizone; i++)
7657                 {
7658                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7659                     {
7660                         if (j >= 4)
7661                         {
7662                             c->c[2][j-4] =
7663                                 std::max(c->c[2][j-4],
7664                                          comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7665                         }
7666                     }
7667                 }
7668                 if (bDistMB)
7669                 {
7670                     /* For the multi-body distance we need the maximum */
7671                     c->bc[2] = comm->cell_x0[dim2];
7672                     for (i = 0; i < 2; i++)
7673                     {
7674                         for (j = 0; j < 2; j++)
7675                         {
7676                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7677                         }
7678                     }
7679                 }
7680             }
7681
7682             /* Set the upper-right corner for rounding */
7683             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7684              * Only cell (0,0,0) can see cell 7 (1,1,1)
7685              */
7686             c->cr1[0] = comm->cell_x1[dim1];
7687             c->cr1[3] = comm->cell_x1[dim1];
7688             if (dlbIsOn(dd->comm))
7689             {
7690                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7691                 if (bDistMB)
7692                 {
7693                     /* For the multi-body distance we need the maximum */
7694                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7695                 }
7696             }
7697         }
7698     }
7699 }
7700
7701 /* Determine which cg's we need to send in this pulse from this zone */
7702 static void
7703 get_zone_pulse_cgs(gmx_domdec_t *dd,
7704                    int zonei, int zone,
7705                    int cg0, int cg1,
7706                    const int *index_gl,
7707                    const int *cgindex,
7708                    int dim, int dim_ind,
7709                    int dim0, int dim1, int dim2,
7710                    real r_comm2, real r_bcomm2,
7711                    matrix box,
7712                    ivec tric_dist,
7713                    rvec *normal,
7714                    real skew_fac2_d, real skew_fac_01,
7715                    rvec *v_d, rvec *v_0, rvec *v_1,
7716                    const dd_corners_t *c,
7717                    rvec sf2_round,
7718                    gmx_bool bDistBonded,
7719                    gmx_bool bBondComm,
7720                    gmx_bool bDist2B,
7721                    gmx_bool bDistMB,
7722                    rvec *cg_cm,
7723                    int *cginfo,
7724                    gmx_domdec_ind_t *ind,
7725                    int **ibuf, int *ibuf_nalloc,
7726                    vec_rvec_t *vbuf,
7727                    int *nsend_ptr,
7728                    int *nat_ptr,
7729                    int *nsend_z_ptr)
7730 {
7731     gmx_domdec_comm_t *comm;
7732     gmx_bool           bScrew;
7733     gmx_bool           bDistMB_pulse;
7734     int                cg, i;
7735     real               r2, rb2, r, tric_sh;
7736     rvec               rn, rb;
7737     int                dimd;
7738     int                nsend_z, nsend, nat;
7739
7740     comm = dd->comm;
7741
7742     bScrew = (dd->bScrewPBC && dim == XX);
7743
7744     bDistMB_pulse = (bDistMB && bDistBonded);
7745
7746     nsend_z = 0;
7747     nsend   = *nsend_ptr;
7748     nat     = *nat_ptr;
7749
7750     for (cg = cg0; cg < cg1; cg++)
7751     {
7752         r2  = 0;
7753         rb2 = 0;
7754         if (tric_dist[dim_ind] == 0)
7755         {
7756             /* Rectangular direction, easy */
7757             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7758             if (r > 0)
7759             {
7760                 r2 += r*r;
7761             }
7762             if (bDistMB_pulse)
7763             {
7764                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7765                 if (r > 0)
7766                 {
7767                     rb2 += r*r;
7768                 }
7769             }
7770             /* Rounding gives at most a 16% reduction
7771              * in communicated atoms
7772              */
7773             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7774             {
7775                 r = cg_cm[cg][dim0] - c->cr0;
7776                 /* This is the first dimension, so always r >= 0 */
7777                 r2 += r*r;
7778                 if (bDistMB_pulse)
7779                 {
7780                     rb2 += r*r;
7781                 }
7782             }
7783             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7784             {
7785                 r = cg_cm[cg][dim1] - c->cr1[zone];
7786                 if (r > 0)
7787                 {
7788                     r2 += r*r;
7789                 }
7790                 if (bDistMB_pulse)
7791                 {
7792                     r = cg_cm[cg][dim1] - c->bcr1;
7793                     if (r > 0)
7794                     {
7795                         rb2 += r*r;
7796                     }
7797                 }
7798             }
7799         }
7800         else
7801         {
7802             /* Triclinic direction, more complicated */
7803             clear_rvec(rn);
7804             clear_rvec(rb);
7805             /* Rounding, conservative as the skew_fac multiplication
7806              * will slightly underestimate the distance.
7807              */
7808             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7809             {
7810                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7811                 for (i = dim0+1; i < DIM; i++)
7812                 {
7813                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7814                 }
7815                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7816                 if (bDistMB_pulse)
7817                 {
7818                     rb[dim0] = rn[dim0];
7819                     rb2      = r2;
7820                 }
7821                 /* Take care that the cell planes along dim0 might not
7822                  * be orthogonal to those along dim1 and dim2.
7823                  */
7824                 for (i = 1; i <= dim_ind; i++)
7825                 {
7826                     dimd = dd->dim[i];
7827                     if (normal[dim0][dimd] > 0)
7828                     {
7829                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7830                         if (bDistMB_pulse)
7831                         {
7832                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7833                         }
7834                     }
7835                 }
7836             }
7837             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7838             {
7839                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7840                 tric_sh   = 0;
7841                 for (i = dim1+1; i < DIM; i++)
7842                 {
7843                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7844                 }
7845                 rn[dim1] += tric_sh;
7846                 if (rn[dim1] > 0)
7847                 {
7848                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7849                     /* Take care of coupling of the distances
7850                      * to the planes along dim0 and dim1 through dim2.
7851                      */
7852                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7853                     /* Take care that the cell planes along dim1
7854                      * might not be orthogonal to that along dim2.
7855                      */
7856                     if (normal[dim1][dim2] > 0)
7857                     {
7858                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7859                     }
7860                 }
7861                 if (bDistMB_pulse)
7862                 {
7863                     rb[dim1] +=
7864                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7865                     if (rb[dim1] > 0)
7866                     {
7867                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7868                         /* Take care of coupling of the distances
7869                          * to the planes along dim0 and dim1 through dim2.
7870                          */
7871                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7872                         /* Take care that the cell planes along dim1
7873                          * might not be orthogonal to that along dim2.
7874                          */
7875                         if (normal[dim1][dim2] > 0)
7876                         {
7877                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7878                         }
7879                     }
7880                 }
7881             }
7882             /* The distance along the communication direction */
7883             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7884             tric_sh  = 0;
7885             for (i = dim+1; i < DIM; i++)
7886             {
7887                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7888             }
7889             rn[dim] += tric_sh;
7890             if (rn[dim] > 0)
7891             {
7892                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7893                 /* Take care of coupling of the distances
7894                  * to the planes along dim0 and dim1 through dim2.
7895                  */
7896                 if (dim_ind == 1 && zonei == 1)
7897                 {
7898                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7899                 }
7900             }
7901             if (bDistMB_pulse)
7902             {
7903                 clear_rvec(rb);
7904                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7905                 if (rb[dim] > 0)
7906                 {
7907                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7908                     /* Take care of coupling of the distances
7909                      * to the planes along dim0 and dim1 through dim2.
7910                      */
7911                     if (dim_ind == 1 && zonei == 1)
7912                     {
7913                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7914                     }
7915                 }
7916             }
7917         }
7918
7919         if (r2 < r_comm2 ||
7920             (bDistBonded &&
7921              ((bDistMB && rb2 < r_bcomm2) ||
7922               (bDist2B && r2  < r_bcomm2)) &&
7923              (!bBondComm ||
7924               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7925                missing_link(comm->cglink, index_gl[cg],
7926                             comm->bLocalCG)))))
7927         {
7928             /* Make an index to the local charge groups */
7929             if (nsend+1 > ind->nalloc)
7930             {
7931                 ind->nalloc = over_alloc_large(nsend+1);
7932                 srenew(ind->index, ind->nalloc);
7933             }
7934             if (nsend+1 > *ibuf_nalloc)
7935             {
7936                 *ibuf_nalloc = over_alloc_large(nsend+1);
7937                 srenew(*ibuf, *ibuf_nalloc);
7938             }
7939             ind->index[nsend] = cg;
7940             (*ibuf)[nsend]    = index_gl[cg];
7941             nsend_z++;
7942             vec_rvec_check_alloc(vbuf, nsend+1);
7943
7944             if (dd->ci[dim] == 0)
7945             {
7946                 /* Correct cg_cm for pbc */
7947                 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7948                 if (bScrew)
7949                 {
7950                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7951                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7952                 }
7953             }
7954             else
7955             {
7956                 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7957             }
7958             nsend++;
7959             nat += cgindex[cg+1] - cgindex[cg];
7960         }
7961     }
7962
7963     *nsend_ptr   = nsend;
7964     *nat_ptr     = nat;
7965     *nsend_z_ptr = nsend_z;
7966 }
7967
7968 static void setup_dd_communication(gmx_domdec_t *dd,
7969                                    matrix box, gmx_ddbox_t *ddbox,
7970                                    t_forcerec *fr,
7971                                    t_state *state, PaddedRVecVector *f)
7972 {
7973     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7974     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
7975     int                    c, i, cg, cg_gl, nrcg;
7976     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7977     gmx_domdec_comm_t     *comm;
7978     gmx_domdec_zones_t    *zones;
7979     gmx_domdec_comm_dim_t *cd;
7980     gmx_domdec_ind_t      *ind;
7981     cginfo_mb_t           *cginfo_mb;
7982     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
7983     real                   r_comm2, r_bcomm2;
7984     dd_corners_t           corners;
7985     ivec                   tric_dist;
7986     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
7987     real                   skew_fac2_d, skew_fac_01;
7988     rvec                   sf2_round;
7989     int                    nsend, nat;
7990     int                    th;
7991
7992     if (debug)
7993     {
7994         fprintf(debug, "Setting up DD communication\n");
7995     }
7996
7997     comm  = dd->comm;
7998
7999     if (comm->nth == 0)
8000     {
8001         /* Initialize the thread data.
8002          * This can not be done in init_domain_decomposition,
8003          * as the numbers of threads is determined later.
8004          */
8005         comm->nth = gmx_omp_nthreads_get(emntDomdec);
8006         if (comm->nth > 1)
8007         {
8008             snew(comm->dth, comm->nth);
8009         }
8010     }
8011
8012     switch (fr->cutoff_scheme)
8013     {
8014         case ecutsGROUP:
8015             cg_cm = fr->cg_cm;
8016             break;
8017         case ecutsVERLET:
8018             cg_cm = as_rvec_array(state->x.data());
8019             break;
8020         default:
8021             gmx_incons("unimplemented");
8022             cg_cm = NULL;
8023     }
8024
8025     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8026     {
8027         /* Check if we need to use triclinic distances */
8028         tric_dist[dim_ind] = 0;
8029         for (i = 0; i <= dim_ind; i++)
8030         {
8031             if (ddbox->tric_dir[dd->dim[i]])
8032             {
8033                 tric_dist[dim_ind] = 1;
8034             }
8035         }
8036     }
8037
8038     bBondComm = comm->bBondComm;
8039
8040     /* Do we need to determine extra distances for multi-body bondeds? */
8041     bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
8042
8043     /* Do we need to determine extra distances for only two-body bondeds? */
8044     bDist2B = (bBondComm && !bDistMB);
8045
8046     r_comm2  = gmx::square(comm->cutoff);
8047     r_bcomm2 = gmx::square(comm->cutoff_mbody);
8048
8049     if (debug)
8050     {
8051         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, std::sqrt(r_bcomm2));
8052     }
8053
8054     zones = &comm->zones;
8055
8056     dim0 = dd->dim[0];
8057     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8058     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8059
8060     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8061
8062     /* Triclinic stuff */
8063     normal      = ddbox->normal;
8064     skew_fac_01 = 0;
8065     if (dd->ndim >= 2)
8066     {
8067         v_0 = ddbox->v[dim0];
8068         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8069         {
8070             /* Determine the coupling coefficient for the distances
8071              * to the cell planes along dim0 and dim1 through dim2.
8072              * This is required for correct rounding.
8073              */
8074             skew_fac_01 =
8075                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8076             if (debug)
8077             {
8078                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8079             }
8080         }
8081     }
8082     if (dd->ndim >= 3)
8083     {
8084         v_1 = ddbox->v[dim1];
8085     }
8086
8087     zone_cg_range = zones->cg_range;
8088     index_gl      = dd->index_gl;
8089     cgindex       = dd->cgindex;
8090     cginfo_mb     = fr->cginfo_mb;
8091
8092     zone_cg_range[0]   = 0;
8093     zone_cg_range[1]   = dd->ncg_home;
8094     comm->zone_ncg1[0] = dd->ncg_home;
8095     pos_cg             = dd->ncg_home;
8096
8097     nat_tot = dd->nat_home;
8098     nzone   = 1;
8099     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8100     {
8101         dim = dd->dim[dim_ind];
8102         cd  = &comm->cd[dim_ind];
8103
8104         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8105         {
8106             /* No pbc in this dimension, the first node should not comm. */
8107             nzone_send = 0;
8108         }
8109         else
8110         {
8111             nzone_send = nzone;
8112         }
8113
8114         v_d         = ddbox->v[dim];
8115         skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
8116
8117         cd->bInPlace = TRUE;
8118         for (p = 0; p < cd->np; p++)
8119         {
8120             /* Only atoms communicated in the first pulse are used
8121              * for multi-body bonded interactions or for bBondComm.
8122              */
8123             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8124
8125             ind   = &cd->ind[p];
8126             nsend = 0;
8127             nat   = 0;
8128             for (zone = 0; zone < nzone_send; zone++)
8129             {
8130                 if (tric_dist[dim_ind] && dim_ind > 0)
8131                 {
8132                     /* Determine slightly more optimized skew_fac's
8133                      * for rounding.
8134                      * This reduces the number of communicated atoms
8135                      * by about 10% for 3D DD of rhombic dodecahedra.
8136                      */
8137                     for (dimd = 0; dimd < dim; dimd++)
8138                     {
8139                         sf2_round[dimd] = 1;
8140                         if (ddbox->tric_dir[dimd])
8141                         {
8142                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8143                             {
8144                                 /* If we are shifted in dimension i
8145                                  * and the cell plane is tilted forward
8146                                  * in dimension i, skip this coupling.
8147                                  */
8148                                 if (!(zones->shift[nzone+zone][i] &&
8149                                       ddbox->v[dimd][i][dimd] >= 0))
8150                                 {
8151                                     sf2_round[dimd] +=
8152                                         gmx::square(ddbox->v[dimd][i][dimd]);
8153                                 }
8154                             }
8155                             sf2_round[dimd] = 1/sf2_round[dimd];
8156                         }
8157                     }
8158                 }
8159
8160                 zonei = zone_perm[dim_ind][zone];
8161                 if (p == 0)
8162                 {
8163                     /* Here we permutate the zones to obtain a convenient order
8164                      * for neighbor searching
8165                      */
8166                     cg0 = zone_cg_range[zonei];
8167                     cg1 = zone_cg_range[zonei+1];
8168                 }
8169                 else
8170                 {
8171                     /* Look only at the cg's received in the previous grid pulse
8172                      */
8173                     cg1 = zone_cg_range[nzone+zone+1];
8174                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8175                 }
8176
8177 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8178                 for (th = 0; th < comm->nth; th++)
8179                 {
8180                     try
8181                     {
8182                         gmx_domdec_ind_t *ind_p;
8183                         int             **ibuf_p, *ibuf_nalloc_p;
8184                         vec_rvec_t       *vbuf_p;
8185                         int              *nsend_p, *nat_p;
8186                         int              *nsend_zone_p;
8187                         int               cg0_th, cg1_th;
8188
8189                         if (th == 0)
8190                         {
8191                             /* Thread 0 writes in the comm buffers */
8192                             ind_p         = ind;
8193                             ibuf_p        = &comm->buf_int;
8194                             ibuf_nalloc_p = &comm->nalloc_int;
8195                             vbuf_p        = &comm->vbuf;
8196                             nsend_p       = &nsend;
8197                             nat_p         = &nat;
8198                             nsend_zone_p  = &ind->nsend[zone];
8199                         }
8200                         else
8201                         {
8202                             /* Other threads write into temp buffers */
8203                             ind_p         = &comm->dth[th].ind;
8204                             ibuf_p        = &comm->dth[th].ibuf;
8205                             ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8206                             vbuf_p        = &comm->dth[th].vbuf;
8207                             nsend_p       = &comm->dth[th].nsend;
8208                             nat_p         = &comm->dth[th].nat;
8209                             nsend_zone_p  = &comm->dth[th].nsend_zone;
8210
8211                             comm->dth[th].nsend      = 0;
8212                             comm->dth[th].nat        = 0;
8213                             comm->dth[th].nsend_zone = 0;
8214                         }
8215
8216                         if (comm->nth == 1)
8217                         {
8218                             cg0_th = cg0;
8219                             cg1_th = cg1;
8220                         }
8221                         else
8222                         {
8223                             cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8224                             cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8225                         }
8226
8227                         /* Get the cg's for this pulse in this zone */
8228                         get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8229                                            index_gl, cgindex,
8230                                            dim, dim_ind, dim0, dim1, dim2,
8231                                            r_comm2, r_bcomm2,
8232                                            box, tric_dist,
8233                                            normal, skew_fac2_d, skew_fac_01,
8234                                            v_d, v_0, v_1, &corners, sf2_round,
8235                                            bDistBonded, bBondComm,
8236                                            bDist2B, bDistMB,
8237                                            cg_cm, fr->cginfo,
8238                                            ind_p,
8239                                            ibuf_p, ibuf_nalloc_p,
8240                                            vbuf_p,
8241                                            nsend_p, nat_p,
8242                                            nsend_zone_p);
8243                     }
8244                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
8245                 } // END
8246
8247                 /* Append data of threads>=1 to the communication buffers */
8248                 for (th = 1; th < comm->nth; th++)
8249                 {
8250                     dd_comm_setup_work_t *dth;
8251                     int                   i, ns1;
8252
8253                     dth = &comm->dth[th];
8254
8255                     ns1 = nsend + dth->nsend_zone;
8256                     if (ns1 > ind->nalloc)
8257                     {
8258                         ind->nalloc = over_alloc_dd(ns1);
8259                         srenew(ind->index, ind->nalloc);
8260                     }
8261                     if (ns1 > comm->nalloc_int)
8262                     {
8263                         comm->nalloc_int = over_alloc_dd(ns1);
8264                         srenew(comm->buf_int, comm->nalloc_int);
8265                     }
8266                     if (ns1 > comm->vbuf.nalloc)
8267                     {
8268                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8269                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8270                     }
8271
8272                     for (i = 0; i < dth->nsend_zone; i++)
8273                     {
8274                         ind->index[nsend]    = dth->ind.index[i];
8275                         comm->buf_int[nsend] = dth->ibuf[i];
8276                         copy_rvec(dth->vbuf.v[i],
8277                                   comm->vbuf.v[nsend]);
8278                         nsend++;
8279                     }
8280                     nat              += dth->nat;
8281                     ind->nsend[zone] += dth->nsend_zone;
8282                 }
8283             }
8284             /* Clear the counts in case we do not have pbc */
8285             for (zone = nzone_send; zone < nzone; zone++)
8286             {
8287                 ind->nsend[zone] = 0;
8288             }
8289             ind->nsend[nzone]   = nsend;
8290             ind->nsend[nzone+1] = nat;
8291             /* Communicate the number of cg's and atoms to receive */
8292             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8293                             ind->nsend, nzone+2,
8294                             ind->nrecv, nzone+2);
8295
8296             /* The rvec buffer is also required for atom buffers of size nsend
8297              * in dd_move_x and dd_move_f.
8298              */
8299             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8300
8301             if (p > 0)
8302             {
8303                 /* We can receive in place if only the last zone is not empty */
8304                 for (zone = 0; zone < nzone-1; zone++)
8305                 {
8306                     if (ind->nrecv[zone] > 0)
8307                     {
8308                         cd->bInPlace = FALSE;
8309                     }
8310                 }
8311                 if (!cd->bInPlace)
8312                 {
8313                     /* The int buffer is only required here for the cg indices */
8314                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8315                     {
8316                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8317                         srenew(comm->buf_int2, comm->nalloc_int2);
8318                     }
8319                     /* The rvec buffer is also required for atom buffers
8320                      * of size nrecv in dd_move_x and dd_move_f.
8321                      */
8322                     i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8323                     vec_rvec_check_alloc(&comm->vbuf2, i);
8324                 }
8325             }
8326
8327             /* Make space for the global cg indices */
8328             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8329                 || dd->cg_nalloc == 0)
8330             {
8331                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8332                 srenew(index_gl, dd->cg_nalloc);
8333                 srenew(cgindex, dd->cg_nalloc+1);
8334             }
8335             /* Communicate the global cg indices */
8336             if (cd->bInPlace)
8337             {
8338                 recv_i = index_gl + pos_cg;
8339             }
8340             else
8341             {
8342                 recv_i = comm->buf_int2;
8343             }
8344             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8345                             comm->buf_int, nsend,
8346                             recv_i,        ind->nrecv[nzone]);
8347
8348             /* Make space for cg_cm */
8349             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8350             if (fr->cutoff_scheme == ecutsGROUP)
8351             {
8352                 cg_cm = fr->cg_cm;
8353             }
8354             else
8355             {
8356                 cg_cm = as_rvec_array(state->x.data());
8357             }
8358             /* Communicate cg_cm */
8359             if (cd->bInPlace)
8360             {
8361                 recv_vr = cg_cm + pos_cg;
8362             }
8363             else
8364             {
8365                 recv_vr = comm->vbuf2.v;
8366             }
8367             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8368                              comm->vbuf.v, nsend,
8369                              recv_vr,      ind->nrecv[nzone]);
8370
8371             /* Make the charge group index */
8372             if (cd->bInPlace)
8373             {
8374                 zone = (p == 0 ? 0 : nzone - 1);
8375                 while (zone < nzone)
8376                 {
8377                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8378                     {
8379                         cg_gl              = index_gl[pos_cg];
8380                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8381                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8382                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8383                         if (bBondComm)
8384                         {
8385                             /* Update the charge group presence,
8386                              * so we can use it in the next pass of the loop.
8387                              */
8388                             comm->bLocalCG[cg_gl] = TRUE;
8389                         }
8390                         pos_cg++;
8391                     }
8392                     if (p == 0)
8393                     {
8394                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8395                     }
8396                     zone++;
8397                     zone_cg_range[nzone+zone] = pos_cg;
8398                 }
8399             }
8400             else
8401             {
8402                 /* This part of the code is never executed with bBondComm. */
8403                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8404                                  index_gl, recv_i, cg_cm, recv_vr,
8405                                  cgindex, fr->cginfo_mb, fr->cginfo);
8406                 pos_cg += ind->nrecv[nzone];
8407             }
8408             nat_tot += ind->nrecv[nzone+1];
8409         }
8410         if (!cd->bInPlace)
8411         {
8412             /* Store the atom block for easy copying of communication buffers */
8413             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8414         }
8415         nzone += nzone;
8416     }
8417     dd->index_gl = index_gl;
8418     dd->cgindex  = cgindex;
8419
8420     dd->ncg_tot          = zone_cg_range[zones->n];
8421     dd->nat_tot          = nat_tot;
8422     comm->nat[ddnatHOME] = dd->nat_home;
8423     for (i = ddnatZONE; i < ddnatNR; i++)
8424     {
8425         comm->nat[i] = dd->nat_tot;
8426     }
8427
8428     if (!bBondComm)
8429     {
8430         /* We don't need to update cginfo, since that was alrady done above.
8431          * So we pass NULL for the forcerec.
8432          */
8433         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8434                       NULL, comm->bLocalCG);
8435     }
8436
8437     if (debug)
8438     {
8439         fprintf(debug, "Finished setting up DD communication, zones:");
8440         for (c = 0; c < zones->n; c++)
8441         {
8442             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8443         }
8444         fprintf(debug, "\n");
8445     }
8446 }
8447
8448 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8449 {
8450     int c;
8451
8452     for (c = 0; c < zones->nizone; c++)
8453     {
8454         zones->izone[c].cg1  = zones->cg_range[c+1];
8455         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8456         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8457     }
8458 }
8459
8460 static void set_zones_size(gmx_domdec_t *dd,
8461                            matrix box, const gmx_ddbox_t *ddbox,
8462                            int zone_start, int zone_end)
8463 {
8464     gmx_domdec_comm_t  *comm;
8465     gmx_domdec_zones_t *zones;
8466     gmx_bool            bDistMB;
8467     int                 z, zi, d, dim;
8468     real                rcs, rcmbs;
8469     int                 i, j;
8470     real                vol;
8471
8472     comm = dd->comm;
8473
8474     zones = &comm->zones;
8475
8476     /* Do we need to determine extra distances for multi-body bondeds? */
8477     bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
8478
8479     for (z = zone_start; z < zone_end; z++)
8480     {
8481         /* Copy cell limits to zone limits.
8482          * Valid for non-DD dims and non-shifted dims.
8483          */
8484         copy_rvec(comm->cell_x0, zones->size[z].x0);
8485         copy_rvec(comm->cell_x1, zones->size[z].x1);
8486     }
8487
8488     for (d = 0; d < dd->ndim; d++)
8489     {
8490         dim = dd->dim[d];
8491
8492         for (z = 0; z < zones->n; z++)
8493         {
8494             /* With a staggered grid we have different sizes
8495              * for non-shifted dimensions.
8496              */
8497             if (dlbIsOn(dd->comm) && zones->shift[z][dim] == 0)
8498             {
8499                 if (d == 1)
8500                 {
8501                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8502                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8503                 }
8504                 else if (d == 2)
8505                 {
8506                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8507                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8508                 }
8509             }
8510         }
8511
8512         rcs   = comm->cutoff;
8513         rcmbs = comm->cutoff_mbody;
8514         if (ddbox->tric_dir[dim])
8515         {
8516             rcs   /= ddbox->skew_fac[dim];
8517             rcmbs /= ddbox->skew_fac[dim];
8518         }
8519
8520         /* Set the lower limit for the shifted zone dimensions */
8521         for (z = zone_start; z < zone_end; z++)
8522         {
8523             if (zones->shift[z][dim] > 0)
8524             {
8525                 dim = dd->dim[d];
8526                 if (!dlbIsOn(dd->comm) || d == 0)
8527                 {
8528                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8529                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8530                 }
8531                 else
8532                 {
8533                     /* Here we take the lower limit of the zone from
8534                      * the lowest domain of the zone below.
8535                      */
8536                     if (z < 4)
8537                     {
8538                         zones->size[z].x0[dim] =
8539                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8540                     }
8541                     else
8542                     {
8543                         if (d == 1)
8544                         {
8545                             zones->size[z].x0[dim] =
8546                                 zones->size[zone_perm[2][z-4]].x0[dim];
8547                         }
8548                         else
8549                         {
8550                             zones->size[z].x0[dim] =
8551                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8552                         }
8553                     }
8554                     /* A temporary limit, is updated below */
8555                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8556
8557                     if (bDistMB)
8558                     {
8559                         for (zi = 0; zi < zones->nizone; zi++)
8560                         {
8561                             if (zones->shift[zi][dim] == 0)
8562                             {
8563                                 /* This takes the whole zone into account.
8564                                  * With multiple pulses this will lead
8565                                  * to a larger zone then strictly necessary.
8566                                  */
8567                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8568                                                                   zones->size[zi].x1[dim]+rcmbs);
8569                             }
8570                         }
8571                     }
8572                 }
8573             }
8574         }
8575
8576         /* Loop over the i-zones to set the upper limit of each
8577          * j-zone they see.
8578          */
8579         for (zi = 0; zi < zones->nizone; zi++)
8580         {
8581             if (zones->shift[zi][dim] == 0)
8582             {
8583                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8584                 {
8585                     if (zones->shift[z][dim] > 0)
8586                     {
8587                         zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8588                                                           zones->size[zi].x1[dim]+rcs);
8589                     }
8590                 }
8591             }
8592         }
8593     }
8594
8595     for (z = zone_start; z < zone_end; z++)
8596     {
8597         /* Initialization only required to keep the compiler happy */
8598         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8599         int  nc, c;
8600
8601         /* To determine the bounding box for a zone we need to find
8602          * the extreme corners of 4, 2 or 1 corners.
8603          */
8604         nc = 1 << (ddbox->nboundeddim - 1);
8605
8606         for (c = 0; c < nc; c++)
8607         {
8608             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8609             corner[XX] = 0;
8610             if ((c & 1) == 0)
8611             {
8612                 corner[YY] = zones->size[z].x0[YY];
8613             }
8614             else
8615             {
8616                 corner[YY] = zones->size[z].x1[YY];
8617             }
8618             if ((c & 2) == 0)
8619             {
8620                 corner[ZZ] = zones->size[z].x0[ZZ];
8621             }
8622             else
8623             {
8624                 corner[ZZ] = zones->size[z].x1[ZZ];
8625             }
8626             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8627                 box[ZZ][1 - dd->dim[0]] != 0)
8628             {
8629                 /* With 1D domain decomposition the cg's are not in
8630                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
8631                  * Shift the corner of the z-vector back to along the box
8632                  * vector of dimension d, so it will later end up at 0 along d.
8633                  * This can affect the location of this corner along dd->dim[0]
8634                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
8635                  */
8636                 int d = 1 - dd->dim[0];
8637
8638                 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8639             }
8640             /* Apply the triclinic couplings */
8641             assert(ddbox->npbcdim <= DIM);
8642             for (i = YY; i < ddbox->npbcdim; i++)
8643             {
8644                 for (j = XX; j < i; j++)
8645                 {
8646                     corner[j] += corner[i]*box[i][j]/box[i][i];
8647                 }
8648             }
8649             if (c == 0)
8650             {
8651                 copy_rvec(corner, corner_min);
8652                 copy_rvec(corner, corner_max);
8653             }
8654             else
8655             {
8656                 for (i = 0; i < DIM; i++)
8657                 {
8658                     corner_min[i] = std::min(corner_min[i], corner[i]);
8659                     corner_max[i] = std::max(corner_max[i], corner[i]);
8660                 }
8661             }
8662         }
8663         /* Copy the extreme cornes without offset along x */
8664         for (i = 0; i < DIM; i++)
8665         {
8666             zones->size[z].bb_x0[i] = corner_min[i];
8667             zones->size[z].bb_x1[i] = corner_max[i];
8668         }
8669         /* Add the offset along x */
8670         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8671         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8672     }
8673
8674     if (zone_start == 0)
8675     {
8676         vol = 1;
8677         for (dim = 0; dim < DIM; dim++)
8678         {
8679             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8680         }
8681         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8682     }
8683
8684     if (debug)
8685     {
8686         for (z = zone_start; z < zone_end; z++)
8687         {
8688             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8689                     z,
8690                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8691                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8692                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8693             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8694                     z,
8695                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8696                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8697                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8698         }
8699     }
8700 }
8701
8702 static int comp_cgsort(const void *a, const void *b)
8703 {
8704     int           comp;
8705
8706     gmx_cgsort_t *cga, *cgb;
8707     cga = (gmx_cgsort_t *)a;
8708     cgb = (gmx_cgsort_t *)b;
8709
8710     comp = cga->nsc - cgb->nsc;
8711     if (comp == 0)
8712     {
8713         comp = cga->ind_gl - cgb->ind_gl;
8714     }
8715
8716     return comp;
8717 }
8718
8719 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8720                          int *a, int *buf)
8721 {
8722     int i;
8723
8724     /* Order the data */
8725     for (i = 0; i < n; i++)
8726     {
8727         buf[i] = a[sort[i].ind];
8728     }
8729
8730     /* Copy back to the original array */
8731     for (i = 0; i < n; i++)
8732     {
8733         a[i] = buf[i];
8734     }
8735 }
8736
8737 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8738                          rvec *v, rvec *buf)
8739 {
8740     int i;
8741
8742     /* Order the data */
8743     for (i = 0; i < n; i++)
8744     {
8745         copy_rvec(v[sort[i].ind], buf[i]);
8746     }
8747
8748     /* Copy back to the original array */
8749     for (i = 0; i < n; i++)
8750     {
8751         copy_rvec(buf[i], v[i]);
8752     }
8753 }
8754
8755 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8756                            rvec *v, rvec *buf)
8757 {
8758     int a, atot, cg, cg0, cg1, i;
8759
8760     if (cgindex == NULL)
8761     {
8762         /* Avoid the useless loop of the atoms within a cg */
8763         order_vec_cg(ncg, sort, v, buf);
8764
8765         return;
8766     }
8767
8768     /* Order the data */
8769     a = 0;
8770     for (cg = 0; cg < ncg; cg++)
8771     {
8772         cg0 = cgindex[sort[cg].ind];
8773         cg1 = cgindex[sort[cg].ind+1];
8774         for (i = cg0; i < cg1; i++)
8775         {
8776             copy_rvec(v[i], buf[a]);
8777             a++;
8778         }
8779     }
8780     atot = a;
8781
8782     /* Copy back to the original array */
8783     for (a = 0; a < atot; a++)
8784     {
8785         copy_rvec(buf[a], v[a]);
8786     }
8787 }
8788
8789 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8790                          int nsort_new, gmx_cgsort_t *sort_new,
8791                          gmx_cgsort_t *sort1)
8792 {
8793     int i1, i2, i_new;
8794
8795     /* The new indices are not very ordered, so we qsort them */
8796     gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8797
8798     /* sort2 is already ordered, so now we can merge the two arrays */
8799     i1    = 0;
8800     i2    = 0;
8801     i_new = 0;
8802     while (i2 < nsort2 || i_new < nsort_new)
8803     {
8804         if (i2 == nsort2)
8805         {
8806             sort1[i1++] = sort_new[i_new++];
8807         }
8808         else if (i_new == nsort_new)
8809         {
8810             sort1[i1++] = sort2[i2++];
8811         }
8812         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8813                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8814                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8815         {
8816             sort1[i1++] = sort2[i2++];
8817         }
8818         else
8819         {
8820             sort1[i1++] = sort_new[i_new++];
8821         }
8822     }
8823 }
8824
8825 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8826 {
8827     gmx_domdec_sort_t *sort;
8828     gmx_cgsort_t      *cgsort, *sort_i;
8829     int                ncg_new, nsort2, nsort_new, i, *a, moved;
8830
8831     sort = dd->comm->sort;
8832
8833     a = fr->ns->grid->cell_index;
8834
8835     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
8836
8837     if (ncg_home_old >= 0)
8838     {
8839         /* The charge groups that remained in the same ns grid cell
8840          * are completely ordered. So we can sort efficiently by sorting
8841          * the charge groups that did move into the stationary list.
8842          */
8843         ncg_new   = 0;
8844         nsort2    = 0;
8845         nsort_new = 0;
8846         for (i = 0; i < dd->ncg_home; i++)
8847         {
8848             /* Check if this cg did not move to another node */
8849             if (a[i] < moved)
8850             {
8851                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8852                 {
8853                     /* This cg is new on this node or moved ns grid cell */
8854                     if (nsort_new >= sort->sort_new_nalloc)
8855                     {
8856                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8857                         srenew(sort->sort_new, sort->sort_new_nalloc);
8858                     }
8859                     sort_i = &(sort->sort_new[nsort_new++]);
8860                 }
8861                 else
8862                 {
8863                     /* This cg did not move */
8864                     sort_i = &(sort->sort2[nsort2++]);
8865                 }
8866                 /* Sort on the ns grid cell indices
8867                  * and the global topology index.
8868                  * index_gl is irrelevant with cell ns,
8869                  * but we set it here anyhow to avoid a conditional.
8870                  */
8871                 sort_i->nsc    = a[i];
8872                 sort_i->ind_gl = dd->index_gl[i];
8873                 sort_i->ind    = i;
8874                 ncg_new++;
8875             }
8876         }
8877         if (debug)
8878         {
8879             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8880                     nsort2, nsort_new);
8881         }
8882         /* Sort efficiently */
8883         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8884                      sort->sort);
8885     }
8886     else
8887     {
8888         cgsort  = sort->sort;
8889         ncg_new = 0;
8890         for (i = 0; i < dd->ncg_home; i++)
8891         {
8892             /* Sort on the ns grid cell indices
8893              * and the global topology index
8894              */
8895             cgsort[i].nsc    = a[i];
8896             cgsort[i].ind_gl = dd->index_gl[i];
8897             cgsort[i].ind    = i;
8898             if (cgsort[i].nsc < moved)
8899             {
8900                 ncg_new++;
8901             }
8902         }
8903         if (debug)
8904         {
8905             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8906         }
8907         /* Determine the order of the charge groups using qsort */
8908         gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8909     }
8910
8911     return ncg_new;
8912 }
8913
8914 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8915 {
8916     gmx_cgsort_t *sort;
8917     int           ncg_new, i, na;
8918     const int    *a;
8919
8920     sort = dd->comm->sort->sort;
8921
8922     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8923
8924     ncg_new = 0;
8925     for (i = 0; i < na; i++)
8926     {
8927         if (a[i] >= 0)
8928         {
8929             sort[ncg_new].ind = a[i];
8930             ncg_new++;
8931         }
8932     }
8933
8934     return ncg_new;
8935 }
8936
8937 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
8938                           int ncg_home_old)
8939 {
8940     gmx_domdec_sort_t *sort;
8941     gmx_cgsort_t      *cgsort;
8942     int               *cgindex;
8943     int                ncg_new, i, *ibuf, cgsize;
8944     rvec              *vbuf;
8945
8946     sort = dd->comm->sort;
8947
8948     if (dd->ncg_home > sort->sort_nalloc)
8949     {
8950         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8951         srenew(sort->sort, sort->sort_nalloc);
8952         srenew(sort->sort2, sort->sort_nalloc);
8953     }
8954     cgsort = sort->sort;
8955
8956     switch (fr->cutoff_scheme)
8957     {
8958         case ecutsGROUP:
8959             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8960             break;
8961         case ecutsVERLET:
8962             ncg_new = dd_sort_order_nbnxn(dd, fr);
8963             break;
8964         default:
8965             gmx_incons("unimplemented");
8966             ncg_new = 0;
8967     }
8968
8969     /* We alloc with the old size, since cgindex is still old */
8970     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8971     vbuf = dd->comm->vbuf.v;
8972
8973     if (dd->comm->bCGs)
8974     {
8975         cgindex = dd->cgindex;
8976     }
8977     else
8978     {
8979         cgindex = NULL;
8980     }
8981
8982     /* Remove the charge groups which are no longer at home here */
8983     dd->ncg_home = ncg_new;
8984     if (debug)
8985     {
8986         fprintf(debug, "Set the new home charge group count to %d\n",
8987                 dd->ncg_home);
8988     }
8989
8990     /* Reorder the state */
8991     for (i = 0; i < estNR; i++)
8992     {
8993         if (EST_DISTR(i) && (state->flags & (1<<i)))
8994         {
8995             switch (i)
8996             {
8997                 case estX:
8998                     order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->x.data()), vbuf);
8999                     break;
9000                 case estV:
9001                     order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->v.data()), vbuf);
9002                     break;
9003                 case est_SDX_NOTSUPPORTED:
9004                     break;
9005                 case estCGP:
9006                     order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->cg_p.data()), vbuf);
9007                     break;
9008                 case estLD_RNG:
9009                 case estLD_RNGI:
9010                 case estDISRE_INITF:
9011                 case estDISRE_RM3TAV:
9012                 case estORIRE_INITF:
9013                 case estORIRE_DTAV:
9014                     /* No ordering required */
9015                     break;
9016                 default:
9017                     gmx_incons("Unknown state entry encountered in dd_sort_state");
9018                     break;
9019             }
9020         }
9021     }
9022     if (fr->cutoff_scheme == ecutsGROUP)
9023     {
9024         /* Reorder cgcm */
9025         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9026     }
9027
9028     if (dd->ncg_home+1 > sort->ibuf_nalloc)
9029     {
9030         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9031         srenew(sort->ibuf, sort->ibuf_nalloc);
9032     }
9033     ibuf = sort->ibuf;
9034     /* Reorder the global cg index */
9035     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9036     /* Reorder the cginfo */
9037     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9038     /* Rebuild the local cg index */
9039     if (dd->comm->bCGs)
9040     {
9041         ibuf[0] = 0;
9042         for (i = 0; i < dd->ncg_home; i++)
9043         {
9044             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9045             ibuf[i+1] = ibuf[i] + cgsize;
9046         }
9047         for (i = 0; i < dd->ncg_home+1; i++)
9048         {
9049             dd->cgindex[i] = ibuf[i];
9050         }
9051     }
9052     else
9053     {
9054         for (i = 0; i < dd->ncg_home+1; i++)
9055         {
9056             dd->cgindex[i] = i;
9057         }
9058     }
9059     /* Set the home atom number */
9060     dd->nat_home = dd->cgindex[dd->ncg_home];
9061
9062     if (fr->cutoff_scheme == ecutsVERLET)
9063     {
9064         /* The atoms are now exactly in grid order, update the grid order */
9065         nbnxn_set_atomorder(fr->nbv->nbs);
9066     }
9067     else
9068     {
9069         /* Copy the sorted ns cell indices back to the ns grid struct */
9070         for (i = 0; i < dd->ncg_home; i++)
9071         {
9072             fr->ns->grid->cell_index[i] = cgsort[i].nsc;
9073         }
9074         fr->ns->grid->nr = dd->ncg_home;
9075     }
9076 }
9077
9078 static void add_dd_statistics(gmx_domdec_t *dd)
9079 {
9080     gmx_domdec_comm_t *comm;
9081     int                ddnat;
9082
9083     comm = dd->comm;
9084
9085     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9086     {
9087         comm->sum_nat[ddnat-ddnatZONE] +=
9088             comm->nat[ddnat] - comm->nat[ddnat-1];
9089     }
9090     comm->ndecomp++;
9091 }
9092
9093 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9094 {
9095     gmx_domdec_comm_t *comm;
9096     int                ddnat;
9097
9098     comm = dd->comm;
9099
9100     /* Reset all the statistics and counters for total run counting */
9101     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9102     {
9103         comm->sum_nat[ddnat-ddnatZONE] = 0;
9104     }
9105     comm->ndecomp   = 0;
9106     comm->nload     = 0;
9107     comm->load_step = 0;
9108     comm->load_sum  = 0;
9109     comm->load_max  = 0;
9110     clear_ivec(comm->load_lim);
9111     comm->load_mdf = 0;
9112     comm->load_pme = 0;
9113 }
9114
9115 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9116 {
9117     gmx_domdec_comm_t *comm;
9118     int                ddnat;
9119     double             av;
9120
9121     comm = cr->dd->comm;
9122
9123     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9124
9125     if (fplog == NULL)
9126     {
9127         return;
9128     }
9129
9130     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9131
9132     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9133     {
9134         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9135         switch (ddnat)
9136         {
9137             case ddnatZONE:
9138                 fprintf(fplog,
9139                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9140                         2, av);
9141                 break;
9142             case ddnatVSITE:
9143                 if (cr->dd->vsite_comm)
9144                 {
9145                     fprintf(fplog,
9146                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9147                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9148                             av);
9149                 }
9150                 break;
9151             case ddnatCON:
9152                 if (cr->dd->constraint_comm)
9153                 {
9154                     fprintf(fplog,
9155                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9156                             1 + ir->nLincsIter, av);
9157                 }
9158                 break;
9159             default:
9160                 gmx_incons(" Unknown type for DD statistics");
9161         }
9162     }
9163     fprintf(fplog, "\n");
9164
9165     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9166     {
9167         print_dd_load_av(fplog, cr->dd);
9168     }
9169 }
9170
9171 void dd_partition_system(FILE                *fplog,
9172                          gmx_int64_t          step,
9173                          t_commrec           *cr,
9174                          gmx_bool             bMasterState,
9175                          int                  nstglobalcomm,
9176                          t_state             *state_global,
9177                          const gmx_mtop_t    *top_global,
9178                          const t_inputrec    *ir,
9179                          t_state             *state_local,
9180                          PaddedRVecVector    *f,
9181                          t_mdatoms           *mdatoms,
9182                          gmx_localtop_t      *top_local,
9183                          t_forcerec          *fr,
9184                          gmx_vsite_t         *vsite,
9185                          gmx_constr_t         constr,
9186                          t_nrnb              *nrnb,
9187                          gmx_wallcycle_t      wcycle,
9188                          gmx_bool             bVerbose)
9189 {
9190     gmx_domdec_t      *dd;
9191     gmx_domdec_comm_t *comm;
9192     gmx_ddbox_t        ddbox = {0};
9193     t_block           *cgs_gl;
9194     gmx_int64_t        step_pcoupl;
9195     rvec               cell_ns_x0, cell_ns_x1;
9196     int                i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9197     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
9198     gmx_bool           bRedist, bSortCG, bResortAll;
9199     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9200     real               grid_density;
9201     char               sbuf[22];
9202
9203     wallcycle_start(wcycle, ewcDOMDEC);
9204
9205     dd   = cr->dd;
9206     comm = dd->comm;
9207
9208     bBoxChanged = (bMasterState || inputrecDeform(ir));
9209     if (ir->epc != epcNO)
9210     {
9211         /* With nstpcouple > 1 pressure coupling happens.
9212          * one step after calculating the pressure.
9213          * Box scaling happens at the end of the MD step,
9214          * after the DD partitioning.
9215          * We therefore have to do DLB in the first partitioning
9216          * after an MD step where P-coupling occurred.
9217          * We need to determine the last step in which p-coupling occurred.
9218          * MRS -- need to validate this for vv?
9219          */
9220         n = ir->nstpcouple;
9221         if (n == 1)
9222         {
9223             step_pcoupl = step - 1;
9224         }
9225         else
9226         {
9227             step_pcoupl = ((step - 1)/n)*n + 1;
9228         }
9229         if (step_pcoupl >= comm->partition_step)
9230         {
9231             bBoxChanged = TRUE;
9232         }
9233     }
9234
9235     bNStGlobalComm = (step % nstglobalcomm == 0);
9236
9237     if (!dlbIsOn(comm))
9238     {
9239         bDoDLB = FALSE;
9240     }
9241     else
9242     {
9243         /* Should we do dynamic load balacing this step?
9244          * Since it requires (possibly expensive) global communication,
9245          * we might want to do DLB less frequently.
9246          */
9247         if (bBoxChanged || ir->epc != epcNO)
9248         {
9249             bDoDLB = bBoxChanged;
9250         }
9251         else
9252         {
9253             bDoDLB = bNStGlobalComm;
9254         }
9255     }
9256
9257     /* Check if we have recorded loads on the nodes */
9258     if (comm->bRecordLoad && dd_load_count(comm) > 0)
9259     {
9260         bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
9261
9262         /* Print load every nstlog, first and last step to the log file */
9263         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9264                     comm->n_load_collect == 0 ||
9265                     (ir->nsteps >= 0 &&
9266                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9267
9268         /* Avoid extra communication due to verbose screen output
9269          * when nstglobalcomm is set.
9270          */
9271         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
9272             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9273         {
9274             get_load_distribution(dd, wcycle);
9275             if (DDMASTER(dd))
9276             {
9277                 if (bLogLoad)
9278                 {
9279                     dd_print_load(fplog, dd, step-1);
9280                 }
9281                 if (bVerbose)
9282                 {
9283                     dd_print_load_verbose(dd);
9284                 }
9285             }
9286             comm->n_load_collect++;
9287
9288             if (dlbIsOn(comm))
9289             {
9290                 if (DDMASTER(dd))
9291                 {
9292                     /* Add the measured cycles to the running average */
9293                     const float averageFactor        = 0.1f;
9294                     comm->cyclesPerStepDlbExpAverage =
9295                         (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
9296                         averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
9297                 }
9298                 if (comm->dlbState == edlbsOnCanTurnOff &&
9299                     dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
9300                 {
9301                     gmx_bool turnOffDlb;
9302                     if (DDMASTER(dd))
9303                     {
9304                         /* If the running averaged cycles with DLB are more
9305                          * than before we turned on DLB, turn off DLB.
9306                          * We will again run and check the cycles without DLB
9307                          * and we can then decide if to turn off DLB forever.
9308                          */
9309                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
9310                                       comm->cyclesPerStepBeforeDLB);
9311                     }
9312                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
9313                     if (turnOffDlb)
9314                     {
9315                         /* To turn off DLB, we need to redistribute the atoms */
9316                         dd_collect_state(dd, state_local, state_global);
9317                         bMasterState = TRUE;
9318                         turn_off_dlb(fplog, cr, step);
9319                     }
9320                 }
9321             }
9322             else if (bCheckWhetherToTurnDlbOn)
9323             {
9324                 gmx_bool turnOffDlbForever = FALSE;
9325                 gmx_bool turnOnDlb         = FALSE;
9326
9327                 /* Since the timings are node dependent, the master decides */
9328                 if (DDMASTER(dd))
9329                 {
9330                     /* If we recently turned off DLB, we want to check if
9331                      * performance is better without DLB. We want to do this
9332                      * ASAP to minimize the chance that external factors
9333                      * slowed down the DLB step are gone here and we
9334                      * incorrectly conclude that DLB was causing the slowdown.
9335                      * So we measure one nstlist block, no running average.
9336                      */
9337                     if (comm->haveTurnedOffDlb &&
9338                         comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
9339                         comm->cyclesPerStepDlbExpAverage)
9340                     {
9341                         /* After turning off DLB we ran nstlist steps in fewer
9342                          * cycles than with DLB. This likely means that DLB
9343                          * in not benefical, but this could be due to a one
9344                          * time unlucky fluctuation, so we require two such
9345                          * observations in close succession to turn off DLB
9346                          * forever.
9347                          */
9348                         if (comm->dlbSlowerPartitioningCount > 0 &&
9349                             dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
9350                         {
9351                             turnOffDlbForever = TRUE;
9352                         }
9353                         comm->haveTurnedOffDlb           = false;
9354                         /* Register when we last measured DLB slowdown */
9355                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
9356                     }
9357                     else
9358                     {
9359                         /* Here we check if the max PME rank load is more than 0.98
9360                          * the max PP force load. If so, PP DLB will not help,
9361                          * since we are (almost) limited by PME. Furthermore,
9362                          * DLB will cause a significant extra x/f redistribution
9363                          * cost on the PME ranks, which will then surely result
9364                          * in lower total performance.
9365                          */
9366                         if (cr->npmenodes > 0 &&
9367                             dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9368                         {
9369                             turnOnDlb = FALSE;
9370                         }
9371                         else
9372                         {
9373                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9374                         }
9375                     }
9376                 }
9377                 struct
9378                 {
9379                     gmx_bool turnOffDlbForever;
9380                     gmx_bool turnOnDlb;
9381                 }
9382                 bools {
9383                     turnOffDlbForever, turnOnDlb
9384                 };
9385                 dd_bcast(dd, sizeof(bools), &bools);
9386                 if (bools.turnOffDlbForever)
9387                 {
9388                     turn_off_dlb_forever(fplog, cr, step);
9389                 }
9390                 else if (bools.turnOnDlb)
9391                 {
9392                     turn_on_dlb(fplog, cr, step);
9393                     bDoDLB = TRUE;
9394                 }
9395             }
9396         }
9397         comm->n_load_have++;
9398     }
9399
9400     cgs_gl = &comm->cgs_gl;
9401
9402     bRedist = FALSE;
9403     if (bMasterState)
9404     {
9405         /* Clear the old state */
9406         clear_dd_indices(dd, 0, 0);
9407         ncgindex_set = 0;
9408
9409         set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9410                   TRUE, cgs_gl, as_rvec_array(state_global->x.data()), &ddbox);
9411
9412         get_cg_distribution(fplog, dd, cgs_gl,
9413                             state_global->box, &ddbox, as_rvec_array(state_global->x.data()));
9414
9415         dd_distribute_state(dd, cgs_gl,
9416                             state_global, state_local, f);
9417
9418         dd_make_local_cgs(dd, &top_local->cgs);
9419
9420         /* Ensure that we have space for the new distribution */
9421         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9422
9423         if (fr->cutoff_scheme == ecutsGROUP)
9424         {
9425             calc_cgcm(fplog, 0, dd->ncg_home,
9426                       &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9427         }
9428
9429         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9430
9431         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9432     }
9433     else if (state_local->ddp_count != dd->ddp_count)
9434     {
9435         if (state_local->ddp_count > dd->ddp_count)
9436         {
9437             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9438         }
9439
9440         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9441         {
9442             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9443         }
9444
9445         /* Clear the old state */
9446         clear_dd_indices(dd, 0, 0);
9447
9448         /* Build the new indices */
9449         rebuild_cgindex(dd, cgs_gl->index, state_local);
9450         make_dd_indices(dd, cgs_gl->index, 0);
9451         ncgindex_set = dd->ncg_home;
9452
9453         if (fr->cutoff_scheme == ecutsGROUP)
9454         {
9455             /* Redetermine the cg COMs */
9456             calc_cgcm(fplog, 0, dd->ncg_home,
9457                       &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9458         }
9459
9460         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9461
9462         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9463
9464         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9465                   TRUE, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9466
9467         bRedist = dlbIsOn(comm);
9468     }
9469     else
9470     {
9471         /* We have the full state, only redistribute the cgs */
9472
9473         /* Clear the non-home indices */
9474         clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9475         ncgindex_set = 0;
9476
9477         /* Avoid global communication for dim's without pbc and -gcom */
9478         if (!bNStGlobalComm)
9479         {
9480             copy_rvec(comm->box0, ddbox.box0    );
9481             copy_rvec(comm->box_size, ddbox.box_size);
9482         }
9483         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9484                   bNStGlobalComm, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9485
9486         bBoxChanged = TRUE;
9487         bRedist     = TRUE;
9488     }
9489     /* For dim's without pbc and -gcom */
9490     copy_rvec(ddbox.box0, comm->box0    );
9491     copy_rvec(ddbox.box_size, comm->box_size);
9492
9493     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9494                       step, wcycle);
9495
9496     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9497     {
9498         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9499     }
9500
9501     /* Check if we should sort the charge groups */
9502     bSortCG = (bMasterState || bRedist);
9503
9504     ncg_home_old = dd->ncg_home;
9505
9506     ncg_moved = 0;
9507     if (bRedist)
9508     {
9509         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9510
9511         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9512                            state_local, f, fr,
9513                            !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9514
9515         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9516     }
9517
9518     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9519                           dd, &ddbox,
9520                           &comm->cell_x0, &comm->cell_x1,
9521                           dd->ncg_home, fr->cg_cm,
9522                           cell_ns_x0, cell_ns_x1, &grid_density);
9523
9524     if (bBoxChanged)
9525     {
9526         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9527     }
9528
9529     switch (fr->cutoff_scheme)
9530     {
9531         case ecutsGROUP:
9532             copy_ivec(fr->ns->grid->n, ncells_old);
9533             grid_first(fplog, fr->ns->grid, dd, &ddbox,
9534                        state_local->box, cell_ns_x0, cell_ns_x1,
9535                        fr->rlist, grid_density);
9536             break;
9537         case ecutsVERLET:
9538             nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9539             break;
9540         default:
9541             gmx_incons("unimplemented");
9542     }
9543     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9544     copy_ivec(ddbox.tric_dir, comm->tric_dir);
9545
9546     if (bSortCG)
9547     {
9548         wallcycle_sub_start(wcycle, ewcsDD_GRID);
9549
9550         /* Sort the state on charge group position.
9551          * This enables exact restarts from this step.
9552          * It also improves performance by about 15% with larger numbers
9553          * of atoms per node.
9554          */
9555
9556         /* Fill the ns grid with the home cell,
9557          * so we can sort with the indices.
9558          */
9559         set_zones_ncg_home(dd);
9560
9561         switch (fr->cutoff_scheme)
9562         {
9563             case ecutsVERLET:
9564                 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9565
9566                 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9567                                   0,
9568                                   comm->zones.size[0].bb_x0,
9569                                   comm->zones.size[0].bb_x1,
9570                                   0, dd->ncg_home,
9571                                   comm->zones.dens_zone0,
9572                                   fr->cginfo,
9573                                   as_rvec_array(state_local->x.data()),
9574                                   ncg_moved, bRedist ? comm->moved : NULL,
9575                                   fr->nbv->grp[eintLocal].kernel_type,
9576                                   fr->nbv->grp[eintLocal].nbat);
9577
9578                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9579                 break;
9580             case ecutsGROUP:
9581                 fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
9582                           0, dd->ncg_home, fr->cg_cm);
9583
9584                 copy_ivec(fr->ns->grid->n, ncells_new);
9585                 break;
9586             default:
9587                 gmx_incons("unimplemented");
9588         }
9589
9590         bResortAll = bMasterState;
9591
9592         /* Check if we can user the old order and ns grid cell indices
9593          * of the charge groups to sort the charge groups efficiently.
9594          */
9595         if (ncells_new[XX] != ncells_old[XX] ||
9596             ncells_new[YY] != ncells_old[YY] ||
9597             ncells_new[ZZ] != ncells_old[ZZ])
9598         {
9599             bResortAll = TRUE;
9600         }
9601
9602         if (debug)
9603         {
9604             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9605                     gmx_step_str(step, sbuf), dd->ncg_home);
9606         }
9607         dd_sort_state(dd, fr->cg_cm, fr, state_local,
9608                       bResortAll ? -1 : ncg_home_old);
9609
9610         /* After sorting and compacting we set the correct size */
9611         dd_resize_state(state_local, f, dd->nat_home);
9612
9613         /* Rebuild all the indices */
9614         ga2la_clear(dd->ga2la);
9615         ncgindex_set = 0;
9616
9617         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9618     }
9619
9620     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9621
9622     /* Setup up the communication and communicate the coordinates */
9623     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9624
9625     /* Set the indices */
9626     make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9627
9628     /* Set the charge group boundaries for neighbor searching */
9629     set_cg_boundaries(&comm->zones);
9630
9631     if (fr->cutoff_scheme == ecutsVERLET)
9632     {
9633         set_zones_size(dd, state_local->box, &ddbox,
9634                        bSortCG ? 1 : 0, comm->zones.n);
9635     }
9636
9637     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9638
9639     /*
9640        write_dd_pdb("dd_home",step,"dump",top_global,cr,
9641                  -1,as_rvec_array(state_local->x.data()),state_local->box);
9642      */
9643
9644     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9645
9646     /* Extract a local topology from the global topology */
9647     for (i = 0; i < dd->ndim; i++)
9648     {
9649         np[dd->dim[i]] = comm->cd[i].np;
9650     }
9651     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9652                       comm->cellsize_min, np,
9653                       fr,
9654                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : as_rvec_array(state_local->x.data()),
9655                       vsite, top_global, top_local);
9656
9657     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9658
9659     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9660
9661     /* Set up the special atom communication */
9662     n = comm->nat[ddnatZONE];
9663     for (i = ddnatZONE+1; i < ddnatNR; i++)
9664     {
9665         switch (i)
9666         {
9667             case ddnatVSITE:
9668                 if (vsite && vsite->n_intercg_vsite)
9669                 {
9670                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
9671                 }
9672                 break;
9673             case ddnatCON:
9674                 if (dd->bInterCGcons || dd->bInterCGsettles)
9675                 {
9676                     /* Only for inter-cg constraints we need special code */
9677                     n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9678                                                   constr, ir->nProjOrder,
9679                                                   top_local->idef.il);
9680                 }
9681                 break;
9682             default:
9683                 gmx_incons("Unknown special atom type setup");
9684         }
9685         comm->nat[i] = n;
9686     }
9687
9688     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9689
9690     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9691
9692     /* Make space for the extra coordinates for virtual site
9693      * or constraint communication.
9694      */
9695     state_local->natoms = comm->nat[ddnatNR-1];
9696
9697     dd_resize_state(state_local, f, state_local->natoms);
9698
9699     if (fr->bF_NoVirSum)
9700     {
9701         if (vsite && vsite->n_intercg_vsite)
9702         {
9703             nat_f_novirsum = comm->nat[ddnatVSITE];
9704         }
9705         else
9706         {
9707             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9708             {
9709                 nat_f_novirsum = dd->nat_tot;
9710             }
9711             else
9712             {
9713                 nat_f_novirsum = dd->nat_home;
9714             }
9715         }
9716     }
9717     else
9718     {
9719         nat_f_novirsum = 0;
9720     }
9721
9722     /* Set the number of atoms required for the force calculation.
9723      * Forces need to be constrained when doing energy
9724      * minimization. For simple simulations we could avoid some
9725      * allocation, zeroing and copying, but this is probably not worth
9726      * the complications and checking.
9727      */
9728     forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9729                         dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9730
9731     /* Update atom data for mdatoms and several algorithms */
9732     mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
9733                               NULL, mdatoms, vsite, NULL);
9734
9735     if (ir->implicit_solvent)
9736     {
9737         make_local_gb(cr, fr->born, ir->gb_algorithm);
9738     }
9739
9740     if (!(cr->duty & DUTY_PME))
9741     {
9742         /* Send the charges and/or c6/sigmas to our PME only node */
9743         gmx_pme_send_parameters(cr,
9744                                 fr->ic,
9745                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9746                                 mdatoms->chargeA, mdatoms->chargeB,
9747                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
9748                                 mdatoms->sigmaA, mdatoms->sigmaB,
9749                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9750     }
9751
9752     if (constr)
9753     {
9754         set_constraints(constr, top_local, ir, mdatoms, cr);
9755     }
9756
9757     if (ir->bPull)
9758     {
9759         /* Update the local pull groups */
9760         dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
9761     }
9762
9763     if (ir->bRot)
9764     {
9765         /* Update the local rotation groups */
9766         dd_make_local_rotation_groups(dd, ir->rot);
9767     }
9768
9769     if (ir->eSwapCoords != eswapNO)
9770     {
9771         /* Update the local groups needed for ion swapping */
9772         dd_make_local_swap_groups(dd, ir->swap);
9773     }
9774
9775     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
9776     dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
9777
9778     add_dd_statistics(dd);
9779
9780     /* Make sure we only count the cycles for this DD partitioning */
9781     clear_dd_cycle_counts(dd);
9782
9783     /* Because the order of the atoms might have changed since
9784      * the last vsite construction, we need to communicate the constructing
9785      * atom coordinates again (for spreading the forces this MD step).
9786      */
9787     dd_move_x_vsites(dd, state_local->box, as_rvec_array(state_local->x.data()));
9788
9789     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9790
9791     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9792     {
9793         dd_move_x(dd, state_local->box, as_rvec_array(state_local->x.data()));
9794         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9795                      -1, as_rvec_array(state_local->x.data()), state_local->box);
9796     }
9797
9798     /* Store the partitioning step */
9799     comm->partition_step = step;
9800
9801     /* Increase the DD partitioning counter */
9802     dd->ddp_count++;
9803     /* The state currently matches this DD partitioning count, store it */
9804     state_local->ddp_count = dd->ddp_count;
9805     if (bMasterState)
9806     {
9807         /* The DD master node knows the complete cg distribution,
9808          * store the count so we can possibly skip the cg info communication.
9809          */
9810         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9811     }
9812
9813     if (comm->DD_debug > 0)
9814     {
9815         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9816         check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9817                                 "after partitioning");
9818     }
9819
9820     wallcycle_stop(wcycle, ewcDOMDEC);
9821 }