Merge branch release-4-6 into release-5-0
authorMark Abraham <mark.j.abraham@gmail.com>
Thu, 28 Aug 2014 14:22:11 +0000 (16:22 +0200)
committerRoland Schulz <roland@utk.edu>
Fri, 29 Aug 2014 04:13:44 +0000 (00:13 -0400)
Conflicts:
src/gromacs/fileio/gmxfio.c
Used new release-4-6 code calling renamed md5 functions

src/gromacs/gmxana/gmx_covar.c
Used gmx_ffclose, not ffclose, for file opened with xvgropen.

Change-Id: If6f9cb61bf7eab06dfdc61a03ad89ed38d599382

1  2 
src/gromacs/gmxana/gmx_covar.c
src/gromacs/legacyheaders/vsite.h
src/gromacs/mdlib/domdec.c
src/gromacs/mdlib/pme.c
src/gromacs/mdlib/vsite.c

index c9faa784de876a09870107c1decc63a88341790f,0000000000000000000000000000000000000000..f3e575f2288a5cbb9555b707ea9ac1fb8887ddb4
mode 100644,000000..100644
--- /dev/null
@@@ -1,660 -1,0 +1,662 @@@
-     FILE           *out;
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team.
 + * Copyright (c) 2013,2014, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include <time.h>
 +
 +#ifdef HAVE_SYS_TIME_H
 +#include <sys/time.h>
 +#endif
 +
 +#include "gromacs/commandline/pargs.h"
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "gromacs/utility/smalloc.h"
 +#include "macros.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "gromacs/fileio/futil.h"
 +#include "index.h"
 +#include "gromacs/fileio/confio.h"
 +#include "gromacs/fileio/trnio.h"
 +#include "mshift.h"
 +#include "xvgr.h"
 +#include "rmpbc.h"
 +#include "txtdump.h"
 +#include "gromacs/fileio/matio.h"
 +#include "eigio.h"
 +#include "physics.h"
 +#include "gmx_ana.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "gromacs/fileio/trxio.h"
 +
 +#include "gromacs/linearalgebra/eigensolver.h"
 +#include "gromacs/math/do_fit.h"
 +#include "gromacs/legacyheaders/gmx_fatal.h"
 +
 +int gmx_covar(int argc, char *argv[])
 +{
 +    const char     *desc[] = {
 +        "[THISMODULE] calculates and diagonalizes the (mass-weighted)",
 +        "covariance matrix.",
 +        "All structures are fitted to the structure in the structure file.",
 +        "When this is not a run input file periodicity will not be taken into",
 +        "account. When the fit and analysis groups are identical and the analysis",
 +        "is non mass-weighted, the fit will also be non mass-weighted.",
 +        "[PAR]",
 +        "The eigenvectors are written to a trajectory file ([TT]-v[tt]).",
 +        "When the same atoms are used for the fit and the covariance analysis,",
 +        "the reference structure for the fit is written first with t=-1.",
 +        "The average (or reference when [TT]-ref[tt] is used) structure is",
 +        "written with t=0, the eigenvectors",
 +        "are written as frames with the eigenvector number as timestamp.",
 +        "[PAR]",
 +        "The eigenvectors can be analyzed with [gmx-anaeig].",
 +        "[PAR]",
 +        "Option [TT]-ascii[tt] writes the whole covariance matrix to",
 +        "an ASCII file. The order of the elements is: x1x1, x1y1, x1z1, x1x2, ...",
 +        "[PAR]",
 +        "Option [TT]-xpm[tt] writes the whole covariance matrix to an [TT].xpm[tt] file.",
 +        "[PAR]",
 +        "Option [TT]-xpma[tt] writes the atomic covariance matrix to an [TT].xpm[tt] file,",
 +        "i.e. for each atom pair the sum of the xx, yy and zz covariances is",
 +        "written.",
 +        "[PAR]",
 +        "Note that the diagonalization of a matrix requires memory and time",
 +        "that will increase at least as fast as than the square of the number",
 +        "of atoms involved. It is easy to run out of memory, in which",
 +        "case this tool will probably exit with a 'Segmentation fault'. You",
 +        "should consider carefully whether a reduced set of atoms will meet",
 +        "your needs for lower costs."
 +    };
 +    static gmx_bool bFit = TRUE, bRef = FALSE, bM = FALSE, bPBC = TRUE;
 +    static int      end  = -1;
 +    t_pargs         pa[] = {
 +        { "-fit",  FALSE, etBOOL, {&bFit},
 +          "Fit to a reference structure"},
 +        { "-ref",  FALSE, etBOOL, {&bRef},
 +          "Use the deviation from the conformation in the structure file instead of from the average" },
 +        { "-mwa",  FALSE, etBOOL, {&bM},
 +          "Mass-weighted covariance analysis"},
 +        { "-last",  FALSE, etINT, {&end},
 +          "Last eigenvector to write away (-1 is till the last)" },
 +        { "-pbc",  FALSE,  etBOOL, {&bPBC},
 +          "Apply corrections for periodic boundary conditions" }
 +    };
-     fprintf(stderr, "\nWriting eigenvalues to %s\n", eigvalfile);
-     sprintf(str, "(%snm\\S2\\N)", bM ? "u " : "");
-     out = xvgropen(eigvalfile,
-                    "Eigenvalues of the covariance matrix",
-                    "Eigenvector index", str, oenv);
-     for (i = 0; (i < end); i++)
-     {
-         fprintf (out, "%10d %g\n", (int)i+1, eigenvalues[ndim-1-i]);
-     }
-     gmx_ffclose(out);
++    FILE           *out = NULL; /* initialization makes all compilers happy */
 +    t_trxstatus    *status;
 +    t_trxstatus    *trjout;
 +    t_topology      top;
 +    int             ePBC;
 +    t_atoms        *atoms;
 +    rvec           *x, *xread, *xref, *xav, *xproj;
 +    matrix          box, zerobox;
 +    real           *sqrtm, *mat, *eigenvalues, sum, trace, inv_nframes;
 +    real            t, tstart, tend, **mat2;
 +    real            xj, *w_rls = NULL;
 +    real            min, max, *axis;
 +    int             ntopatoms, step;
 +    int             natoms, nat, count, nframes0, nframes, nlevels;
 +    gmx_int64_t     ndim, i, j, k, l;
 +    int             WriteXref;
 +    const char     *fitfile, *trxfile, *ndxfile;
 +    const char     *eigvalfile, *eigvecfile, *averfile, *logfile;
 +    const char     *asciifile, *xpmfile, *xpmafile;
 +    char            str[STRLEN], *fitname, *ananame, *pcwd;
 +    int             d, dj, nfit;
 +    atom_id        *index, *ifit;
 +    gmx_bool        bDiffMass1, bDiffMass2;
 +    time_t          now;
 +    char            timebuf[STRLEN];
 +    t_rgb           rlo, rmi, rhi;
 +    real           *eigenvectors;
 +    output_env_t    oenv;
 +    gmx_rmpbc_t     gpbc = NULL;
 +
 +    t_filenm        fnm[] = {
 +        { efTRX, "-f",  NULL, ffREAD },
 +        { efTPS, NULL,  NULL, ffREAD },
 +        { efNDX, NULL,  NULL, ffOPTRD },
 +        { efXVG, NULL,  "eigenval", ffWRITE },
 +        { efTRN, "-v",  "eigenvec", ffWRITE },
 +        { efSTO, "-av", "average.pdb", ffWRITE },
 +        { efLOG, NULL,  "covar", ffWRITE },
 +        { efDAT, "-ascii", "covar", ffOPTWR },
 +        { efXPM, "-xpm", "covar", ffOPTWR },
 +        { efXPM, "-xpma", "covara", ffOPTWR }
 +    };
 +#define NFILE asize(fnm)
 +
 +    if (!parse_common_args(&argc, argv, PCA_CAN_TIME | PCA_TIME_UNIT | PCA_BE_NICE,
 +                           NFILE, fnm, asize(pa), pa, asize(desc), desc, 0, NULL, &oenv))
 +    {
 +        return 0;
 +    }
 +
 +    clear_mat(zerobox);
 +
 +    fitfile    = ftp2fn(efTPS, NFILE, fnm);
 +    trxfile    = ftp2fn(efTRX, NFILE, fnm);
 +    ndxfile    = ftp2fn_null(efNDX, NFILE, fnm);
 +    eigvalfile = ftp2fn(efXVG, NFILE, fnm);
 +    eigvecfile = ftp2fn(efTRN, NFILE, fnm);
 +    averfile   = ftp2fn(efSTO, NFILE, fnm);
 +    logfile    = ftp2fn(efLOG, NFILE, fnm);
 +    asciifile  = opt2fn_null("-ascii", NFILE, fnm);
 +    xpmfile    = opt2fn_null("-xpm", NFILE, fnm);
 +    xpmafile   = opt2fn_null("-xpma", NFILE, fnm);
 +
 +    read_tps_conf(fitfile, str, &top, &ePBC, &xref, NULL, box, TRUE);
 +    atoms = &top.atoms;
 +
 +    if (bFit)
 +    {
 +        printf("\nChoose a group for the least squares fit\n");
 +        get_index(atoms, ndxfile, 1, &nfit, &ifit, &fitname);
 +        if (nfit < 3)
 +        {
 +            gmx_fatal(FARGS, "Need >= 3 points to fit!\n");
 +        }
 +    }
 +    else
 +    {
 +        nfit = 0;
 +    }
 +    printf("\nChoose a group for the covariance analysis\n");
 +    get_index(atoms, ndxfile, 1, &natoms, &index, &ananame);
 +
 +    bDiffMass1 = FALSE;
 +    if (bFit)
 +    {
 +        snew(w_rls, atoms->nr);
 +        for (i = 0; (i < nfit); i++)
 +        {
 +            w_rls[ifit[i]] = atoms->atom[ifit[i]].m;
 +            if (i)
 +            {
 +                bDiffMass1 = bDiffMass1 || (w_rls[ifit[i]] != w_rls[ifit[i-1]]);
 +            }
 +        }
 +    }
 +    bDiffMass2 = FALSE;
 +    snew(sqrtm, natoms);
 +    for (i = 0; (i < natoms); i++)
 +    {
 +        if (bM)
 +        {
 +            sqrtm[i] = sqrt(atoms->atom[index[i]].m);
 +            if (i)
 +            {
 +                bDiffMass2 = bDiffMass2 || (sqrtm[i] != sqrtm[i-1]);
 +            }
 +        }
 +        else
 +        {
 +            sqrtm[i] = 1.0;
 +        }
 +    }
 +
 +    if (bFit && bDiffMass1 && !bDiffMass2)
 +    {
 +        bDiffMass1 = natoms != nfit;
 +        i          = 0;
 +        for (i = 0; (i < natoms) && !bDiffMass1; i++)
 +        {
 +            bDiffMass1 = index[i] != ifit[i];
 +        }
 +        if (!bDiffMass1)
 +        {
 +            fprintf(stderr, "\n"
 +                    "Note: the fit and analysis group are identical,\n"
 +                    "      while the fit is mass weighted and the analysis is not.\n"
 +                    "      Making the fit non mass weighted.\n\n");
 +            for (i = 0; (i < nfit); i++)
 +            {
 +                w_rls[ifit[i]] = 1.0;
 +            }
 +        }
 +    }
 +
 +    /* Prepare reference frame */
 +    if (bPBC)
 +    {
 +        gpbc = gmx_rmpbc_init(&top.idef, ePBC, atoms->nr);
 +        gmx_rmpbc(gpbc, atoms->nr, box, xref);
 +    }
 +    if (bFit)
 +    {
 +        reset_x(nfit, ifit, atoms->nr, NULL, xref, w_rls);
 +    }
 +
 +    snew(x, natoms);
 +    snew(xav, natoms);
 +    ndim = natoms*DIM;
 +    if (sqrt(GMX_INT64_MAX) < ndim)
 +    {
 +        gmx_fatal(FARGS, "Number of degrees of freedoms to large for matrix.\n");
 +    }
 +    snew(mat, ndim*ndim);
 +
 +    fprintf(stderr, "Calculating the average structure ...\n");
 +    nframes0 = 0;
 +    nat      = read_first_x(oenv, &status, trxfile, &t, &xread, box);
 +    if (nat != atoms->nr)
 +    {
 +        fprintf(stderr, "\nWARNING: number of atoms in tpx (%d) and trajectory (%d) do not match\n", natoms, nat);
 +    }
 +    do
 +    {
 +        nframes0++;
 +        /* calculate x: a fitted struture of the selected atoms */
 +        if (bPBC)
 +        {
 +            gmx_rmpbc(gpbc, nat, box, xread);
 +        }
 +        if (bFit)
 +        {
 +            reset_x(nfit, ifit, nat, NULL, xread, w_rls);
 +            do_fit(nat, w_rls, xref, xread);
 +        }
 +        for (i = 0; i < natoms; i++)
 +        {
 +            rvec_inc(xav[i], xread[index[i]]);
 +        }
 +    }
 +    while (read_next_x(oenv, status, &t, xread, box));
 +    close_trj(status);
 +
 +    inv_nframes = 1.0/nframes0;
 +    for (i = 0; i < natoms; i++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            xav[i][d]         *= inv_nframes;
 +            xread[index[i]][d] = xav[i][d];
 +        }
 +    }
 +    write_sto_conf_indexed(opt2fn("-av", NFILE, fnm), "Average structure",
 +                           atoms, xread, NULL, epbcNONE, zerobox, natoms, index);
 +    sfree(xread);
 +
 +    fprintf(stderr, "Constructing covariance matrix (%dx%d) ...\n", (int)ndim, (int)ndim);
 +    nframes = 0;
 +    nat     = read_first_x(oenv, &status, trxfile, &t, &xread, box);
 +    tstart  = t;
 +    do
 +    {
 +        nframes++;
 +        tend = t;
 +        /* calculate x: a (fitted) structure of the selected atoms */
 +        if (bPBC)
 +        {
 +            gmx_rmpbc(gpbc, nat, box, xread);
 +        }
 +        if (bFit)
 +        {
 +            reset_x(nfit, ifit, nat, NULL, xread, w_rls);
 +            do_fit(nat, w_rls, xref, xread);
 +        }
 +        if (bRef)
 +        {
 +            for (i = 0; i < natoms; i++)
 +            {
 +                rvec_sub(xread[index[i]], xref[index[i]], x[i]);
 +            }
 +        }
 +        else
 +        {
 +            for (i = 0; i < natoms; i++)
 +            {
 +                rvec_sub(xread[index[i]], xav[i], x[i]);
 +            }
 +        }
 +
 +        for (j = 0; j < natoms; j++)
 +        {
 +            for (dj = 0; dj < DIM; dj++)
 +            {
 +                k  = ndim*(DIM*j+dj);
 +                xj = x[j][dj];
 +                for (i = j; i < natoms; i++)
 +                {
 +                    l = k+DIM*i;
 +                    for (d = 0; d < DIM; d++)
 +                    {
 +                        mat[l+d] += x[i][d]*xj;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    while (read_next_x(oenv, status, &t, xread, box) &&
 +           (bRef || nframes < nframes0));
 +    close_trj(status);
 +    gmx_rmpbc_done(gpbc);
 +
 +    fprintf(stderr, "Read %d frames\n", nframes);
 +
 +    if (bRef)
 +    {
 +        /* copy the reference structure to the ouput array x */
 +        snew(xproj, natoms);
 +        for (i = 0; i < natoms; i++)
 +        {
 +            copy_rvec(xref[index[i]], xproj[i]);
 +        }
 +    }
 +    else
 +    {
 +        xproj = xav;
 +    }
 +
 +    /* correct the covariance matrix for the mass */
 +    inv_nframes = 1.0/nframes;
 +    for (j = 0; j < natoms; j++)
 +    {
 +        for (dj = 0; dj < DIM; dj++)
 +        {
 +            for (i = j; i < natoms; i++)
 +            {
 +                k = ndim*(DIM*j+dj)+DIM*i;
 +                for (d = 0; d < DIM; d++)
 +                {
 +                    mat[k+d] = mat[k+d]*inv_nframes*sqrtm[i]*sqrtm[j];
 +                }
 +            }
 +        }
 +    }
 +
 +    /* symmetrize the matrix */
 +    for (j = 0; j < ndim; j++)
 +    {
 +        for (i = j; i < ndim; i++)
 +        {
 +            mat[ndim*i+j] = mat[ndim*j+i];
 +        }
 +    }
 +
 +    trace = 0;
 +    for (i = 0; i < ndim; i++)
 +    {
 +        trace += mat[i*ndim+i];
 +    }
 +    fprintf(stderr, "\nTrace of the covariance matrix: %g (%snm^2)\n",
 +            trace, bM ? "u " : "");
 +
 +    if (asciifile)
 +    {
 +        out = gmx_ffopen(asciifile, "w");
 +        for (j = 0; j < ndim; j++)
 +        {
 +            for (i = 0; i < ndim; i += 3)
 +            {
 +                fprintf(out, "%g %g %g\n",
 +                        mat[ndim*j+i], mat[ndim*j+i+1], mat[ndim*j+i+2]);
 +            }
 +        }
 +        gmx_ffclose(out);
 +    }
 +
 +    if (xpmfile)
 +    {
 +        min = 0;
 +        max = 0;
 +        snew(mat2, ndim);
 +        for (j = 0; j < ndim; j++)
 +        {
 +            mat2[j] = &(mat[ndim*j]);
 +            for (i = 0; i <= j; i++)
 +            {
 +                if (mat2[j][i] < min)
 +                {
 +                    min = mat2[j][i];
 +                }
 +                if (mat2[j][j] > max)
 +                {
 +                    max = mat2[j][i];
 +                }
 +            }
 +        }
 +        snew(axis, ndim);
 +        for (i = 0; i < ndim; i++)
 +        {
 +            axis[i] = i+1;
 +        }
 +        rlo.r   = 0; rlo.g = 0; rlo.b = 1;
 +        rmi.r   = 1; rmi.g = 1; rmi.b = 1;
 +        rhi.r   = 1; rhi.g = 0; rhi.b = 0;
 +        out     = gmx_ffopen(xpmfile, "w");
 +        nlevels = 80;
 +        write_xpm3(out, 0, "Covariance", bM ? "u nm^2" : "nm^2",
 +                   "dim", "dim", ndim, ndim, axis, axis,
 +                   mat2, min, 0.0, max, rlo, rmi, rhi, &nlevels);
 +        gmx_ffclose(out);
 +        sfree(axis);
 +        sfree(mat2);
 +    }
 +
 +    if (xpmafile)
 +    {
 +        min = 0;
 +        max = 0;
 +        snew(mat2, ndim/DIM);
 +        for (i = 0; i < ndim/DIM; i++)
 +        {
 +            snew(mat2[i], ndim/DIM);
 +        }
 +        for (j = 0; j < ndim/DIM; j++)
 +        {
 +            for (i = 0; i <= j; i++)
 +            {
 +                mat2[j][i] = 0;
 +                for (d = 0; d < DIM; d++)
 +                {
 +                    mat2[j][i] += mat[ndim*(DIM*j+d)+DIM*i+d];
 +                }
 +                if (mat2[j][i] < min)
 +                {
 +                    min = mat2[j][i];
 +                }
 +                if (mat2[j][j] > max)
 +                {
 +                    max = mat2[j][i];
 +                }
 +                mat2[i][j] = mat2[j][i];
 +            }
 +        }
 +        snew(axis, ndim/DIM);
 +        for (i = 0; i < ndim/DIM; i++)
 +        {
 +            axis[i] = i+1;
 +        }
 +        rlo.r   = 0; rlo.g = 0; rlo.b = 1;
 +        rmi.r   = 1; rmi.g = 1; rmi.b = 1;
 +        rhi.r   = 1; rhi.g = 0; rhi.b = 0;
 +        out     = gmx_ffopen(xpmafile, "w");
 +        nlevels = 80;
 +        write_xpm3(out, 0, "Covariance", bM ? "u nm^2" : "nm^2",
 +                   "atom", "atom", ndim/DIM, ndim/DIM, axis, axis,
 +                   mat2, min, 0.0, max, rlo, rmi, rhi, &nlevels);
 +        gmx_ffclose(out);
 +        sfree(axis);
 +        for (i = 0; i < ndim/DIM; i++)
 +        {
 +            sfree(mat2[i]);
 +        }
 +        sfree(mat2);
 +    }
 +
 +
 +    /* call diagonalization routine */
 +
 +    snew(eigenvalues, ndim);
 +    snew(eigenvectors, ndim*ndim);
 +
 +    memcpy(eigenvectors, mat, ndim*ndim*sizeof(real));
 +    fprintf(stderr, "\nDiagonalizing ...\n");
 +    fflush(stderr);
 +    eigensolver(eigenvectors, ndim, 0, ndim, eigenvalues, mat);
 +    sfree(eigenvectors);
 +
 +    /* now write the output */
 +
 +    sum = 0;
 +    for (i = 0; i < ndim; i++)
 +    {
 +        sum += eigenvalues[i];
 +    }
 +    fprintf(stderr, "\nSum of the eigenvalues: %g (%snm^2)\n",
 +            sum, bM ? "u " : "");
 +    if (fabs(trace-sum) > 0.01*trace)
 +    {
 +        fprintf(stderr, "\nWARNING: eigenvalue sum deviates from the trace of the covariance matrix\n");
 +    }
 +
++    /* Set 'end', the maximum eigenvector and -value index used for output */
 +    if (end == -1)
 +    {
 +        if (nframes-1 < ndim)
 +        {
 +            end = nframes-1;
 +            fprintf(out, "WARNING: there are fewer frames in your trajectory than there are\n");
 +            fprintf(out, "degrees of freedom in your system. Only generating the first\n");
 +            fprintf(out, "%d out of %d eigenvectors and eigenvalues.\n", end, (int)ndim);
 +        }
 +        else
 +        {
 +            end = ndim;
 +        }
 +    }
++
++    fprintf(stderr, "\nWriting eigenvalues to %s\n", eigvalfile);
++
++    sprintf(str, "(%snm\\S2\\N)", bM ? "u " : "");
++    out = xvgropen(eigvalfile,
++                   "Eigenvalues of the covariance matrix",
++                   "Eigenvector index", str, oenv);
++    for (i = 0; (i < end); i++)
++    {
++        fprintf (out, "%10d %g\n", (int)i+1, eigenvalues[ndim-1-i]);
++    }
++    gmx_ffclose(out);
++
 +    if (bFit)
 +    {
 +        /* misuse lambda: 0/1 mass weighted analysis no/yes */
 +        if (nfit == natoms)
 +        {
 +            WriteXref = eWXR_YES;
 +            for (i = 0; i < nfit; i++)
 +            {
 +                copy_rvec(xref[ifit[i]], x[i]);
 +            }
 +        }
 +        else
 +        {
 +            WriteXref = eWXR_NO;
 +        }
 +    }
 +    else
 +    {
 +        /* misuse lambda: -1 for no fit */
 +        WriteXref = eWXR_NOFIT;
 +    }
 +
 +    write_eigenvectors(eigvecfile, natoms, mat, TRUE, 1, end,
 +                       WriteXref, x, bDiffMass1, xproj, bM, eigenvalues);
 +
 +    out = gmx_ffopen(logfile, "w");
 +
 +    time(&now);
 +    gmx_ctime_r(&now, timebuf, STRLEN);
 +    fprintf(out, "Covariance analysis log, written %s\n", timebuf);
 +
 +    fprintf(out, "Program: %s\n", argv[0]);
 +    gmx_getcwd(str, STRLEN);
 +
 +    fprintf(out, "Working directory: %s\n\n", str);
 +
 +    fprintf(out, "Read %d frames from %s (time %g to %g %s)\n", nframes, trxfile,
 +            output_env_conv_time(oenv, tstart), output_env_conv_time(oenv, tend), output_env_get_time_unit(oenv));
 +    if (bFit)
 +    {
 +        fprintf(out, "Read reference structure for fit from %s\n", fitfile);
 +    }
 +    if (ndxfile)
 +    {
 +        fprintf(out, "Read index groups from %s\n", ndxfile);
 +    }
 +    fprintf(out, "\n");
 +
 +    fprintf(out, "Analysis group is '%s' (%d atoms)\n", ananame, natoms);
 +    if (bFit)
 +    {
 +        fprintf(out, "Fit group is '%s' (%d atoms)\n", fitname, nfit);
 +    }
 +    else
 +    {
 +        fprintf(out, "No fit was used\n");
 +    }
 +    fprintf(out, "Analysis is %smass weighted\n", bDiffMass2 ? "" : "non-");
 +    if (bFit)
 +    {
 +        fprintf(out, "Fit is %smass weighted\n", bDiffMass1 ? "" : "non-");
 +    }
 +    fprintf(out, "Diagonalized the %dx%d covariance matrix\n", (int)ndim, (int)ndim);
 +    fprintf(out, "Trace of the covariance matrix before diagonalizing: %g\n",
 +            trace);
 +    fprintf(out, "Trace of the covariance matrix after diagonalizing: %g\n\n",
 +            sum);
 +
 +    fprintf(out, "Wrote %d eigenvalues to %s\n", (int)end, eigvalfile);
 +    if (WriteXref == eWXR_YES)
 +    {
 +        fprintf(out, "Wrote reference structure to %s\n", eigvecfile);
 +    }
 +    fprintf(out, "Wrote average structure to %s and %s\n", averfile, eigvecfile);
 +    fprintf(out, "Wrote eigenvectors %d to %d to %s\n", 1, end, eigvecfile);
 +
 +    gmx_ffclose(out);
 +
 +    fprintf(stderr, "Wrote the log to %s\n", logfile);
 +
 +    return 0;
 +}
index 2ccc6ed1f9988681711f79259165249ad60ca049,0000000000000000000000000000000000000000..d6b453e3b236cb1a5106118511a463331c7d4e52
mode 100644,000000..100644
--- /dev/null
@@@ -1,131 -1,0 +1,132 @@@
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team.
 + * Copyright (c) 2013,2014, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +#ifndef _vsite_h
 +#define _vsite_h
 +
 +#include <stdio.h>
 +#include "typedefs.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +typedef struct {
 +    t_ilist ilist[F_NRE];     /* vsite ilists for this thread            */
 +    rvec    fshift[SHIFTS];   /* fshift accumulation buffer              */
 +    matrix  dxdf;             /* virial dx*df accumulation buffer        */
 +} gmx_vsite_thread_t;
 +
 +typedef struct {
 +    gmx_bool            bHaveChargeGroups;    /* Do we have charge groups?               */
 +    int                 n_intercg_vsite;      /* The number of inter charge group vsites */
 +    int                 nvsite_pbc_molt;      /* The array size of vsite_pbc_molt        */
 +    int              ***vsite_pbc_molt;       /* The pbc atoms for intercg vsites        */
 +    int               **vsite_pbc_loc;        /* The local pbc atoms                     */
 +    int                *vsite_pbc_loc_nalloc; /* Sizes of vsite_pbc_loc                  */
 +    int                 nthreads;             /* Number of threads used for vsites       */
 +    gmx_vsite_thread_t *tdata;                /* Thread local vsites and work structs    */
 +    int                *th_ind;               /* Work array                              */
 +    int                 th_ind_nalloc;        /* Size of th_ind                          */
 +} gmx_vsite_t;
 +
 +void construct_vsites(gmx_vsite_t *vsite,
 +                      rvec x[],
 +                      real dt, rvec v[],
 +                      t_iparams ip[], t_ilist ilist[],
 +                      int ePBC, gmx_bool bMolPBC,
 +                      t_commrec *cr, matrix box);
 +/* Create positions of vsite atoms based on surrounding atoms
 + * for the local system.
 + * If v is passed, the velocities of the vsites will be calculated
 + * as the new positions minus the old positions divided by dt,
 + * thus v should only be passed when the coordinates have been
 + * updated with a full time step.
 + * Note that velocitis of vsites are completely irrelevant
 + * for the integration, they are only useful for analysis.
 + */
 +
 +void construct_vsites_mtop(gmx_vsite_t *vsite,
 +                           gmx_mtop_t *mtop, rvec x[]);
 +/* Create positions of vsite atoms based on surrounding atoms
 + * for the whole system.
 + * This function assumes that all molecules are whole.
 + */
 +
 +void spread_vsite_f(gmx_vsite_t *vsite,
 +                    rvec x[], rvec f[], rvec *fshift,
 +                    gmx_bool VirCorr, matrix vir,
 +                    t_nrnb *nrnb, t_idef *idef,
 +                    int ePBC, gmx_bool bMolPBC, t_graph *g, matrix box,
 +                    t_commrec *cr);
 +/* Spread the force operating on the vsite atoms on the surrounding atoms.
 + * If fshift!=NULL also update the shift forces.
 + * If VirCorr=TRUE add the virial correction for non-linear vsite constructs
 + * to vir. This correction is required when the virial is not calculated
 + * afterwards from the particle position and forces, but in a different way,
 + * as for instance for the PME mesh contribution.
 + */
 +
 +gmx_vsite_t *init_vsite(gmx_mtop_t *mtop, t_commrec *cr,
 +                        gmx_bool bSerial_NoPBC);
 +/* Initialize the virtual site struct,
 + * returns NULL when there are no virtual sites.
 + * bSerial_NoPBC is to generate a simple vsite setup to be
 + * used only serial (no MPI or thread parallelization) and without pbc;
 + * this is useful for correction vsites of the initial configuration.
 + */
 +
 +void split_vsites_over_threads(const t_ilist   *ilist,
++                               const t_iparams *ip,
 +                               const t_mdatoms *mdatoms,
 +                               gmx_bool         bLimitRange,
 +                               gmx_vsite_t     *vsite);
 +/* Divide the vsite work-load over the threads.
 + * Should be called at the end of the domain decomposition.
 + */
 +
 +void set_vsite_top(gmx_vsite_t *vsite, gmx_localtop_t *top, t_mdatoms *md,
 +                   t_commrec *cr);
 +/* Set some vsite data for runs without domain decomposition.
 + * Should be called once after init_vsite, before calling other routines.
 + */
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index ae2a4f8098b696a91f846ef2e483abcfbc01b82f,0000000000000000000000000000000000000000..401b6effdbb07ac88670d72211724ca19a907e1a
mode 100644,000000..100644
--- /dev/null
@@@ -1,9848 -1,0 +1,9849 @@@
-         split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <time.h>
 +#include <math.h>
 +#include <string.h>
 +#include <stdlib.h>
 +#include <assert.h>
 +
 +#include "typedefs.h"
 +#include "gromacs/utility/smalloc.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "vec.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "nrnb.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "constr.h"
 +#include "mdatoms.h"
 +#include "names.h"
 +#include "force.h"
 +#include "pme.h"
 +#include "mdrun.h"
 +#include "nsgrid.h"
 +#include "shellfc.h"
 +#include "mtop_util.h"
 +#include "gmx_ga2la.h"
 +#include "macros.h"
 +#include "nbnxn_search.h"
 +#include "bondf.h"
 +#include "gmx_omp_nthreads.h"
 +#include "gpu_utils.h"
 +
 +#include "gromacs/fileio/futil.h"
 +#include "gromacs/fileio/gmxfio.h"
 +#include "gromacs/fileio/pdbio.h"
 +#include "gromacs/timing/wallcycle.h"
 +#include "gromacs/utility/gmxmpi.h"
 +#include "gromacs/swap/swapcoords.h"
 +#include "gromacs/utility/qsort_threadsafe.h"
 +#include "gromacs/pulling/pull.h"
 +#include "gromacs/pulling/pull_rotation.h"
 +#include "gromacs/imd/imd.h"
 +
 +#define DDRANK(dd, rank)    (rank)
 +#define DDMASTERRANK(dd)   (dd->masterrank)
 +
 +typedef struct gmx_domdec_master
 +{
 +    /* The cell boundaries */
 +    real **cell_x;
 +    /* The global charge group division */
 +    int   *ncg;    /* Number of home charge groups for each node */
 +    int   *index;  /* Index of nnodes+1 into cg */
 +    int   *cg;     /* Global charge group index */
 +    int   *nat;    /* Number of home atoms for each node. */
 +    int   *ibuf;   /* Buffer for communication */
 +    rvec  *vbuf;   /* Buffer for state scattering and gathering */
 +} gmx_domdec_master_t;
 +
 +typedef struct
 +{
 +    /* The numbers of charge groups to send and receive for each cell
 +     * that requires communication, the last entry contains the total
 +     * number of atoms that needs to be communicated.
 +     */
 +    int  nsend[DD_MAXIZONE+2];
 +    int  nrecv[DD_MAXIZONE+2];
 +    /* The charge groups to send */
 +    int *index;
 +    int  nalloc;
 +    /* The atom range for non-in-place communication */
 +    int  cell2at0[DD_MAXIZONE];
 +    int  cell2at1[DD_MAXIZONE];
 +} gmx_domdec_ind_t;
 +
 +typedef struct
 +{
 +    int               np;       /* Number of grid pulses in this dimension */
 +    int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 +    gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 +    int               np_nalloc;
 +    gmx_bool          bInPlace; /* Can we communicate in place?            */
 +} gmx_domdec_comm_dim_t;
 +
 +typedef struct
 +{
 +    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 +    real     *cell_f;      /* State var.: cell boundaries, box relative      */
 +    real     *old_cell_f;  /* Temp. var.: old cell size                      */
 +    real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 +    real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 +    real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 +    real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 +    gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 +    real     *buf_ncd;     /* Temp. var.                                     */
 +} gmx_domdec_root_t;
 +
 +#define DD_NLOAD_MAX 9
 +
 +/* Here floats are accurate enough, since these variables
 + * only influence the load balancing, not the actual MD results.
 + */
 +typedef struct
 +{
 +    int    nload;
 +    float *load;
 +    float  sum;
 +    float  max;
 +    float  sum_m;
 +    float  cvol_min;
 +    float  mdf;
 +    float  pme;
 +    int    flags;
 +} gmx_domdec_load_t;
 +
 +typedef struct
 +{
 +    int  nsc;
 +    int  ind_gl;
 +    int  ind;
 +} gmx_cgsort_t;
 +
 +typedef struct
 +{
 +    gmx_cgsort_t *sort;
 +    gmx_cgsort_t *sort2;
 +    int           sort_nalloc;
 +    gmx_cgsort_t *sort_new;
 +    int           sort_new_nalloc;
 +    int          *ibuf;
 +    int           ibuf_nalloc;
 +} gmx_domdec_sort_t;
 +
 +typedef struct
 +{
 +    rvec *v;
 +    int   nalloc;
 +} vec_rvec_t;
 +
 +/* This enum determines the order of the coordinates.
 + * ddnatHOME and ddnatZONE should be first and second,
 + * the others can be ordered as wanted.
 + */
 +enum {
 +    ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 +};
 +
 +enum {
 +    edlbAUTO, edlbNO, edlbYES, edlbNR
 +};
 +const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 +
 +typedef struct
 +{
 +    int      dim;       /* The dimension                                          */
 +    gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 +    int      nslab;     /* The number of PME slabs in this dimension              */
 +    real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 +    int     *pp_min;    /* The minimum pp node location, size nslab               */
 +    int     *pp_max;    /* The maximum pp node location,size nslab                */
 +    int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 +} gmx_ddpme_t;
 +
 +typedef struct
 +{
 +    real min0;    /* The minimum bottom of this zone                        */
 +    real max1;    /* The maximum top of this zone                           */
 +    real min1;    /* The minimum top of this zone                           */
 +    real mch0;    /* The maximum bottom communicaton height for this zone   */
 +    real mch1;    /* The maximum top communicaton height for this zone      */
 +    real p1_0;    /* The bottom value of the first cell in this zone        */
 +    real p1_1;    /* The top value of the first cell in this zone           */
 +} gmx_ddzone_t;
 +
 +typedef struct
 +{
 +    gmx_domdec_ind_t ind;
 +    int             *ibuf;
 +    int              ibuf_nalloc;
 +    vec_rvec_t       vbuf;
 +    int              nsend;
 +    int              nat;
 +    int              nsend_zone;
 +} dd_comm_setup_work_t;
 +
 +typedef struct gmx_domdec_comm
 +{
 +    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 +     * unless stated otherwise.
 +     */
 +
 +    /* The number of decomposition dimensions for PME, 0: no PME */
 +    int         npmedecompdim;
 +    /* The number of nodes doing PME (PP/PME or only PME) */
 +    int         npmenodes;
 +    int         npmenodes_x;
 +    int         npmenodes_y;
 +    /* The communication setup including the PME only nodes */
 +    gmx_bool    bCartesianPP_PME;
 +    ivec        ntot;
 +    int         cartpmedim;
 +    int        *pmenodes;          /* size npmenodes                         */
 +    int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 +                                    * but with bCartesianPP_PME              */
 +    gmx_ddpme_t ddpme[2];
 +
 +    /* The DD particle-particle nodes only */
 +    gmx_bool bCartesianPP;
 +    int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 +
 +    /* The global charge groups */
 +    t_block cgs_gl;
 +
 +    /* Should we sort the cgs */
 +    int                nstSortCG;
 +    gmx_domdec_sort_t *sort;
 +
 +    /* Are there charge groups? */
 +    gmx_bool bCGs;
 +
 +    /* Are there bonded and multi-body interactions between charge groups? */
 +    gmx_bool bInterCGBondeds;
 +    gmx_bool bInterCGMultiBody;
 +
 +    /* Data for the optional bonded interaction atom communication range */
 +    gmx_bool  bBondComm;
 +    t_blocka *cglink;
 +    char     *bLocalCG;
 +
 +    /* The DLB option */
 +    int      eDLB;
 +    /* Are we actually using DLB? */
 +    gmx_bool bDynLoadBal;
 +
 +    /* Cell sizes for static load balancing, first index cartesian */
 +    real **slb_frac;
 +
 +    /* The width of the communicated boundaries */
 +    real     cutoff_mbody;
 +    real     cutoff;
 +    /* The minimum cell size (including triclinic correction) */
 +    rvec     cellsize_min;
 +    /* For dlb, for use with edlbAUTO */
 +    rvec     cellsize_min_dlb;
 +    /* The lower limit for the DD cell size with DLB */
 +    real     cellsize_limit;
 +    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 +    gmx_bool bVacDLBNoLimit;
 +
 +    /* With PME load balancing we set limits on DLB */
 +    gmx_bool bPMELoadBalDLBLimits;
 +    /* DLB needs to take into account that we want to allow this maximum
 +     * cut-off (for PME load balancing), this could limit cell boundaries.
 +     */
 +    real PMELoadBal_max_cutoff;
 +
 +    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 +    ivec tric_dir;
 +    /* box0 and box_size are required with dim's without pbc and -gcom */
 +    rvec box0;
 +    rvec box_size;
 +
 +    /* The cell boundaries */
 +    rvec cell_x0;
 +    rvec cell_x1;
 +
 +    /* The old location of the cell boundaries, to check cg displacements */
 +    rvec old_cell_x0;
 +    rvec old_cell_x1;
 +
 +    /* The communication setup and charge group boundaries for the zones */
 +    gmx_domdec_zones_t zones;
 +
 +    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 +     * cell boundaries of neighboring cells for dynamic load balancing.
 +     */
 +    gmx_ddzone_t zone_d1[2];
 +    gmx_ddzone_t zone_d2[2][2];
 +
 +    /* The coordinate/force communication setup and indices */
 +    gmx_domdec_comm_dim_t cd[DIM];
 +    /* The maximum number of cells to communicate with in one dimension */
 +    int                   maxpulse;
 +
 +    /* Which cg distribution is stored on the master node */
 +    int master_cg_ddp_count;
 +
 +    /* The number of cg's received from the direct neighbors */
 +    int  zone_ncg1[DD_MAXZONE];
 +
 +    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 +    int  nat[ddnatNR];
 +
 +    /* Array for signalling if atoms have moved to another domain */
 +    int  *moved;
 +    int   moved_nalloc;
 +
 +    /* Communication buffer for general use */
 +    int  *buf_int;
 +    int   nalloc_int;
 +
 +    /* Communication buffer for general use */
 +    vec_rvec_t vbuf;
 +
 +    /* Temporary storage for thread parallel communication setup */
 +    int                   nth;
 +    dd_comm_setup_work_t *dth;
 +
 +    /* Communication buffers only used with multiple grid pulses */
 +    int       *buf_int2;
 +    int        nalloc_int2;
 +    vec_rvec_t vbuf2;
 +
 +    /* Communication buffers for local redistribution */
 +    int  **cggl_flag;
 +    int    cggl_flag_nalloc[DIM*2];
 +    rvec **cgcm_state;
 +    int    cgcm_state_nalloc[DIM*2];
 +
 +    /* Cell sizes for dynamic load balancing */
 +    gmx_domdec_root_t **root;
 +    real               *cell_f_row;
 +    real                cell_f0[DIM];
 +    real                cell_f1[DIM];
 +    real                cell_f_max0[DIM];
 +    real                cell_f_min1[DIM];
 +
 +    /* Stuff for load communication */
 +    gmx_bool           bRecordLoad;
 +    gmx_domdec_load_t *load;
 +    int                nrank_gpu_shared;
 +#ifdef GMX_MPI
 +    MPI_Comm          *mpi_comm_load;
 +    MPI_Comm           mpi_comm_gpu_shared;
 +#endif
 +
 +    /* Maximum DLB scaling per load balancing step in percent */
 +    int dlb_scale_lim;
 +
 +    /* Cycle counters */
 +    float  cycl[ddCyclNr];
 +    int    cycl_n[ddCyclNr];
 +    float  cycl_max[ddCyclNr];
 +    /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 +    int    eFlop;
 +    double flop;
 +    int    flop_n;
 +    /* Have often have did we have load measurements */
 +    int    n_load_have;
 +    /* Have often have we collected the load measurements */
 +    int    n_load_collect;
 +
 +    /* Statistics */
 +    double sum_nat[ddnatNR-ddnatZONE];
 +    int    ndecomp;
 +    int    nload;
 +    double load_step;
 +    double load_sum;
 +    double load_max;
 +    ivec   load_lim;
 +    double load_mdf;
 +    double load_pme;
 +
 +    /* The last partition step */
 +    gmx_int64_t partition_step;
 +
 +    /* Debugging */
 +    int  nstDDDump;
 +    int  nstDDDumpGrid;
 +    int  DD_debug;
 +} gmx_domdec_comm_t;
 +
 +/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_CGIBS 2
 +
 +/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_FLAG_NRCG  65535
 +#define DD_FLAG_FW(d) (1<<(16+(d)*2))
 +#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 +
 +/* Zone permutation required to obtain consecutive charge groups
 + * for neighbor searching.
 + */
 +static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 +
 +/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 + * components see only j zones with that component 0.
 + */
 +
 +/* The DD zone order */
 +static const ivec dd_zo[DD_MAXZONE] =
 +{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 +
 +/* The 3D setup */
 +#define dd_z3n  8
 +#define dd_zp3n 4
 +static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 +
 +/* The 2D setup */
 +#define dd_z2n  4
 +#define dd_zp2n 2
 +static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 +
 +/* The 1D setup */
 +#define dd_z1n  2
 +#define dd_zp1n 1
 +static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 +
 +/* Factors used to avoid problems due to rounding issues */
 +#define DD_CELL_MARGIN       1.0001
 +#define DD_CELL_MARGIN2      1.00005
 +/* Factor to account for pressure scaling during nstlist steps */
 +#define DD_PRES_SCALE_MARGIN 1.02
 +
 +/* Turn on DLB when the load imbalance causes this amount of total loss.
 + * There is a bit of overhead with DLB and it's difficult to achieve
 + * a load imbalance of less than 2% with DLB.
 + */
 +#define DD_PERF_LOSS_DLB_ON  0.02
 +
 +/* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
 +#define DD_PERF_LOSS_WARN    0.05
 +
 +#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 +
 +/* Use separate MPI send and receive commands
 + * when nnodes <= GMX_DD_NNODES_SENDRECV.
 + * This saves memory (and some copying for small nnodes).
 + * For high parallelization scatter and gather calls are used.
 + */
 +#define GMX_DD_NNODES_SENDRECV 4
 +
 +
 +/*
 +   #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 +
 +   static void index2xyz(ivec nc,int ind,ivec xyz)
 +   {
 +   xyz[XX] = ind % nc[XX];
 +   xyz[YY] = (ind / nc[XX]) % nc[YY];
 +   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 +   }
 + */
 +
 +/* This order is required to minimize the coordinate communication in PME
 + * which uses decomposition in the x direction.
 + */
 +#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 +
 +static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 +{
 +    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 +    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 +    xyz[ZZ] = ind % nc[ZZ];
 +}
 +
 +static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 +{
 +    int ddindex;
 +    int ddnodeid = -1;
 +
 +    ddindex = dd_index(dd->nc, c);
 +    if (dd->comm->bCartesianPP_PME)
 +    {
 +        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 +    }
 +    else if (dd->comm->bCartesianPP)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddnodeid = ddindex;
 +    }
 +
 +    return ddnodeid;
 +}
 +
 +static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 +{
 +    return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 +}
 +
 +int ddglatnr(gmx_domdec_t *dd, int i)
 +{
 +    int atnr;
 +
 +    if (dd == NULL)
 +    {
 +        atnr = i + 1;
 +    }
 +    else
 +    {
 +        if (i >= dd->comm->nat[ddnatNR-1])
 +        {
 +            gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 +        }
 +        atnr = dd->gatindex[i] + 1;
 +    }
 +
 +    return atnr;
 +}
 +
 +t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->cgs_gl;
 +}
 +
 +static void vec_rvec_init(vec_rvec_t *v)
 +{
 +    v->nalloc = 0;
 +    v->v      = NULL;
 +}
 +
 +static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 +{
 +    if (n > v->nalloc)
 +    {
 +        v->nalloc = over_alloc_dd(n);
 +        srenew(v->v, v->nalloc);
 +    }
 +}
 +
 +void dd_store_state(gmx_domdec_t *dd, t_state *state)
 +{
 +    int i;
 +
 +    if (state->ddp_count != dd->ddp_count)
 +    {
 +        gmx_incons("The state does not the domain decomposition state");
 +    }
 +
 +    state->ncg_gl = dd->ncg_home;
 +    if (state->ncg_gl > state->cg_gl_nalloc)
 +    {
 +        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 +        srenew(state->cg_gl, state->cg_gl_nalloc);
 +    }
 +    for (i = 0; i < state->ncg_gl; i++)
 +    {
 +        state->cg_gl[i] = dd->index_gl[i];
 +    }
 +
 +    state->ddp_count_cg_gl = dd->ddp_count;
 +}
 +
 +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->zones;
 +}
 +
 +void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 +                      int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int                 izone, d, dim;
 +
 +    zones = &dd->comm->zones;
 +
 +    izone = 0;
 +    while (icg >= zones->izone[izone].cg1)
 +    {
 +        izone++;
 +    }
 +
 +    if (izone == 0)
 +    {
 +        *jcg0 = icg;
 +    }
 +    else if (izone < zones->nizone)
 +    {
 +        *jcg0 = zones->izone[izone].jcg0;
 +    }
 +    else
 +    {
 +        gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 +                  icg, izone, zones->nizone);
 +    }
 +
 +    *jcg1 = zones->izone[izone].jcg1;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim         = dd->dim[d];
 +        shift0[dim] = zones->izone[izone].shift0[dim];
 +        shift1[dim] = zones->izone[izone].shift1[dim];
 +        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 +        {
 +            /* A conservative approach, this can be optimized */
 +            shift0[dim] -= 1;
 +            shift1[dim] += 1;
 +        }
 +    }
 +}
 +
 +int dd_natoms_vsite(gmx_domdec_t *dd)
 +{
 +    return dd->comm->nat[ddnatVSITE];
 +}
 +
 +void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 +{
 +    *at_start = dd->comm->nat[ddnatCON-1];
 +    *at_end   = dd->comm->nat[ddnatCON];
 +}
 +
 +void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 +    gmx_bool               bPBC, bScrew;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    nzone   = 1;
 +    nat_tot = dd->nat_home;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (bPBC)
 +        {
 +            copy_rvec(box[dd->dim[d]], shift);
 +        }
 +        cd = &comm->cd[d];
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            ind   = &cd->ind[p];
 +            index = ind->index;
 +            n     = 0;
 +            if (!bPBC)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        copy_rvec(x[j], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* We need to shift the coordinates */
 +                        rvec_add(x[j], shift, buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* Shift x */
 +                        buf[n][XX] = x[j][XX] + shift[XX];
 +                        /* Rotate y and z.
 +                         * This operation requires a special shift force
 +                         * treatment, which is performed in calc_vir.
 +                         */
 +                        buf[n][YY] = box[YY][YY] - x[j][YY];
 +                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 +                        n++;
 +                    }
 +                }
 +            }
 +
 +            if (cd->bInPlace)
 +            {
 +                rbuf = x + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = comm->vbuf2.v;
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_rvec(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(rbuf[j], x[i]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    rvec                  *buf, *sbuf;
 +    ivec                   vis;
 +    int                    is;
 +    gmx_bool               bPBC, bScrew;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    n       = 0;
 +    nzone   = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (fshift == NULL && !bScrew)
 +        {
 +            bPBC = FALSE;
 +        }
 +        /* Determine which shift vector we need */
 +        clear_ivec(vis);
 +        vis[dd->dim[d]] = 1;
 +        is              = IVEC2IS(vis);
 +
 +        cd = &comm->cd[d];
 +        for (p = cd->np-1; p >= 0; p--)
 +        {
 +            ind      = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = f + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = comm->vbuf2.v;
 +                j    = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(f[i], sbuf[j]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        rvec_inc(f[j], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        rvec_inc(f[j], buf[n]);
 +                        /* Add this force to the shift force */
 +                        rvec_inc(fshift[is], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* Rotate the force */
 +                        f[j][XX] += buf[n][XX];
 +                        f[j][YY] -= buf[n][YY];
 +                        f[j][ZZ] -= buf[n][ZZ];
 +                        if (fshift)
 +                        {
 +                            /* Add this force to the shift force */
 +                            rvec_inc(fshift[is], buf[n]);
 +                        }
 +                        n++;
 +                    }
 +                }
 +            }
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    real                  *buf, *rbuf;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = &comm->vbuf.v[0][0];
 +
 +    nzone   = 1;
 +    nat_tot = dd->nat_home;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        cd = &comm->cd[d];
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            ind   = &cd->ind[p];
 +            index = ind->index;
 +            n     = 0;
 +            for (i = 0; i < ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for (j = at0; j < at1; j++)
 +                {
 +                    buf[n] = v[j];
 +                    n++;
 +                }
 +            }
 +
 +            if (cd->bInPlace)
 +            {
 +                rbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = &comm->vbuf2.v[0][0];
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_real(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        v[i] = rbuf[j];
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    real                  *buf, *sbuf;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = &comm->vbuf.v[0][0];
 +
 +    n       = 0;
 +    nzone   = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        cd = &comm->cd[d];
 +        for (p = cd->np-1; p >= 0; p--)
 +        {
 +            ind      = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = &comm->vbuf2.v[0][0];
 +                j    = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        sbuf[j] = v[i];
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_real(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            for (i = 0; i < ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for (j = at0; j < at1; j++)
 +                {
 +                    v[j] += buf[n];
 +                    n++;
 +                }
 +            }
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 +{
 +    fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 +            d, i, j,
 +            zone->min0, zone->max1,
 +            zone->mch0, zone->mch0,
 +            zone->p1_0, zone->p1_1);
 +}
 +
 +
 +#define DDZONECOMM_MAXZONE  5
 +#define DDZONECOMM_BUFSIZE  3
 +
 +static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 +                               int ddimind, int direction,
 +                               gmx_ddzone_t *buf_s, int n_s,
 +                               gmx_ddzone_t *buf_r, int n_r)
 +{
 +#define ZBS  DDZONECOMM_BUFSIZE
 +    rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 +    rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 +    int  i;
 +
 +    for (i = 0; i < n_s; i++)
 +    {
 +        vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
 +        vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
 +        vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
 +        vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
 +        vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
 +        vbuf_s[i*ZBS+1][2] = 0;
 +        vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
 +        vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
 +        vbuf_s[i*ZBS+2][2] = 0;
 +    }
 +
 +    dd_sendrecv_rvec(dd, ddimind, direction,
 +                     vbuf_s, n_s*ZBS,
 +                     vbuf_r, n_r*ZBS);
 +
 +    for (i = 0; i < n_r; i++)
 +    {
 +        buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
 +        buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
 +        buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
 +        buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
 +        buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
 +        buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
 +        buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
 +    }
 +
 +#undef ZBS
 +}
 +
 +static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 +                          rvec cell_ns_x0, rvec cell_ns_x1)
 +{
 +    int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
 +    gmx_ddzone_t      *zp;
 +    gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
 +    rvec               extr_s[2], extr_r[2];
 +    rvec               dh;
 +    real               dist_d, c = 0, det;
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bPBC, bUse;
 +
 +    comm = dd->comm;
 +
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        dim      = dd->dim[d];
 +        zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 +        zp->min0 = cell_ns_x0[dim];
 +        zp->max1 = cell_ns_x1[dim];
 +        zp->min1 = cell_ns_x1[dim];
 +        zp->mch0 = cell_ns_x0[dim];
 +        zp->mch1 = cell_ns_x1[dim];
 +        zp->p1_0 = cell_ns_x0[dim];
 +        zp->p1_1 = cell_ns_x1[dim];
 +    }
 +
 +    for (d = dd->ndim-2; d >= 0; d--)
 +    {
 +        dim  = dd->dim[d];
 +        bPBC = (dim < ddbox->npbcdim);
 +
 +        /* Use an rvec to store two reals */
 +        extr_s[d][0] = comm->cell_f0[d+1];
 +        extr_s[d][1] = comm->cell_f1[d+1];
 +        extr_s[d][2] = comm->cell_f1[d+1];
 +
 +        pos = 0;
 +        /* Store the extremes in the backward sending buffer,
 +         * so the get updated separately from the forward communication.
 +         */
 +        for (d1 = d; d1 < dd->ndim-1; d1++)
 +        {
 +            /* We invert the order to be able to use the same loop for buf_e */
 +            buf_s[pos].min0 = extr_s[d1][1];
 +            buf_s[pos].max1 = extr_s[d1][0];
 +            buf_s[pos].min1 = extr_s[d1][2];
 +            buf_s[pos].mch0 = 0;
 +            buf_s[pos].mch1 = 0;
 +            /* Store the cell corner of the dimension we communicate along */
 +            buf_s[pos].p1_0 = comm->cell_x0[dim];
 +            buf_s[pos].p1_1 = 0;
 +            pos++;
 +        }
 +
 +        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 +        pos++;
 +
 +        if (dd->ndim == 3 && d == 0)
 +        {
 +            buf_s[pos] = comm->zone_d2[0][1];
 +            pos++;
 +            buf_s[pos] = comm->zone_d1[0];
 +            pos++;
 +        }
 +
 +        /* We only need to communicate the extremes
 +         * in the forward direction
 +         */
 +        npulse = comm->cd[d].np;
 +        if (bPBC)
 +        {
 +            /* Take the minimum to avoid double communication */
 +            npulse_min = min(npulse, dd->nc[dim]-1-npulse);
 +        }
 +        else
 +        {
 +            /* Without PBC we should really not communicate over
 +             * the boundaries, but implementing that complicates
 +             * the communication setup and therefore we simply
 +             * do all communication, but ignore some data.
 +             */
 +            npulse_min = npulse;
 +        }
 +        for (p = 0; p < npulse_min; p++)
 +        {
 +            /* Communicate the extremes forward */
 +            bUse = (bPBC || dd->ci[dim] > 0);
 +
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             extr_s+d, dd->ndim-d-1,
 +                             extr_r+d, dd->ndim-d-1);
 +
 +            if (bUse)
 +            {
 +                for (d1 = d; d1 < dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
 +                    extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
 +                    extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
 +                }
 +            }
 +        }
 +
 +        buf_size = pos;
 +        for (p = 0; p < npulse; p++)
 +        {
 +            /* Communicate all the zone information backward */
 +            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
 +
 +            dd_sendrecv_ddzone(dd, d, dddirBackward,
 +                               buf_s, buf_size,
 +                               buf_r, buf_size);
 +
 +            clear_rvec(dh);
 +            if (p > 0)
 +            {
 +                for (d1 = d+1; d1 < dd->ndim; d1++)
 +                {
 +                    /* Determine the decrease of maximum required
 +                     * communication height along d1 due to the distance along d,
 +                     * this avoids a lot of useless atom communication.
 +                     */
 +                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 +
 +                    if (ddbox->tric_dir[dim])
 +                    {
 +                        /* c is the off-diagonal coupling between the cell planes
 +                         * along directions d and d1.
 +                         */
 +                        c = ddbox->v[dim][dd->dim[d1]][dim];
 +                    }
 +                    else
 +                    {
 +                        c = 0;
 +                    }
 +                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
 +                    if (det > 0)
 +                    {
 +                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
 +                    }
 +                    else
 +                    {
 +                        /* A negative value signals out of range */
 +                        dh[d1] = -1;
 +                    }
 +                }
 +            }
 +
 +            /* Accumulate the extremes over all pulses */
 +            for (i = 0; i < buf_size; i++)
 +            {
 +                if (p == 0)
 +                {
 +                    buf_e[i] = buf_r[i];
 +                }
 +                else
 +                {
 +                    if (bUse)
 +                    {
 +                        buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
 +                        buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
 +                        buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
 +                    }
 +
 +                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
 +                    {
 +                        d1 = 1;
 +                    }
 +                    else
 +                    {
 +                        d1 = d + 1;
 +                    }
 +                    if (bUse && dh[d1] >= 0)
 +                    {
 +                        buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
 +                        buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
 +                    }
 +                }
 +                /* Copy the received buffer to the send buffer,
 +                 * to pass the data through with the next pulse.
 +                 */
 +                buf_s[i] = buf_r[i];
 +            }
 +            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
 +                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
 +            {
 +                /* Store the extremes */
 +                pos = 0;
 +
 +                for (d1 = d; d1 < dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
 +                    extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
 +                    extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
 +                    pos++;
 +                }
 +
 +                if (d == 1 || (d == 0 && dd->ndim == 3))
 +                {
 +                    for (i = d; i < 2; i++)
 +                    {
 +                        comm->zone_d2[1-d][i] = buf_e[pos];
 +                        pos++;
 +                    }
 +                }
 +                if (d == 0)
 +                {
 +                    comm->zone_d1[1] = buf_e[pos];
 +                    pos++;
 +                }
 +            }
 +        }
 +    }
 +
 +    if (dd->ndim >= 2)
 +    {
 +        dim = dd->dim[1];
 +        for (i = 0; i < 2; i++)
 +        {
 +            if (debug)
 +            {
 +                print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
 +            }
 +            cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 +            cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        dim = dd->dim[2];
 +        for (i = 0; i < 2; i++)
 +        {
 +            for (j = 0; j < 2; j++)
 +            {
 +                if (debug)
 +                {
 +                    print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
 +                }
 +                cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 +                cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
 +            }
 +        }
 +    }
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        comm->cell_f_max0[d] = extr_s[d-1][0];
 +        comm->cell_f_min1[d] = extr_s[d-1][1];
 +        if (debug)
 +        {
 +            fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
 +                    d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
 +        }
 +    }
 +}
 +
 +static void dd_collect_cg(gmx_domdec_t *dd,
 +                          t_state      *state_local)
 +{
 +    gmx_domdec_master_t *ma = NULL;
 +    int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
 +    t_block             *cgs_gl;
 +
 +    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
 +    {
 +        /* The master has the correct distribution */
 +        return;
 +    }
 +
 +    if (state_local->ddp_count == dd->ddp_count)
 +    {
 +        ncg_home = dd->ncg_home;
 +        cg       = dd->index_gl;
 +        nat_home = dd->nat_home;
 +    }
 +    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        ncg_home = state_local->ncg_gl;
 +        cg       = state_local->cg_gl;
 +        nat_home = 0;
 +        for (i = 0; i < ncg_home; i++)
 +        {
 +            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
 +        }
 +    }
 +    else
 +    {
 +        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
 +    }
 +
 +    buf2[0] = dd->ncg_home;
 +    buf2[1] = dd->nat_home;
 +    if (DDMASTER(dd))
 +    {
 +        ma   = dd->ma;
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    /* Collect the charge group and atom counts on the master */
 +    dd_gather(dd, 2*sizeof(int), buf2, ibuf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma->index[0] = 0;
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ncg[i]     = ma->ibuf[2*i];
 +            ma->nat[i]     = ma->ibuf[2*i+1];
 +            ma->index[i+1] = ma->index[i] + ma->ncg[i];
 +
 +        }
 +        /* Make byte counts and indices */
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "Initial charge group distribution: ");
 +            for (i = 0; i < dd->nnodes; i++)
 +            {
 +                fprintf(debug, " %d", ma->ncg[i]);
 +            }
 +            fprintf(debug, "\n");
 +        }
 +    }
 +
 +    /* Collect the charge group indices on the master */
 +    dd_gatherv(dd,
 +               dd->ncg_home*sizeof(int), dd->index_gl,
 +               DDMASTER(dd) ? ma->ibuf : NULL,
 +               DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +               DDMASTER(dd) ? ma->cg : NULL);
 +
 +    dd->comm->master_cg_ddp_count = state_local->ddp_count;
 +}
 +
 +static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
 +                                    rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +    t_block             *cgs_gl;
 +
 +    ma = dd->ma;
 +
 +    if (!DDMASTER(dd))
 +    {
 +#ifdef GMX_MPI
 +        MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
 +                 dd->rank, dd->mpi_comm_all);
 +#endif
 +    }
 +    else
 +    {
 +        /* Copy the master coordinates to the global array */
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +        {
 +            for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(lv[a++], v[c]);
 +            }
 +        }
 +
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf, nalloc);
 +                }
 +#ifdef GMX_MPI
 +                MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
 +                         n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
 +#endif
 +                a = 0;
 +                for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +                {
 +                    for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(buf[a++], v[c]);
 +                    }
 +                }
 +            }
 +        }
 +        sfree(buf);
 +    }
 +}
 +
 +static void get_commbuffer_counts(gmx_domdec_t *dd,
 +                                  int **counts, int **disps)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n;
 +
 +    ma = dd->ma;
 +
 +    /* Make the rvec count and displacment arrays */
 +    *counts  = ma->ibuf;
 +    *disps   = ma->ibuf + dd->nnodes;
 +    for (n = 0; n < dd->nnodes; n++)
 +    {
 +        (*counts)[n] = ma->nat[n]*sizeof(rvec);
 +        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
 +    }
 +}
 +
 +static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
 +                                   rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                 *rcounts = NULL, *disps = NULL;
 +    int                  n, i, c, a;
 +    rvec                *buf = NULL;
 +    t_block             *cgs_gl;
 +
 +    ma = dd->ma;
 +
 +    if (DDMASTER(dd))
 +    {
 +        get_commbuffer_counts(dd, &rcounts, &disps);
 +
 +        buf = ma->vbuf;
 +    }
 +
 +    dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        a = 0;
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +            {
 +                for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(buf[a++], v[c]);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +void dd_collect_vec(gmx_domdec_t *dd,
 +                    t_state *state_local, rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    dd_collect_cg(dd, state_local);
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_collect_vec_sendrecv(dd, lv, v);
 +    }
 +    else
 +    {
 +        dd_collect_vec_gatherv(dd, lv, v);
 +    }
 +}
 +
 +
 +void dd_collect_state(gmx_domdec_t *dd,
 +                      t_state *state_local, t_state *state)
 +{
 +    int est, i, j, nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            state->lambda[i] = state_local->lambda[i];
 +        }
 +        state->fep_state = state_local->fep_state;
 +        state->veta      = state_local->veta;
 +        state->vol0      = state_local->vol0;
 +        copy_mat(state_local->box, state->box);
 +        copy_mat(state_local->boxv, state->boxv);
 +        copy_mat(state_local->svir_prev, state->svir_prev);
 +        copy_mat(state_local->fvir_prev, state->fvir_prev);
 +        copy_mat(state_local->pres_prev, state->pres_prev);
 +
 +        for (i = 0; i < state_local->ngtc; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
 +                state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
 +            }
 +            state->therm_integral[i] = state_local->therm_integral[i];
 +        }
 +        for (i = 0; i < state_local->nnhpres; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
 +                state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    dd_collect_vec(dd, state_local, state_local->x, state->x);
 +                    break;
 +                case estV:
 +                    dd_collect_vec(dd, state_local, state_local->v, state->v);
 +                    break;
 +                case estSDX:
 +                    dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
 +                    break;
 +                case estCGP:
 +                    dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_collect_state");
 +            }
 +        }
 +    }
 +}
 +
 +static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
 +{
 +    int est;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
 +    }
 +
 +    state->nalloc = over_alloc_dd(nalloc);
 +
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    srenew(state->x, state->nalloc);
 +                    break;
 +                case estV:
 +                    srenew(state->v, state->nalloc);
 +                    break;
 +                case estSDX:
 +                    srenew(state->sd_X, state->nalloc);
 +                    break;
 +                case estCGP:
 +                    srenew(state->cg_p, state->nalloc);
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No reallocation required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_realloc_state");
 +            }
 +        }
 +    }
 +
 +    if (f != NULL)
 +    {
 +        srenew(*f, state->nalloc);
 +    }
 +}
 +
 +static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
 +                               int nalloc)
 +{
 +    if (nalloc > fr->cg_nalloc)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
 +        }
 +        fr->cg_nalloc = over_alloc_dd(nalloc);
 +        srenew(fr->cginfo, fr->cg_nalloc);
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            srenew(fr->cg_cm, fr->cg_nalloc);
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
 +    {
 +        /* We don't use charge groups, we use x in state to set up
 +         * the atom communication.
 +         */
 +        dd_realloc_state(state, f, nalloc);
 +    }
 +}
 +
 +static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
 +                                       rvec *v, rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf, nalloc);
 +                }
 +                /* Use lv as a temporary buffer */
 +                a = 0;
 +                for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +                {
 +                    for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(v[c], buf[a++]);
 +                    }
 +                }
 +                if (a != ma->nat[n])
 +                {
 +                    gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
 +                              a, ma->nat[n]);
 +                }
 +
 +#ifdef GMX_MPI
 +                MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
 +                         DDRANK(dd, n), n, dd->mpi_comm_all);
 +#endif
 +            }
 +        }
 +        sfree(buf);
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +        {
 +            for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(v[c], lv[a++]);
 +            }
 +        }
 +    }
 +    else
 +    {
 +#ifdef GMX_MPI
 +        MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
 +                 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
 +#endif
 +    }
 +}
 +
 +static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
 +                                       rvec *v, rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                 *scounts = NULL, *disps = NULL;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +
 +        get_commbuffer_counts(dd, &scounts, &disps);
 +
 +        buf = ma->vbuf;
 +        a   = 0;
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +            {
 +                for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(v[c], buf[a++]);
 +                }
 +            }
 +        }
 +    }
 +
 +    dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
 +}
 +
 +static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
 +{
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_distribute_vec_sendrecv(dd, cgs, v, lv);
 +    }
 +    else
 +    {
 +        dd_distribute_vec_scatterv(dd, cgs, v, lv);
 +    }
 +}
 +
 +static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
 +{
 +    int i;
 +    dd_bcast(dd, sizeof(int), &dfhist->bEquil);
 +    dd_bcast(dd, sizeof(int), &dfhist->nlambda);
 +    dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
 +
 +    if (dfhist->nlambda > 0)
 +    {
 +        int nlam = dfhist->nlambda;
 +        dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
 +
 +        for (i = 0; i < nlam; i++)
 +        {
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
 +        }
 +    }
 +}
 +
 +static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
 +                                t_state *state, t_state *state_local,
 +                                rvec **f)
 +{
 +    int  i, j, nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            state_local->lambda[i] = state->lambda[i];
 +        }
 +        state_local->fep_state = state->fep_state;
 +        state_local->veta      = state->veta;
 +        state_local->vol0      = state->vol0;
 +        copy_mat(state->box, state_local->box);
 +        copy_mat(state->box_rel, state_local->box_rel);
 +        copy_mat(state->boxv, state_local->boxv);
 +        copy_mat(state->svir_prev, state_local->svir_prev);
 +        copy_mat(state->fvir_prev, state_local->fvir_prev);
 +        copy_df_history(&state_local->dfhist, &state->dfhist);
 +        for (i = 0; i < state_local->ngtc; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
 +                state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
 +            }
 +            state_local->therm_integral[i] = state->therm_integral[i];
 +        }
 +        for (i = 0; i < state_local->nnhpres; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
 +                state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
 +    dd_bcast(dd, sizeof(int), &state_local->fep_state);
 +    dd_bcast(dd, sizeof(real), &state_local->veta);
 +    dd_bcast(dd, sizeof(real), &state_local->vol0);
 +    dd_bcast(dd, sizeof(state_local->box), state_local->box);
 +    dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
 +    dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
 +    dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
 +    dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
 +    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
 +    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
 +    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
 +    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
 +    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
 +
 +    /* communicate df_history -- required for restarting from checkpoint */
 +    dd_distribute_dfhist(dd, &state_local->dfhist);
 +
 +    if (dd->nat_home > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local, f, dd->nat_home);
 +    }
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +                case estX:
 +                    dd_distribute_vec(dd, cgs, state->x, state_local->x);
 +                    break;
 +                case estV:
 +                    dd_distribute_vec(dd, cgs, state->v, state_local->v);
 +                    break;
 +                case estSDX:
 +                    dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
 +                    break;
 +                case estCGP:
 +                    dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* Not implemented yet */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_distribute_state");
 +            }
 +        }
 +    }
 +}
 +
 +static char dim2char(int dim)
 +{
 +    char c = '?';
 +
 +    switch (dim)
 +    {
 +        case XX: c = 'X'; break;
 +        case YY: c = 'Y'; break;
 +        case ZZ: c = 'Z'; break;
 +        default: gmx_fatal(FARGS, "Unknown dim %d", dim);
 +    }
 +
 +    return c;
 +}
 +
 +static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
 +                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
 +{
 +    rvec   grid_s[2], *grid_r = NULL, cx, r;
 +    char   fname[STRLEN], buf[22];
 +    FILE  *out;
 +    int    a, i, d, z, y, x;
 +    matrix tric;
 +    real   vol;
 +
 +    copy_rvec(dd->comm->cell_x0, grid_s[0]);
 +    copy_rvec(dd->comm->cell_x1, grid_s[1]);
 +
 +    if (DDMASTER(dd))
 +    {
 +        snew(grid_r, 2*dd->nnodes);
 +    }
 +
 +    dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            for (i = 0; i < DIM; i++)
 +            {
 +                if (d == i)
 +                {
 +                    tric[d][i] = 1;
 +                }
 +                else
 +                {
 +                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
 +                    {
 +                        tric[d][i] = box[i][d]/box[i][i];
 +                    }
 +                    else
 +                    {
 +                        tric[d][i] = 0;
 +                    }
 +                }
 +            }
 +        }
 +        sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
 +        out = gmx_fio_fopen(fname, "w");
 +        gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
 +        a = 1;
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
 +            for (d = 0; d < DIM; d++)
 +            {
 +                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
 +            }
 +            for (z = 0; z < 2; z++)
 +            {
 +                for (y = 0; y < 2; y++)
 +                {
 +                    for (x = 0; x < 2; x++)
 +                    {
 +                        cx[XX] = grid_r[i*2+x][XX];
 +                        cx[YY] = grid_r[i*2+y][YY];
 +                        cx[ZZ] = grid_r[i*2+z][ZZ];
 +                        mvmul(tric, cx, r);
 +                        gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
 +                                                 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
 +                    }
 +                }
 +            }
 +            for (d = 0; d < DIM; d++)
 +            {
 +                for (x = 0; x < 4; x++)
 +                {
 +                    switch (d)
 +                    {
 +                        case 0: y = 1 + i*8 + 2*x; break;
 +                        case 1: y = 1 + i*8 + 2*x - (x % 2); break;
 +                        case 2: y = 1 + i*8 + x; break;
 +                    }
 +                    fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
 +                }
 +            }
 +        }
 +        gmx_fio_fclose(out);
 +        sfree(grid_r);
 +    }
 +}
 +
 +void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
 +                  gmx_mtop_t *mtop, t_commrec *cr,
 +                  int natoms, rvec x[], matrix box)
 +{
 +    char          fname[STRLEN], buf[22];
 +    FILE         *out;
 +    int           i, ii, resnr, c;
 +    char         *atomname, *resname;
 +    real          b;
 +    gmx_domdec_t *dd;
 +
 +    dd = cr->dd;
 +    if (natoms == -1)
 +    {
 +        natoms = dd->comm->nat[ddnatVSITE];
 +    }
 +
 +    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
 +
 +    out = gmx_fio_fopen(fname, "w");
 +
 +    fprintf(out, "TITLE     %s\n", title);
 +    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
 +    for (i = 0; i < natoms; i++)
 +    {
 +        ii = dd->gatindex[i];
 +        gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
 +        if (i < dd->comm->nat[ddnatZONE])
 +        {
 +            c = 0;
 +            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
 +            {
 +                c++;
 +            }
 +            b = c;
 +        }
 +        else if (i < dd->comm->nat[ddnatVSITE])
 +        {
 +            b = dd->comm->zones.n;
 +        }
 +        else
 +        {
 +            b = dd->comm->zones.n + 1;
 +        }
 +        gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
 +                                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
 +    }
 +    fprintf(out, "TER\n");
 +
 +    gmx_fio_fclose(out);
 +}
 +
 +real dd_cutoff_mbody(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                di;
 +    real               r;
 +
 +    comm = dd->comm;
 +
 +    r = -1;
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm->cutoff_mbody > 0)
 +        {
 +            r = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            /* cutoff_mbody=0 means we do not have DLB */
 +            r = comm->cellsize_min[dd->dim[0]];
 +            for (di = 1; di < dd->ndim; di++)
 +            {
 +                r = min(r, comm->cellsize_min[dd->dim[di]]);
 +            }
 +            if (comm->bBondComm)
 +            {
 +                r = max(r, comm->cutoff_mbody);
 +            }
 +            else
 +            {
 +                r = min(r, comm->cutoff);
 +            }
 +        }
 +    }
 +
 +    return r;
 +}
 +
 +real dd_cutoff_twobody(gmx_domdec_t *dd)
 +{
 +    real r_mb;
 +
 +    r_mb = dd_cutoff_mbody(dd);
 +
 +    return max(dd->comm->cutoff, r_mb);
 +}
 +
 +
 +static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
 +{
 +    int nc, ntot;
 +
 +    nc   = dd->nc[dd->comm->cartpmedim];
 +    ntot = dd->comm->ntot[dd->comm->cartpmedim];
 +    copy_ivec(coord, coord_pme);
 +    coord_pme[dd->comm->cartpmedim] =
 +        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 +}
 +
 +static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
 +{
 +    /* Here we assign a PME node to communicate with this DD node
 +     * by assuming that the major index of both is x.
 +     * We add cr->npmenodes/2 to obtain an even distribution.
 +     */
 +    return (ddindex*npme + npme/2)/ndd;
 +}
 +
 +static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 +{
 +    return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
 +}
 +
 +static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
 +{
 +    return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
 +}
 +
 +static int *dd_pmenodes(t_commrec *cr)
 +{
 +    int *pmenodes;
 +    int  n, i, p0, p1;
 +
 +    snew(pmenodes, cr->npmenodes);
 +    n = 0;
 +    for (i = 0; i < cr->dd->nnodes; i++)
 +    {
 +        p0 = cr_ddindex2pmeindex(cr, i);
 +        p1 = cr_ddindex2pmeindex(cr, i+1);
 +        if (i+1 == cr->dd->nnodes || p1 > p0)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
 +            }
 +            pmenodes[n] = i + 1 + n;
 +            n++;
 +        }
 +    }
 +
 +    return pmenodes;
 +}
 +
 +static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
 +{
 +    gmx_domdec_t *dd;
 +    ivec          coords, coords_pme, nc;
 +    int           slab;
 +
 +    dd = cr->dd;
 +    /*
 +       if (dd->comm->bCartesian) {
 +       gmx_ddindex2xyz(dd->nc,ddindex,coords);
 +       dd_coords2pmecoords(dd,coords,coords_pme);
 +       copy_ivec(dd->ntot,nc);
 +       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 +       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 +
 +       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 +       } else {
 +       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 +       }
 +     */
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
 +
 +    return slab;
 +}
 +
 +static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
 +{
 +    gmx_domdec_comm_t *comm;
 +    ivec               coords;
 +    int                ddindex, nodeid = -1;
 +
 +    comm = cr->dd->comm;
 +
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddindex = dd_index(cr->dd->nc, coords);
 +        if (comm->bCartesianPP)
 +        {
 +            nodeid = comm->ddindex2simnodeid[ddindex];
 +        }
 +        else
 +        {
 +            if (comm->pmenodes)
 +            {
 +                nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 +            }
 +            else
 +            {
 +                nodeid = ddindex;
 +            }
 +        }
 +    }
 +
 +    return nodeid;
 +}
 +
 +static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    ivec               coord, coord_pme;
 +    int                i;
 +    int                pmenode = -1;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    /* This assumes a uniform x domain decomposition grid cell size */
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 +        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            /* This is a PP node */
 +            dd_cart_coord2pmecoord(dd, coord, coord_pme);
 +            MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 +        }
 +#endif
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (sim_nodeid < dd->nnodes)
 +        {
 +            pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 +        }
 +    }
 +    else
 +    {
 +        /* This assumes DD cells with identical x coordinates
 +         * are numbered sequentially.
 +         */
 +        if (dd->comm->pmenodes == NULL)
 +        {
 +            if (sim_nodeid < dd->nnodes)
 +            {
 +                /* The DD index equals the nodeid */
 +                pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 +            }
 +        }
 +        else
 +        {
 +            i = 0;
 +            while (sim_nodeid > dd->comm->pmenodes[i])
 +            {
 +                i++;
 +            }
 +            if (sim_nodeid < dd->comm->pmenodes[i])
 +            {
 +                pmenode = dd->comm->pmenodes[i];
 +            }
 +        }
 +    }
 +
 +    return pmenode;
 +}
 +
 +void get_pme_nnodes(const gmx_domdec_t *dd,
 +                    int *npmenodes_x, int *npmenodes_y)
 +{
 +    if (dd != NULL)
 +    {
 +        *npmenodes_x = dd->comm->npmenodes_x;
 +        *npmenodes_y = dd->comm->npmenodes_y;
 +    }
 +    else
 +    {
 +        *npmenodes_x = 1;
 +        *npmenodes_y = 1;
 +    }
 +}
 +
 +gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
 +{
 +    gmx_bool bPMEOnlyNode;
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
 +    }
 +    else
 +    {
 +        bPMEOnlyNode = FALSE;
 +    }
 +
 +    return bPMEOnlyNode;
 +}
 +
 +void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
 +                     int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
 +{
 +    gmx_domdec_t *dd;
 +    int           x, y, z;
 +    ivec          coord, coord_pme;
 +
 +    dd = cr->dd;
 +
 +    snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 +
 +    *nmy_ddnodes = 0;
 +    for (x = 0; x < dd->nc[XX]; x++)
 +    {
 +        for (y = 0; y < dd->nc[YY]; y++)
 +        {
 +            for (z = 0; z < dd->nc[ZZ]; z++)
 +            {
 +                if (dd->comm->bCartesianPP_PME)
 +                {
 +                    coord[XX] = x;
 +                    coord[YY] = y;
 +                    coord[ZZ] = z;
 +                    dd_cart_coord2pmecoord(dd, coord, coord_pme);
 +                    if (dd->ci[XX] == coord_pme[XX] &&
 +                        dd->ci[YY] == coord_pme[YY] &&
 +                        dd->ci[ZZ] == coord_pme[ZZ])
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
 +                    }
 +                }
 +                else
 +                {
 +                    /* The slab corresponds to the nodeid in the PME group */
 +                    if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* The last PP-only node is the peer node */
 +    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Receive coordinates from PP ranks:");
 +        for (x = 0; x < *nmy_ddnodes; x++)
 +        {
 +            fprintf(debug, " %d", (*my_ddnodes)[x]);
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static gmx_bool receive_vir_ener(t_commrec *cr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                pmenode, coords[DIM], rank;
 +    gmx_bool           bReceive;
 +
 +    bReceive = TRUE;
 +    if (cr->npmenodes < cr->dd->nnodes)
 +    {
 +        comm = cr->dd->comm;
 +        if (comm->bCartesianPP_PME)
 +        {
 +            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +#ifdef GMX_MPI
 +            MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 +            coords[comm->cartpmedim]++;
 +            if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
 +            {
 +                MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 +                if (dd_simnode2pmenode(cr, rank) == pmenode)
 +                {
 +                    /* This is not the last PP node for pmenode */
 +                    bReceive = FALSE;
 +                }
 +            }
 +#endif
 +        }
 +        else
 +        {
 +            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +            if (cr->sim_nodeid+1 < cr->nnodes &&
 +                dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
 +            {
 +                /* This is not the last PP node for pmenode */
 +                bReceive = FALSE;
 +            }
 +        }
 +    }
 +
 +    return bReceive;
 +}
 +
 +static void set_zones_ncg_home(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int                 i;
 +
 +    zones = &dd->comm->zones;
 +
 +    zones->cg_range[0] = 0;
 +    for (i = 1; i < zones->n+1; i++)
 +    {
 +        zones->cg_range[i] = dd->ncg_home;
 +    }
 +    /* zone_ncg1[0] should always be equal to ncg_home */
 +    dd->comm->zone_ncg1[0] = dd->ncg_home;
 +}
 +
 +static void rebuild_cgindex(gmx_domdec_t *dd,
 +                            const int *gcgs_index, t_state *state)
 +{
 +    int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
 +
 +    ind        = state->cg_gl;
 +    dd_cg_gl   = dd->index_gl;
 +    cgindex    = dd->cgindex;
 +    nat        = 0;
 +    cgindex[0] = nat;
 +    for (i = 0; i < state->ncg_gl; i++)
 +    {
 +        cgindex[i]  = nat;
 +        cg_gl       = ind[i];
 +        dd_cg_gl[i] = cg_gl;
 +        nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
 +    }
 +    cgindex[i] = nat;
 +
 +    dd->ncg_home = state->ncg_gl;
 +    dd->nat_home = nat;
 +
 +    set_zones_ncg_home(dd);
 +}
 +
 +static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
 +{
 +    while (cg >= cginfo_mb->cg_end)
 +    {
 +        cginfo_mb++;
 +    }
 +
 +    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
 +}
 +
 +static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
 +                          t_forcerec *fr, char *bLocalCG)
 +{
 +    cginfo_mb_t *cginfo_mb;
 +    int         *cginfo;
 +    int          cg;
 +
 +    if (fr != NULL)
 +    {
 +        cginfo_mb = fr->cginfo_mb;
 +        cginfo    = fr->cginfo;
 +
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
 +        }
 +    }
 +
 +    if (bLocalCG != NULL)
 +    {
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            bLocalCG[index_gl[cg]] = TRUE;
 +        }
 +    }
 +}
 +
 +static void make_dd_indices(gmx_domdec_t *dd,
 +                            const int *gcgs_index, int cg_start)
 +{
 +    int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
 +    int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
 +    gmx_ga2la_t *ga2la;
 +    char        *bLocalCG;
 +    gmx_bool     bCGs;
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +
 +    if (dd->nat_tot > dd->gatindex_nalloc)
 +    {
 +        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
 +        srenew(dd->gatindex, dd->gatindex_nalloc);
 +    }
 +
 +    nzone      = dd->comm->zones.n;
 +    zone2cg    = dd->comm->zones.cg_range;
 +    zone_ncg1  = dd->comm->zone_ncg1;
 +    index_gl   = dd->index_gl;
 +    gatindex   = dd->gatindex;
 +    bCGs       = dd->comm->bCGs;
 +
 +    if (zone2cg[1] != dd->ncg_home)
 +    {
 +        gmx_incons("dd->ncg_zone is not up to date");
 +    }
 +
 +    /* Make the local to global and global to local atom index */
 +    a = dd->cgindex[cg_start];
 +    for (zone = 0; zone < nzone; zone++)
 +    {
 +        if (zone == 0)
 +        {
 +            cg0 = cg_start;
 +        }
 +        else
 +        {
 +            cg0 = zone2cg[zone];
 +        }
 +        cg1    = zone2cg[zone+1];
 +        cg1_p1 = cg0 + zone_ncg1[zone];
 +
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            zone1 = zone;
 +            if (cg >= cg1_p1)
 +            {
 +                /* Signal that this cg is from more than one pulse away */
 +                zone1 += nzone;
 +            }
 +            cg_gl = index_gl[cg];
 +            if (bCGs)
 +            {
 +                for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
 +                {
 +                    gatindex[a] = a_gl;
 +                    ga2la_set(dd->ga2la, a_gl, a, zone1);
 +                    a++;
 +                }
 +            }
 +            else
 +            {
 +                gatindex[a] = cg_gl;
 +                ga2la_set(dd->ga2la, cg_gl, a, zone1);
 +                a++;
 +            }
 +        }
 +    }
 +}
 +
 +static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
 +                          const char *where)
 +{
 +    int ncg, i, ngl, nerr;
 +
 +    nerr = 0;
 +    if (bLocalCG == NULL)
 +    {
 +        return nerr;
 +    }
 +    for (i = 0; i < dd->ncg_tot; i++)
 +    {
 +        if (!bLocalCG[dd->index_gl[i]])
 +        {
 +            fprintf(stderr,
 +                    "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
 +            nerr++;
 +        }
 +    }
 +    ngl = 0;
 +    for (i = 0; i < ncg_sys; i++)
 +    {
 +        if (bLocalCG[i])
 +        {
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->ncg_tot)
 +    {
 +        fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
 +        nerr++;
 +    }
 +
 +    return nerr;
 +}
 +
 +static void check_index_consistency(gmx_domdec_t *dd,
 +                                    int natoms_sys, int ncg_sys,
 +                                    const char *where)
 +{
 +    int   nerr, ngl, i, a, cell;
 +    int  *have;
 +
 +    nerr = 0;
 +
 +    if (dd->comm->DD_debug > 1)
 +    {
 +        snew(have, natoms_sys);
 +        for (a = 0; a < dd->nat_tot; a++)
 +        {
 +            if (have[dd->gatindex[a]] > 0)
 +            {
 +                fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
 +            }
 +            else
 +            {
 +                have[dd->gatindex[a]] = a + 1;
 +            }
 +        }
 +        sfree(have);
 +    }
 +
 +    snew(have, dd->nat_tot);
 +
 +    ngl  = 0;
 +    for (i = 0; i < natoms_sys; i++)
 +    {
 +        if (ga2la_get(dd->ga2la, i, &a, &cell))
 +        {
 +            if (a >= dd->nat_tot)
 +            {
 +                fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
 +                nerr++;
 +            }
 +            else
 +            {
 +                have[a] = 1;
 +                if (dd->gatindex[a] != i)
 +                {
 +                    fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
 +                    nerr++;
 +                }
 +            }
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->nat_tot)
 +    {
 +        fprintf(stderr,
 +                "DD rank %d, %s: %d global atom indices, %d local atoms\n",
 +                dd->rank, where, ngl, dd->nat_tot);
 +    }
 +    for (a = 0; a < dd->nat_tot; a++)
 +    {
 +        if (have[a] == 0)
 +        {
 +            fprintf(stderr,
 +                    "DD rank %d, %s: local atom %d, global %d has no global index\n",
 +                    dd->rank, where, a+1, dd->gatindex[a]+1);
 +        }
 +    }
 +    sfree(have);
 +
 +    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
 +
 +    if (nerr > 0)
 +    {
 +        gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
 +                  dd->rank, where, nerr);
 +    }
 +}
 +
 +static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
 +{
 +    int   i;
 +    char *bLocalCG;
 +
 +    if (a_start == 0)
 +    {
 +        /* Clear the whole list without searching */
 +        ga2la_clear(dd->ga2la);
 +    }
 +    else
 +    {
 +        for (i = a_start; i < dd->nat_tot; i++)
 +        {
 +            ga2la_del(dd->ga2la, dd->gatindex[i]);
 +        }
 +    }
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +    if (bLocalCG)
 +    {
 +        for (i = cg_start; i < dd->ncg_tot; i++)
 +        {
 +            bLocalCG[dd->index_gl[i]] = FALSE;
 +        }
 +    }
 +
 +    dd_clear_local_vsite_indices(dd);
 +
 +    if (dd->constraints)
 +    {
 +        dd_clear_local_constraint_indices(dd);
 +    }
 +}
 +
 +/* This function should be used for moving the domain boudaries during DLB,
 + * for obtaining the minimum cell size. It checks the initially set limit
 + * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
 + * and, possibly, a longer cut-off limit set for PME load balancing.
 + */
 +static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
 +{
 +    real cellsize_min;
 +
 +    cellsize_min = comm->cellsize_min[dim];
 +
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        /* The cut-off might have changed, e.g. by PME load balacning,
 +         * from the value used to set comm->cellsize_min, so check it.
 +         */
 +        cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
 +
 +        if (comm->bPMELoadBalDLBLimits)
 +        {
 +            /* Check for the cut-off limit set by the PME load balancing */
 +            cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
 +        }
 +    }
 +
 +    return cellsize_min;
 +}
 +
 +static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
 +                            int dim_ind)
 +{
 +    real grid_jump_limit;
 +
 +    /* The distance between the boundaries of cells at distance
 +     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
 +     * and by the fact that cells should not be shifted by more than
 +     * half their size, such that cg's only shift by one cell
 +     * at redecomposition.
 +     */
 +    grid_jump_limit = comm->cellsize_limit;
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        if (comm->bPMELoadBalDLBLimits)
 +        {
 +            cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
 +        }
 +        grid_jump_limit = max(grid_jump_limit,
 +                              cutoff/comm->cd[dim_ind].np);
 +    }
 +
 +    return grid_jump_limit;
 +}
 +
 +static gmx_bool check_grid_jump(gmx_int64_t     step,
 +                                gmx_domdec_t   *dd,
 +                                real            cutoff,
 +                                gmx_ddbox_t    *ddbox,
 +                                gmx_bool        bFatal)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim;
 +    real               limit, bfac;
 +    gmx_bool           bInvalid;
 +
 +    bInvalid = FALSE;
 +
 +    comm = dd->comm;
 +
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        dim   = dd->dim[d];
 +        limit = grid_jump_limit(comm, cutoff, d);
 +        bfac  = ddbox->box_size[dim];
 +        if (ddbox->tric_dir[dim])
 +        {
 +            bfac *= ddbox->skew_fac[dim];
 +        }
 +        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
 +                                                              (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
 +        {
 +            bInvalid = TRUE;
 +
 +            if (bFatal)
 +            {
 +                char buf[22];
 +
 +                /* This error should never be triggered under normal
 +                 * circumstances, but you never know ...
 +                 */
 +                gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
 +                          gmx_step_str(step, buf),
 +                          dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +            }
 +        }
 +    }
 +
 +    return bInvalid;
 +}
 +
 +static int dd_load_count(gmx_domdec_comm_t *comm)
 +{
 +    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
 +}
 +
 +static float dd_force_load(gmx_domdec_comm_t *comm)
 +{
 +    float load;
 +
 +    if (comm->eFlop)
 +    {
 +        load = comm->flop;
 +        if (comm->eFlop > 1)
 +        {
 +            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
 +        }
 +    }
 +    else
 +    {
 +        load = comm->cycl[ddCyclF];
 +        if (comm->cycl_n[ddCyclF] > 1)
 +        {
 +            /* Subtract the maximum of the last n cycle counts
 +             * to get rid of possible high counts due to other sources,
 +             * for instance system activity, that would otherwise
 +             * affect the dynamic load balancing.
 +             */
 +            load -= comm->cycl_max[ddCyclF];
 +        }
 +
 +#ifdef GMX_MPI
 +        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
 +        {
 +            float gpu_wait, gpu_wait_sum;
 +
 +            gpu_wait = comm->cycl[ddCyclWaitGPU];
 +            if (comm->cycl_n[ddCyclF] > 1)
 +            {
 +                /* We should remove the WaitGPU time of the same MD step
 +                 * as the one with the maximum F time, since the F time
 +                 * and the wait time are not independent.
 +                 * Furthermore, the step for the max F time should be chosen
 +                 * the same on all ranks that share the same GPU.
 +                 * But to keep the code simple, we remove the average instead.
 +                 * The main reason for artificially long times at some steps
 +                 * is spurious CPU activity or MPI time, so we don't expect
 +                 * that changes in the GPU wait time matter a lot here.
 +                 */
 +                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
 +            }
 +            /* Sum the wait times over the ranks that share the same GPU */
 +            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
 +                          comm->mpi_comm_gpu_shared);
 +            /* Replace the wait time by the average over the ranks */
 +            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
 +        }
 +#endif
 +    }
 +
 +    return load;
 +}
 +
 +static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                i;
 +
 +    comm = dd->comm;
 +
 +    snew(*dim_f, dd->nc[dim]+1);
 +    (*dim_f)[0] = 0;
 +    for (i = 1; i < dd->nc[dim]; i++)
 +    {
 +        if (comm->slb_frac[dim])
 +        {
 +            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
 +        }
 +        else
 +        {
 +            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
 +        }
 +    }
 +    (*dim_f)[dd->nc[dim]] = 1;
 +}
 +
 +static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
 +{
 +    int  pmeindex, slab, nso, i;
 +    ivec xyz;
 +
 +    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
 +    {
 +        ddpme->dim = YY;
 +    }
 +    else
 +    {
 +        ddpme->dim = dimind;
 +    }
 +    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
 +
 +    ddpme->nslab = (ddpme->dim == 0 ?
 +                    dd->comm->npmenodes_x :
 +                    dd->comm->npmenodes_y);
 +
 +    if (ddpme->nslab <= 1)
 +    {
 +        return;
 +    }
 +
 +    nso = dd->comm->npmenodes/ddpme->nslab;
 +    /* Determine for each PME slab the PP location range for dimension dim */
 +    snew(ddpme->pp_min, ddpme->nslab);
 +    snew(ddpme->pp_max, ddpme->nslab);
 +    for (slab = 0; slab < ddpme->nslab; slab++)
 +    {
 +        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
 +        ddpme->pp_max[slab] = 0;
 +    }
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ddindex2xyz(dd->nc, i, xyz);
 +        /* For y only use our y/z slab.
 +         * This assumes that the PME x grid size matches the DD grid size.
 +         */
 +        if (dimind == 0 || xyz[XX] == dd->ci[XX])
 +        {
 +            pmeindex = ddindex2pmeindex(dd, i);
 +            if (dimind == 0)
 +            {
 +                slab = pmeindex/nso;
 +            }
 +            else
 +            {
 +                slab = pmeindex % ddpme->nslab;
 +            }
 +            ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
 +            ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
 +        }
 +    }
 +
 +    set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
 +}
 +
 +int dd_pme_maxshift_x(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == XX)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +int dd_pme_maxshift_y(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == YY)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
 +    {
 +        return dd->comm->ddpme[1].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
 +                             gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                nc, ns, s;
 +    int               *xmin, *xmax;
 +    real               range, pme_boundary;
 +    int                sh;
 +
 +    comm = dd->comm;
 +    nc   = dd->nc[ddpme->dim];
 +    ns   = ddpme->nslab;
 +
 +    if (!ddpme->dim_match)
 +    {
 +        /* PP decomposition is not along dim: the worst situation */
 +        sh = ns/2;
 +    }
 +    else if (ns <= 3 || (bUniform && ns == nc))
 +    {
 +        /* The optimal situation */
 +        sh = 1;
 +    }
 +    else
 +    {
 +        /* We need to check for all pme nodes which nodes they
 +         * could possibly need to communicate with.
 +         */
 +        xmin = ddpme->pp_min;
 +        xmax = ddpme->pp_max;
 +        /* Allow for atoms to be maximally 2/3 times the cut-off
 +         * out of their DD cell. This is a reasonable balance between
 +         * between performance and support for most charge-group/cut-off
 +         * combinations.
 +         */
 +        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
 +        /* Avoid extra communication when we are exactly at a boundary */
 +        range *= 0.999;
 +
 +        sh = 1;
 +        for (s = 0; s < ns; s++)
 +        {
 +            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
 +            pme_boundary = (real)s/ns;
 +            while (sh+1 < ns &&
 +                   ((s-(sh+1) >= 0 &&
 +                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
 +                    (s-(sh+1) <  0 &&
 +                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
 +            {
 +                sh++;
 +            }
 +            pme_boundary = (real)(s+1)/ns;
 +            while (sh+1 < ns &&
 +                   ((s+(sh+1) <  ns &&
 +                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
 +                    (s+(sh+1) >= ns &&
 +                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
 +            {
 +                sh++;
 +            }
 +        }
 +    }
 +
 +    ddpme->maxshift = sh;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "PME slab communication range for dim %d is %d\n",
 +                ddpme->dim, ddpme->maxshift);
 +    }
 +}
 +
 +static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int d, dim;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        if (dim < ddbox->nboundeddim &&
 +            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
 +            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
 +        {
 +            gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
 +                      dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
 +                      dd->nc[dim], dd->comm->cellsize_limit);
 +        }
 +    }
 +}
 +
 +enum {
 +    setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
 +};
 +
 +/* Set the domain boundaries. Use for static (or no) load balancing,
 + * and also for the starting state for dynamic load balancing.
 + * setmode determine if and where the boundaries are stored, use enum above.
 + * Returns the number communication pulses in npulse.
 + */
 +static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 +                                  int setmode, ivec npulse)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, j;
 +    rvec               cellsize_min;
 +    real              *cell_x, cell_dx, cellsize;
 +
 +    comm = dd->comm;
 +
 +    for (d = 0; d < DIM; d++)
 +    {
 +        cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
 +        npulse[d]       = 1;
 +        if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
 +        {
 +            /* Uniform grid */
 +            cell_dx = ddbox->box_size[d]/dd->nc[d];
 +            switch (setmode)
 +            {
 +                case setcellsizeslbMASTER:
 +                    for (j = 0; j < dd->nc[d]+1; j++)
 +                    {
 +                        dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
 +                    }
 +                    break;
 +                case setcellsizeslbLOCAL:
 +                    comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
 +                    comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
 +                    break;
 +                default:
 +                    break;
 +            }
 +            cellsize = cell_dx*ddbox->skew_fac[d];
 +            while (cellsize*npulse[d] < comm->cutoff)
 +            {
 +                npulse[d]++;
 +            }
 +            cellsize_min[d] = cellsize;
 +        }
 +        else
 +        {
 +            /* Statically load balanced grid */
 +            /* Also when we are not doing a master distribution we determine
 +             * all cell borders in a loop to obtain identical values
 +             * to the master distribution case and to determine npulse.
 +             */
 +            if (setmode == setcellsizeslbMASTER)
 +            {
 +                cell_x = dd->ma->cell_x[d];
 +            }
 +            else
 +            {
 +                snew(cell_x, dd->nc[d]+1);
 +            }
 +            cell_x[0] = ddbox->box0[d];
 +            for (j = 0; j < dd->nc[d]; j++)
 +            {
 +                cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
 +                cell_x[j+1] = cell_x[j] + cell_dx;
 +                cellsize    = cell_dx*ddbox->skew_fac[d];
 +                while (cellsize*npulse[d] < comm->cutoff &&
 +                       npulse[d] < dd->nc[d]-1)
 +                {
 +                    npulse[d]++;
 +                }
 +                cellsize_min[d] = min(cellsize_min[d], cellsize);
 +            }
 +            if (setmode == setcellsizeslbLOCAL)
 +            {
 +                comm->cell_x0[d] = cell_x[dd->ci[d]];
 +                comm->cell_x1[d] = cell_x[dd->ci[d]+1];
 +            }
 +            if (setmode != setcellsizeslbMASTER)
 +            {
 +                sfree(cell_x);
 +            }
 +        }
 +        /* The following limitation is to avoid that a cell would receive
 +         * some of its own home charge groups back over the periodic boundary.
 +         * Double charge groups cause trouble with the global indices.
 +         */
 +        if (d < ddbox->npbcdim &&
 +            dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
 +        {
 +            char error_string[STRLEN];
 +
 +            sprintf(error_string,
 +                    "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
 +                    dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
 +                    comm->cutoff,
 +                    dd->nc[d], dd->nc[d],
 +                    dd->nnodes > dd->nc[d] ? "cells" : "ranks");
 +
 +            if (setmode == setcellsizeslbLOCAL)
 +            {
 +                gmx_fatal_collective(FARGS, NULL, dd, error_string);
 +            }
 +            else
 +            {
 +                gmx_fatal(FARGS, error_string);
 +            }
 +        }
 +    }
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        copy_rvec(cellsize_min, comm->cellsize_min);
 +    }
 +
 +    for (d = 0; d < comm->npmedecompdim; d++)
 +    {
 +        set_pme_maxshift(dd, &comm->ddpme[d],
 +                         comm->slb_frac[dd->dim[d]] == NULL, ddbox,
 +                         comm->ddpme[d].slb_dim_f);
 +    }
 +}
 +
 +
 +static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
 +                                                  int d, int dim, gmx_domdec_root_t *root,
 +                                                  gmx_ddbox_t *ddbox,
 +                                                  gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ncd, i, j, nmin, nmin_old;
 +    gmx_bool           bLimLo, bLimHi;
 +    real              *cell_size;
 +    real               fac, halfway, cellsize_limit_f_i, region_size;
 +    gmx_bool           bPBC, bLastHi = FALSE;
 +    int                nrange[] = {range[0], range[1]};
 +
 +    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
 +
 +    comm = dd->comm;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
 +    }
 +
 +    /* First we need to check if the scaling does not make cells
 +     * smaller than the smallest allowed size.
 +     * We need to do this iteratively, since if a cell is too small,
 +     * it needs to be enlarged, which makes all the other cells smaller,
 +     * which could in turn make another cell smaller than allowed.
 +     */
 +    for (i = range[0]; i < range[1]; i++)
 +    {
 +        root->bCellMin[i] = FALSE;
 +    }
 +    nmin = 0;
 +    do
 +    {
 +        nmin_old = nmin;
 +        /* We need the total for normalization */
 +        fac = 0;
 +        for (i = range[0]; i < range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                fac += cell_size[i];
 +            }
 +        }
 +        fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
 +        /* Determine the cell boundaries */
 +        for (i = range[0]; i < range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                cell_size[i] *= fac;
 +                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
 +                {
 +                    cellsize_limit_f_i = 0;
 +                }
 +                else
 +                {
 +                    cellsize_limit_f_i = cellsize_limit_f;
 +                }
 +                if (cell_size[i] < cellsize_limit_f_i)
 +                {
 +                    root->bCellMin[i] = TRUE;
 +                    cell_size[i]      = cellsize_limit_f_i;
 +                    nmin++;
 +                }
 +            }
 +            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
 +        }
 +    }
 +    while (nmin > nmin_old);
 +
 +    i            = range[1]-1;
 +    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
 +    /* For this check we should not use DD_CELL_MARGIN,
 +     * but a slightly smaller factor,
 +     * since rounding could get use below the limit.
 +     */
 +    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
 +    {
 +        char buf[22];
 +        gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
 +                  gmx_step_str(step, buf),
 +                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
 +                  ncd, comm->cellsize_min[dim]);
 +    }
 +
 +    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
 +
 +    if (!bUniform)
 +    {
 +        /* Check if the boundary did not displace more than halfway
 +         * each of the cells it bounds, as this could cause problems,
 +         * especially when the differences between cell sizes are large.
 +         * If changes are applied, they will not make cells smaller
 +         * than the cut-off, as we check all the boundaries which
 +         * might be affected by a change and if the old state was ok,
 +         * the cells will at most be shrunk back to their old size.
 +         */
 +        for (i = range[0]+1; i < range[1]; i++)
 +        {
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
 +            if (root->cell_f[i] < halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for (j = i+1; j < range[1]; j++)
 +                {
 +                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
 +                    {
 +                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
 +                    }
 +                }
 +            }
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
 +            if (root->cell_f[i] > halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for (j = i-1; j >= range[0]+1; j--)
 +                {
 +                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
 +                    {
 +                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* nrange is defined as [lower, upper) range for new call to enforce_limits */
 +    /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
 +     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
 +     * for a and b nrange is used */
 +    if (d > 0)
 +    {
 +        /* Take care of the staggering of the cell boundaries */
 +        if (bUniform)
 +        {
 +            for (i = range[0]; i < range[1]; i++)
 +            {
 +                root->cell_f_max0[i] = root->cell_f[i];
 +                root->cell_f_min1[i] = root->cell_f[i+1];
 +            }
 +        }
 +        else
 +        {
 +            for (i = range[0]+1; i < range[1]; i++)
 +            {
 +                bLimLo = (root->cell_f[i] < root->bound_min[i]);
 +                bLimHi = (root->cell_f[i] > root->bound_max[i]);
 +                if (bLimLo && bLimHi)
 +                {
 +                    /* Both limits violated, try the best we can */
 +                    /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
 +                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
 +                    nrange[0]       = range[0];
 +                    nrange[1]       = i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    nrange[0] = i;
 +                    nrange[1] = range[1];
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    return;
 +                }
 +                else if (bLimLo)
 +                {
 +                    /* root->cell_f[i] = root->bound_min[i]; */
 +                    nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
 +                    bLastHi   = FALSE;
 +                }
 +                else if (bLimHi && !bLastHi)
 +                {
 +                    bLastHi = TRUE;
 +                    if (nrange[1] < range[1])   /* found a LimLo before */
 +                    {
 +                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                        nrange[0] = nrange[1];
 +                    }
 +                    root->cell_f[i] = root->bound_max[i];
 +                    nrange[1]       = i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                    nrange[0] = i;
 +                    nrange[1] = range[1];
 +                }
 +            }
 +            if (nrange[1] < range[1])   /* found last a LimLo */
 +            {
 +                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                nrange[0] = nrange[1];
 +                nrange[1] = range[1];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +            else if (nrange[0] > range[0]) /* found at least one LimHi */
 +            {
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
 +                                       int d, int dim, gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                       gmx_bool bUniform, gmx_int64_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ncd, d1, i, j, pos;
 +    real              *cell_size;
 +    real               load_aver, load_i, imbalance, change, change_max, sc;
 +    real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
 +    real               change_limit;
 +    real               relax = 0.5;
 +    gmx_bool           bPBC;
 +    int                range[] = { 0, 0 };
 +
 +    comm = dd->comm;
 +
 +    /* Convert the maximum change from the input percentage to a fraction */
 +    change_limit = comm->dlb_scale_lim*0.01;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    /* Store the original boundaries */
 +    for (i = 0; i < ncd+1; i++)
 +    {
 +        root->old_cell_f[i] = root->cell_f[i];
 +    }
 +    if (bUniform)
 +    {
 +        for (i = 0; i < ncd; i++)
 +        {
 +            cell_size[i] = 1.0/ncd;
 +        }
 +    }
 +    else if (dd_load_count(comm))
 +    {
 +        load_aver  = comm->load[d].sum_m/ncd;
 +        change_max = 0;
 +        for (i = 0; i < ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change     = -relax*imbalance;
 +            change_max = max(change_max, max(change, -change));
 +        }
 +        /* Limit the amount of scaling.
 +         * We need to use the same rescaling for all cells in one row,
 +         * otherwise the load balancing might not converge.
 +         */
 +        sc = relax;
 +        if (change_max > change_limit)
 +        {
 +            sc *= change_limit/change_max;
 +        }
 +        for (i = 0; i < ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change       = -sc*imbalance;
 +            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
 +        }
 +    }
 +
 +    cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
 +    cellsize_limit_f *= DD_CELL_MARGIN;
 +    dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
 +    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
 +    if (ddbox->tric_dir[dim])
 +    {
 +        cellsize_limit_f /= ddbox->skew_fac[dim];
 +        dist_min_f       /= ddbox->skew_fac[dim];
 +    }
 +    if (bDynamicBox && d > 0)
 +    {
 +        dist_min_f *= DD_PRES_SCALE_MARGIN;
 +    }
 +    if (d > 0 && !bUniform)
 +    {
 +        /* Make sure that the grid is not shifted too much */
 +        for (i = 1; i < ncd; i++)
 +        {
 +            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
 +            {
 +                gmx_incons("Inconsistent DD boundary staggering limits!");
 +            }
 +            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
 +            space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
 +            if (space > 0)
 +            {
 +                root->bound_min[i] += 0.5*space;
 +            }
 +            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
 +            space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
 +            if (space < 0)
 +            {
 +                root->bound_max[i] += 0.5*space;
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug,
 +                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
 +                        d, i,
 +                        root->cell_f_max0[i-1] + dist_min_f,
 +                        root->bound_min[i], root->cell_f[i], root->bound_max[i],
 +                        root->cell_f_min1[i] - dist_min_f);
 +            }
 +        }
 +    }
 +    range[1]          = ncd;
 +    root->cell_f[0]   = 0;
 +    root->cell_f[ncd] = 1;
 +    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
 +
 +
 +    /* After the checks above, the cells should obey the cut-off
 +     * restrictions, but it does not hurt to check.
 +     */
 +    for (i = 0; i < ncd; i++)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
 +                    dim, i, root->cell_f[i], root->cell_f[i+1]);
 +        }
 +
 +        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
 +            root->cell_f[i+1] - root->cell_f[i] <
 +            cellsize_limit_f/DD_CELL_MARGIN)
 +        {
 +            char buf[22];
 +            fprintf(stderr,
 +                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
 +                    gmx_step_str(step, buf), dim2char(dim), i,
 +                    (root->cell_f[i+1] - root->cell_f[i])
 +                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
 +        }
 +    }
 +
 +    pos = ncd + 1;
 +    /* Store the cell boundaries of the lower dimensions at the end */
 +    for (d1 = 0; d1 < d; d1++)
 +    {
 +        root->cell_f[pos++] = comm->cell_f0[d1];
 +        root->cell_f[pos++] = comm->cell_f1[d1];
 +    }
 +
 +    if (d < comm->npmedecompdim)
 +    {
 +        /* The master determines the maximum shift for
 +         * the coordinate communication between separate PME nodes.
 +         */
 +        set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
 +    }
 +    root->cell_f[pos++] = comm->ddpme[0].maxshift;
 +    if (d >= 1)
 +    {
 +        root->cell_f[pos++] = comm->ddpme[1].maxshift;
 +    }
 +}
 +
 +static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
 +                                             gmx_ddbox_t *ddbox, int dimind)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim;
 +
 +    comm = dd->comm;
 +
 +    /* Set the cell dimensions */
 +    dim                = dd->dim[dimind];
 +    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
 +    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
 +    if (dim >= ddbox->nboundeddim)
 +    {
 +        comm->cell_x0[dim] += ddbox->box0[dim];
 +        comm->cell_x1[dim] += ddbox->box0[dim];
 +    }
 +}
 +
 +static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                         int d, int dim, real *cell_f_row,
 +                                         gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d1, dim1, pos;
 +
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    /* Each node would only need to know two fractions,
 +     * but it is probably cheaper to broadcast the whole array.
 +     */
 +    MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
 +              0, comm->mpi_comm_load[d]);
 +#endif
 +    /* Copy the fractions for this dimension from the buffer */
 +    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
 +    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
 +    /* The whole array was communicated, so set the buffer position */
 +    pos = dd->nc[dim] + 1;
 +    for (d1 = 0; d1 <= d; d1++)
 +    {
 +        if (d1 < d)
 +        {
 +            /* Copy the cell fractions of the lower dimensions */
 +            comm->cell_f0[d1] = cell_f_row[pos++];
 +            comm->cell_f1[d1] = cell_f_row[pos++];
 +        }
 +        relative_to_absolute_cell_bounds(dd, ddbox, d1);
 +    }
 +    /* Convert the communicated shift from float to int */
 +    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    if (d >= 1)
 +    {
 +        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    }
 +}
 +
 +static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
 +                                         gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                         gmx_bool bUniform, gmx_int64_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim, d1;
 +    gmx_bool           bRowMember, bRowRoot;
 +    real              *cell_f_row;
 +
 +    comm = dd->comm;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim        = dd->dim[d];
 +        bRowMember = TRUE;
 +        bRowRoot   = TRUE;
 +        for (d1 = d; d1 < dd->ndim; d1++)
 +        {
 +            if (dd->ci[dd->dim[d1]] > 0)
 +            {
 +                if (d1 != d)
 +                {
 +                    bRowMember = FALSE;
 +                }
 +                bRowRoot = FALSE;
 +            }
 +        }
 +        if (bRowMember)
 +        {
 +            if (bRowRoot)
 +            {
 +                set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
 +                                           ddbox, bDynamicBox, bUniform, step);
 +                cell_f_row = comm->root[d]->cell_f;
 +            }
 +            else
 +            {
 +                cell_f_row = comm->cell_f_row;
 +            }
 +            distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
 +        }
 +    }
 +}
 +
 +static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int d;
 +
 +    /* This function assumes the box is static and should therefore
 +     * not be called when the box has changed since the last
 +     * call to dd_partition_system.
 +     */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        relative_to_absolute_cell_bounds(dd, ddbox, d);
 +    }
 +}
 +
 +
 +
 +static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                  gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
 +                                  gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim;
 +
 +    comm = dd->comm;
 +
 +    if (bDoDLB)
 +    {
 +        wallcycle_start(wcycle, ewcDDCOMMBOUND);
 +        set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
 +        wallcycle_stop(wcycle, ewcDDCOMMBOUND);
 +    }
 +    else if (bDynamicBox)
 +    {
 +        set_dd_cell_sizes_dlb_nochange(dd, ddbox);
 +    }
 +
 +    /* Set the dimensions for which no DD is used */
 +    for (dim = 0; dim < DIM; dim++)
 +    {
 +        if (dd->nc[dim] == 1)
 +        {
 +            comm->cell_x0[dim] = 0;
 +            comm->cell_x1[dim] = ddbox->box_size[dim];
 +            if (dim >= ddbox->nboundeddim)
 +            {
 +                comm->cell_x0[dim] += ddbox->box0[dim];
 +                comm->cell_x1[dim] += ddbox->box0[dim];
 +            }
 +        }
 +    }
 +}
 +
 +static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
 +{
 +    int                    d, np, i;
 +    gmx_domdec_comm_dim_t *cd;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        cd = &dd->comm->cd[d];
 +        np = npulse[dd->dim[d]];
 +        if (np > cd->np_nalloc)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
 +                        dim2char(dd->dim[d]), np);
 +            }
 +            if (DDMASTER(dd) && cd->np_nalloc > 0)
 +            {
 +                fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
 +            }
 +            srenew(cd->ind, np);
 +            for (i = cd->np_nalloc; i < np; i++)
 +            {
 +                cd->ind[i].index  = NULL;
 +                cd->ind[i].nalloc = 0;
 +            }
 +            cd->np_nalloc = np;
 +        }
 +        cd->np = np;
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes(gmx_domdec_t *dd,
 +                              gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                              gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
 +                              gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d;
 +    ivec               npulse;
 +
 +    comm = dd->comm;
 +
 +    /* Copy the old cell boundaries for the cg displacement check */
 +    copy_rvec(comm->cell_x0, comm->old_cell_x0);
 +    copy_rvec(comm->cell_x1, comm->old_cell_x1);
 +
 +    if (comm->bDynLoadBal)
 +    {
 +        if (DDMASTER(dd))
 +        {
 +            check_box_size(dd, ddbox);
 +        }
 +        set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
 +    }
 +    else
 +    {
 +        set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
 +        realloc_comm_ind(dd, npulse);
 +    }
 +
 +    if (debug)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
 +                    d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
 +        }
 +    }
 +}
 +
 +static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,
 +                                  rvec cell_ns_x0, rvec cell_ns_x1,
 +                                  gmx_int64_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim_ind, dim;
 +
 +    comm = dd->comm;
 +
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Without PBC we don't have restrictions on the outer cells */
 +        if (!(dim >= ddbox->npbcdim &&
 +              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
 +            comm->bDynLoadBal &&
 +            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
 +            comm->cellsize_min[dim])
 +        {
 +            char buf[22];
 +            gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
 +                      gmx_step_str(step, buf), dim2char(dim),
 +                      comm->cell_x1[dim] - comm->cell_x0[dim],
 +                      ddbox->skew_fac[dim],
 +                      dd->comm->cellsize_min[dim],
 +                      dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +        }
 +    }
 +
 +    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 +    {
 +        /* Communicate the boundaries and update cell_ns_x0/1 */
 +        dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
 +        if (dd->bGridJump && dd->ndim > 1)
 +        {
 +            check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
 +        }
 +    }
 +}
 +
 +static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
 +{
 +    if (YY < npbcdim)
 +    {
 +        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
 +    }
 +    else
 +    {
 +        tcm[YY][XX] = 0;
 +    }
 +    if (ZZ < npbcdim)
 +    {
 +        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
 +        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
 +    }
 +    else
 +    {
 +        tcm[ZZ][XX] = 0;
 +        tcm[ZZ][YY] = 0;
 +    }
 +}
 +
 +static void check_screw_box(matrix box)
 +{
 +    /* Mathematical limitation */
 +    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
 +    {
 +        gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
 +    }
 +
 +    /* Limitation due to the asymmetry of the eighth shell method */
 +    if (box[ZZ][YY] != 0)
 +    {
 +        gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
 +    }
 +}
 +
 +static void distribute_cg(FILE *fplog, gmx_int64_t step,
 +                          matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
 +                          gmx_domdec_t *dd)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                **tmp_ind = NULL, *tmp_nalloc = NULL;
 +    int                  i, icg, j, k, k0, k1, d, npbcdim;
 +    matrix               tcm;
 +    rvec                 box_size, cg_cm;
 +    ivec                 ind;
 +    real                 nrcg, inv_ncg, pos_d;
 +    atom_id             *cgindex;
 +    gmx_bool             bUnbounded, bScrew;
 +
 +    ma = dd->ma;
 +
 +    if (tmp_ind == NULL)
 +    {
 +        snew(tmp_nalloc, dd->nnodes);
 +        snew(tmp_ind, dd->nnodes);
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
 +            snew(tmp_ind[i], tmp_nalloc[i]);
 +        }
 +    }
 +
 +    /* Clear the count */
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ma->ncg[i] = 0;
 +        ma->nat[i] = 0;
 +    }
 +
 +    make_tric_corr_matrix(dd->npbcdim, box, tcm);
 +
 +    cgindex = cgs->index;
 +
 +    /* Compute the center of geometry for all charge groups */
 +    for (icg = 0; icg < cgs->nr; icg++)
 +    {
 +        k0      = cgindex[icg];
 +        k1      = cgindex[icg+1];
 +        nrcg    = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(pos[k0], cg_cm);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +
 +            clear_rvec(cg_cm);
 +            for (k = k0; (k < k1); k++)
 +            {
 +                rvec_inc(cg_cm, pos[k]);
 +            }
 +            for (d = 0; (d < DIM); d++)
 +            {
 +                cg_cm[d] *= inv_ncg;
 +            }
 +        }
 +        /* Put the charge group in the box and determine the cell index */
 +        for (d = DIM-1; d >= 0; d--)
 +        {
 +            pos_d = cg_cm[d];
 +            if (d < dd->npbcdim)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                if (tric_dir[d] && dd->nc[d] > 1)
 +                {
 +                    /* Use triclinic coordintates for this dimension */
 +                    for (j = d+1; j < DIM; j++)
 +                    {
 +                        pos_d += cg_cm[j]*tcm[j][d];
 +                    }
 +                }
 +                while (pos_d >= box[d][d])
 +                {
 +                    pos_d -= box[d][d];
 +                    rvec_dec(cg_cm, box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_dec(pos[k], box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +                while (pos_d < 0)
 +                {
 +                    pos_d += box[d][d];
 +                    rvec_inc(cg_cm, box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_inc(pos[k], box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +            }
 +            /* This could be done more efficiently */
 +            ind[d] = 0;
 +            while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
 +            {
 +                ind[d]++;
 +            }
 +        }
 +        i = dd_index(dd->nc, ind);
 +        if (ma->ncg[i] == tmp_nalloc[i])
 +        {
 +            tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
 +            srenew(tmp_ind[i], tmp_nalloc[i]);
 +        }
 +        tmp_ind[i][ma->ncg[i]] = icg;
 +        ma->ncg[i]++;
 +        ma->nat[i] += cgindex[icg+1] - cgindex[icg];
 +    }
 +
 +    k1 = 0;
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ma->index[i] = k1;
 +        for (k = 0; k < ma->ncg[i]; k++)
 +        {
 +            ma->cg[k1++] = tmp_ind[i][k];
 +        }
 +    }
 +    ma->index[dd->nnodes] = k1;
 +
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        sfree(tmp_ind[i]);
 +    }
 +    sfree(tmp_ind);
 +    sfree(tmp_nalloc);
 +
 +    if (fplog)
 +    {
 +        char buf[22];
 +        fprintf(fplog, "Charge group distribution at step %s:",
 +                gmx_step_str(step, buf));
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            fprintf(fplog, " %d", ma->ncg[i]);
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +}
 +
 +static void get_cg_distribution(FILE *fplog, gmx_int64_t step, gmx_domdec_t *dd,
 +                                t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
 +                                rvec pos[])
 +{
 +    gmx_domdec_master_t *ma = NULL;
 +    ivec                 npulse;
 +    int                  i, cg_gl;
 +    int                 *ibuf, buf2[2] = { 0, 0 };
 +    gmx_bool             bMaster = DDMASTER(dd);
 +
 +    if (bMaster)
 +    {
 +        ma = dd->ma;
 +
 +        if (dd->bScrewPBC)
 +        {
 +            check_screw_box(box);
 +        }
 +
 +        set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
 +
 +        distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[2*i]   = ma->ncg[i];
 +            ma->ibuf[2*i+1] = ma->nat[i];
 +        }
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
 +
 +    dd->ncg_home = buf2[0];
 +    dd->nat_home = buf2[1];
 +    dd->ncg_tot  = dd->ncg_home;
 +    dd->nat_tot  = dd->nat_home;
 +    if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
 +    {
 +        dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(dd->index_gl, dd->cg_nalloc);
 +        srenew(dd->cgindex, dd->cg_nalloc+1);
 +    }
 +    if (bMaster)
 +    {
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +    }
 +
 +    dd_scatterv(dd,
 +                DDMASTER(dd) ? ma->ibuf : NULL,
 +                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +                DDMASTER(dd) ? ma->cg : NULL,
 +                dd->ncg_home*sizeof(int), dd->index_gl);
 +
 +    /* Determine the home charge group sizes */
 +    dd->cgindex[0] = 0;
 +    for (i = 0; i < dd->ncg_home; i++)
 +    {
 +        cg_gl            = dd->index_gl[i];
 +        dd->cgindex[i+1] =
 +            dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Home charge groups:\n");
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            fprintf(debug, " %d", dd->index_gl[i]);
 +            if (i % 10 == 9)
 +            {
 +                fprintf(debug, "\n");
 +            }
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static int compact_and_copy_vec_at(int ncg, int *move,
 +                                   int *cgindex,
 +                                   int nvec, int vec,
 +                                   rvec *src, gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m, icg, i, i0, i1, nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +
 +    home_pos = 0;
 +
 +    for (m = 0; m < DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +
 +    i0 = 0;
 +    for (icg = 0; icg < ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m  = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                for (i = i0; i < i1; i++)
 +                {
 +                    copy_rvec(src[i], src[home_pos++]);
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Copy to the communication buffer */
 +            nrcg        = i1 - i0;
 +            pos_vec[m] += 1 + vec*nrcg;
 +            for (i = i0; i < i1; i++)
 +            {
 +                copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
 +            }
 +            pos_vec[m] += (nvec - vec - 1)*nrcg;
 +        }
 +        if (!bCompact)
 +        {
 +            home_pos += i1 - i0;
 +        }
 +        i0 = i1;
 +    }
 +
 +    return home_pos;
 +}
 +
 +static int compact_and_copy_vec_cg(int ncg, int *move,
 +                                   int *cgindex,
 +                                   int nvec, rvec *src, gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m, icg, i0, i1, nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +
 +    home_pos = 0;
 +
 +    for (m = 0; m < DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +
 +    i0 = 0;
 +    for (icg = 0; icg < ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m  = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                copy_rvec(src[icg], src[home_pos++]);
 +            }
 +        }
 +        else
 +        {
 +            nrcg = i1 - i0;
 +            /* Copy to the communication buffer */
 +            copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
 +            pos_vec[m] += 1 + nrcg*nvec;
 +        }
 +        i0 = i1;
 +    }
 +    if (!bCompact)
 +    {
 +        home_pos = ncg;
 +    }
 +
 +    return home_pos;
 +}
 +
 +static int compact_ind(int ncg, int *move,
 +                       int *index_gl, int *cgindex,
 +                       int *gatindex,
 +                       gmx_ga2la_t ga2la, char *bLocalCG,
 +                       int *cginfo)
 +{
 +    int cg, nat, a0, a1, a, a_gl;
 +    int home_pos;
 +
 +    home_pos = 0;
 +    nat      = 0;
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        a0 = cgindex[cg];
 +        a1 = cgindex[cg+1];
 +        if (move[cg] == -1)
 +        {
 +            /* Compact the home arrays in place.
 +             * Anything that can be done here avoids access to global arrays.
 +             */
 +            cgindex[home_pos] = nat;
 +            for (a = a0; a < a1; a++)
 +            {
 +                a_gl          = gatindex[a];
 +                gatindex[nat] = a_gl;
 +                /* The cell number stays 0, so we don't need to set it */
 +                ga2la_change_la(ga2la, a_gl, nat);
 +                nat++;
 +            }
 +            index_gl[home_pos] = index_gl[cg];
 +            cginfo[home_pos]   = cginfo[cg];
 +            /* The charge group remains local, so bLocalCG does not change */
 +            home_pos++;
 +        }
 +        else
 +        {
 +            /* Clear the global indices */
 +            for (a = a0; a < a1; a++)
 +            {
 +                ga2la_del(ga2la, gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +        }
 +    }
 +    cgindex[home_pos] = nat;
 +
 +    return home_pos;
 +}
 +
 +static void clear_and_mark_ind(int ncg, int *move,
 +                               int *index_gl, int *cgindex, int *gatindex,
 +                               gmx_ga2la_t ga2la, char *bLocalCG,
 +                               int *cell_index)
 +{
 +    int cg, a0, a1, a;
 +
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            a0 = cgindex[cg];
 +            a1 = cgindex[cg+1];
 +            /* Clear the global indices */
 +            for (a = a0; a < a1; a++)
 +            {
 +                ga2la_del(ga2la, gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +            /* Signal that this cg has moved using the ns cell index.
 +             * Here we set it to -1. fill_grid will change it
 +             * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
 +             */
 +            cell_index[cg] = -1;
 +        }
 +    }
 +}
 +
 +static void print_cg_move(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_int64_t step, int cg, int dim, int dir,
 +                          gmx_bool bHaveLimitdAndCMOld, real limitd,
 +                          rvec cm_old, rvec cm_new, real pos_d)
 +{
 +    gmx_domdec_comm_t *comm;
 +    char               buf[22];
 +
 +    comm = dd->comm;
 +
 +    fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
 +                ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
 +    }
 +    else
 +    {
 +        fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
 +                ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
 +    }
 +    fprintf(fplog, "distance out of cell %f\n",
 +            dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
 +                cm_old[XX], cm_old[YY], cm_old[ZZ]);
 +    }
 +    fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
 +            cm_new[XX], cm_new[YY], cm_new[ZZ]);
 +    fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
 +    fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->cell_x0[dim], comm->cell_x1[dim]);
 +}
 +
 +static void cg_move_error(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_int64_t step, int cg, int dim, int dir,
 +                          gmx_bool bHaveLimitdAndCMOld, real limitd,
 +                          rvec cm_old, rvec cm_new, real pos_d)
 +{
 +    if (fplog)
 +    {
 +        print_cg_move(fplog, dd, step, cg, dim, dir,
 +                      bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
 +    }
 +    print_cg_move(stderr, dd, step, cg, dim, dir,
 +                  bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
 +    gmx_fatal(FARGS,
 +              "A charge group moved too far between two domain decomposition steps\n"
 +              "This usually means that your system is not well equilibrated");
 +}
 +
 +static void rotate_state_atom(t_state *state, int a)
 +{
 +    int est;
 +
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    /* Rotate the complete state; for a rectangular box only */
 +                    state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
 +                    state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
 +                    break;
 +                case estV:
 +                    state->v[a][YY] = -state->v[a][YY];
 +                    state->v[a][ZZ] = -state->v[a][ZZ];
 +                    break;
 +                case estSDX:
 +                    state->sd_X[a][YY] = -state->sd_X[a][YY];
 +                    state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
 +                    break;
 +                case estCGP:
 +                    state->cg_p[a][YY] = -state->cg_p[a][YY];
 +                    state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* These are distances, so not affected by rotation */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in rotate_state_atom");
 +            }
 +        }
 +    }
 +}
 +
 +static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
 +{
 +    if (natoms > comm->moved_nalloc)
 +    {
 +        /* Contents should be preserved here */
 +        comm->moved_nalloc = over_alloc_dd(natoms);
 +        srenew(comm->moved, comm->moved_nalloc);
 +    }
 +
 +    return comm->moved;
 +}
 +
 +static void calc_cg_move(FILE *fplog, gmx_int64_t step,
 +                         gmx_domdec_t *dd,
 +                         t_state *state,
 +                         ivec tric_dir, matrix tcm,
 +                         rvec cell_x0, rvec cell_x1,
 +                         rvec limitd, rvec limit0, rvec limit1,
 +                         const int *cgindex,
 +                         int cg_start, int cg_end,
 +                         rvec *cg_cm,
 +                         int *move)
 +{
 +    int      npbcdim;
 +    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
 +    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
 +    int      flag;
 +    gmx_bool bScrew;
 +    ivec     dev;
 +    real     inv_ncg, pos_d;
 +    rvec     cm_new;
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for (cg = cg_start; cg < cg_end; cg++)
 +    {
 +        k0   = cgindex[cg];
 +        k1   = cgindex[cg+1];
 +        nrcg = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(state->x[k0], cm_new);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +
 +            clear_rvec(cm_new);
 +            for (k = k0; (k < k1); k++)
 +            {
 +                rvec_inc(cm_new, state->x[k]);
 +            }
 +            for (d = 0; (d < DIM); d++)
 +            {
 +                cm_new[d] = inv_ncg*cm_new[d];
 +            }
 +        }
 +
 +        clear_ivec(dev);
 +        /* Do pbc and check DD cell boundary crossings */
 +        for (d = DIM-1; d >= 0; d--)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                /* Determine the location of this cg in lattice coordinates */
 +                pos_d = cm_new[d];
 +                if (tric_dir[d])
 +                {
 +                    for (d2 = d+1; d2 < DIM; d2++)
 +                    {
 +                        pos_d += cm_new[d2]*tcm[d2][d];
 +                    }
 +                }
 +                /* Put the charge group in the triclinic unit-cell */
 +                if (pos_d >= cell_x1[d])
 +                {
 +                    if (pos_d >= limit1[d])
 +                    {
 +                        cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
 +                                      cg_cm[cg], cm_new, pos_d);
 +                    }
 +                    dev[d] = 1;
 +                    if (dd->ci[d] == dd->nc[d] - 1)
 +                    {
 +                        rvec_dec(cm_new, state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for (k = k0; (k < k1); k++)
 +                        {
 +                            rvec_dec(state->x[k], state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state, k);
 +                            }
 +                        }
 +                    }
 +                }
 +                else if (pos_d < cell_x0[d])
 +                {
 +                    if (pos_d < limit0[d])
 +                    {
 +                        cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
 +                                      cg_cm[cg], cm_new, pos_d);
 +                    }
 +                    dev[d] = -1;
 +                    if (dd->ci[d] == 0)
 +                    {
 +                        rvec_inc(cm_new, state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for (k = k0; (k < k1); k++)
 +                        {
 +                            rvec_inc(state->x[k], state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state, k);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            else if (d < npbcdim)
 +            {
 +                /* Put the charge group in the rectangular unit-cell */
 +                while (cm_new[d] >= state->box[d][d])
 +                {
 +                    rvec_dec(cm_new, state->box[d]);
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_dec(state->x[k], state->box[d]);
 +                    }
 +                }
 +                while (cm_new[d] < 0)
 +                {
 +                    rvec_inc(cm_new, state->box[d]);
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_inc(state->x[k], state->box[d]);
 +                    }
 +                }
 +            }
 +        }
 +
 +        copy_rvec(cm_new, cg_cm[cg]);
 +
 +        /* Determine where this cg should go */
 +        flag = 0;
 +        mc   = -1;
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            dim = dd->dim[d];
 +            if (dev[dim] == 1)
 +            {
 +                flag |= DD_FLAG_FW(d);
 +                if (mc == -1)
 +                {
 +                    mc = d*2;
 +                }
 +            }
 +            else if (dev[dim] == -1)
 +            {
 +                flag |= DD_FLAG_BW(d);
 +                if (mc == -1)
 +                {
 +                    if (dd->nc[dim] > 2)
 +                    {
 +                        mc = d*2 + 1;
 +                    }
 +                    else
 +                    {
 +                        mc = d*2;
 +                    }
 +                }
 +            }
 +        }
 +        /* Temporarily store the flag in move */
 +        move[cg] = mc + flag;
 +    }
 +}
 +
 +static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
 +                               gmx_domdec_t *dd, ivec tric_dir,
 +                               t_state *state, rvec **f,
 +                               t_forcerec *fr,
 +                               gmx_bool bCompact,
 +                               t_nrnb *nrnb,
 +                               int *ncg_stay_home,
 +                               int *ncg_moved)
 +{
 +    int               *move;
 +    int                npbcdim;
 +    int                ncg[DIM*2], nat[DIM*2];
 +    int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
 +    int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
 +    int                sbuf[2], rbuf[2];
 +    int                home_pos_cg, home_pos_at, buf_pos;
 +    int                flag;
 +    gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
 +    gmx_bool           bScrew;
 +    ivec               dev;
 +    real               inv_ncg, pos_d;
 +    matrix             tcm;
 +    rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
 +    atom_id           *cgindex;
 +    cginfo_mb_t       *cginfo_mb;
 +    gmx_domdec_comm_t *comm;
 +    int               *moved;
 +    int                nthread, thread;
 +
 +    if (dd->bScrewPBC)
 +    {
 +        check_screw_box(state->box);
 +    }
 +
 +    comm  = dd->comm;
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        cg_cm = fr->cg_cm;
 +    }
 +
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i))
 +        {
 +            switch (i)
 +            {
 +                case estX: /* Always present */ break;
 +                case estV:   bV   = (state->flags & (1<<i)); break;
 +                case estSDX: bSDX = (state->flags & (1<<i)); break;
 +                case estCGP: bCGP = (state->flags & (1<<i)); break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No processing required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
 +            }
 +        }
 +    }
 +
 +    if (dd->ncg_tot > comm->nalloc_int)
 +    {
 +        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
 +        srenew(comm->buf_int, comm->nalloc_int);
 +    }
 +    move = comm->buf_int;
 +
 +    /* Clear the count */
 +    for (c = 0; c < dd->ndim*2; c++)
 +    {
 +        ncg[c] = 0;
 +        nat[c] = 0;
 +    }
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for (d = 0; (d < DIM); d++)
 +    {
 +        limitd[d] = dd->comm->cellsize_min[d];
 +        if (d >= npbcdim && dd->ci[d] == 0)
 +        {
 +            cell_x0[d] = -GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x0[d] = comm->cell_x0[d];
 +        }
 +        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
 +        {
 +            cell_x1[d] = GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x1[d] = comm->cell_x1[d];
 +        }
 +        if (d < npbcdim)
 +        {
 +            limit0[d] = comm->old_cell_x0[d] - limitd[d];
 +            limit1[d] = comm->old_cell_x1[d] + limitd[d];
 +        }
 +        else
 +        {
 +            /* We check after communication if a charge group moved
 +             * more than one cell. Set the pre-comm check limit to float_max.
 +             */
 +            limit0[d] = -GMX_FLOAT_MAX;
 +            limit1[d] =  GMX_FLOAT_MAX;
 +        }
 +    }
 +
 +    make_tric_corr_matrix(npbcdim, state->box, tcm);
 +
 +    cgindex = dd->cgindex;
 +
 +    nthread = gmx_omp_nthreads_get(emntDomdec);
 +
 +    /* Compute the center of geometry for all home charge groups
 +     * and put them in the box and determine where they should go.
 +     */
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
 +                     cell_x0, cell_x1, limitd, limit0, limit1,
 +                     cgindex,
 +                     ( thread   *dd->ncg_home)/nthread,
 +                     ((thread+1)*dd->ncg_home)/nthread,
 +                     fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
 +                     move);
 +    }
 +
 +    for (cg = 0; cg < dd->ncg_home; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            mc       = move[cg];
 +            flag     = mc & ~DD_FLAG_NRCG;
 +            mc       = mc & DD_FLAG_NRCG;
 +            move[cg] = mc;
 +
 +            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +            {
 +                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +            }
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
 +            /* We store the cg size in the lower 16 bits
 +             * and the place where the charge group should go
 +             * in the next 6 bits. This saves some communication volume.
 +             */
 +            nrcg = cgindex[cg+1] - cgindex[cg];
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
 +            ncg[mc] += 1;
 +            nat[mc] += nrcg;
 +        }
 +    }
 +
 +    inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +    inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
 +
 +    *ncg_moved = 0;
 +    for (i = 0; i < dd->ndim*2; i++)
 +    {
 +        *ncg_moved += ncg[i];
 +    }
 +
 +    nvec = 1;
 +    if (bV)
 +    {
 +        nvec++;
 +    }
 +    if (bSDX)
 +    {
 +        nvec++;
 +    }
 +    if (bCGP)
 +    {
 +        nvec++;
 +    }
 +
 +    /* Make sure the communication buffers are large enough */
 +    for (mc = 0; mc < dd->ndim*2; mc++)
 +    {
 +        nvr = ncg[mc] + nat[mc]*nvec;
 +        if (nvr > comm->cgcm_state_nalloc[mc])
 +        {
 +            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
 +            srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
 +        }
 +    }
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            /* Recalculating cg_cm might be cheaper than communicating,
 +             * but that could give rise to rounding issues.
 +             */
 +            home_pos_cg =
 +                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
 +                                        nvec, cg_cm, comm, bCompact);
 +            break;
 +        case ecutsVERLET:
 +            /* Without charge groups we send the moved atom coordinates
 +             * over twice. This is so the code below can be used without
 +             * many conditionals for both for with and without charge groups.
 +             */
 +            home_pos_cg =
 +                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
 +                                        nvec, state->x, comm, FALSE);
 +            if (bCompact)
 +            {
 +                home_pos_cg -= *ncg_moved;
 +            }
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            home_pos_cg = 0;
 +    }
 +
 +    vec         = 0;
 +    home_pos_at =
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->x, comm, bCompact);
 +    if (bV)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->v, comm, bCompact);
 +    }
 +    if (bSDX)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->sd_X, comm, bCompact);
 +    }
 +    if (bCGP)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->cg_p, comm, bCompact);
 +    }
 +
 +    if (bCompact)
 +    {
 +        compact_ind(dd->ncg_home, move,
 +                    dd->index_gl, dd->cgindex, dd->gatindex,
 +                    dd->ga2la, comm->bLocalCG,
 +                    fr->cginfo);
 +    }
 +    else
 +    {
 +        if (fr->cutoff_scheme == ecutsVERLET)
 +        {
 +            moved = get_moved(comm, dd->ncg_home);
 +
 +            for (k = 0; k < dd->ncg_home; k++)
 +            {
 +                moved[k] = 0;
 +            }
 +        }
 +        else
 +        {
 +            moved = fr->ns.grid->cell_index;
 +        }
 +
 +        clear_and_mark_ind(dd->ncg_home, move,
 +                           dd->index_gl, dd->cgindex, dd->gatindex,
 +                           dd->ga2la, comm->bLocalCG,
 +                           moved);
 +    }
 +
 +    cginfo_mb = fr->cginfo_mb;
 +
 +    *ncg_stay_home = home_pos_cg;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim      = dd->dim[d];
 +        ncg_recv = 0;
 +        nat_recv = 0;
 +        nvr      = 0;
 +        for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
 +        {
 +            cdd = d*2 + dir;
 +            /* Communicate the cg and atom counts */
 +            sbuf[0] = ncg[cdd];
 +            sbuf[1] = nat[cdd];
 +            if (debug)
 +            {
 +                fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
 +                        d, dir, sbuf[0], sbuf[1]);
 +            }
 +            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
 +
 +            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
 +            {
 +                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
 +                srenew(comm->buf_int, comm->nalloc_int);
 +            }
 +
 +            /* Communicate the charge group indices, sizes and flags */
 +            dd_sendrecv_int(dd, d, dir,
 +                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
 +                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
 +
 +            nvs = ncg[cdd] + nat[cdd]*nvec;
 +            i   = rbuf[0]  + rbuf[1] *nvec;
 +            vec_rvec_check_alloc(&comm->vbuf, nvr+i);
 +
 +            /* Communicate cgcm and state */
 +            dd_sendrecv_rvec(dd, d, dir,
 +                             comm->cgcm_state[cdd], nvs,
 +                             comm->vbuf.v+nvr, i);
 +            ncg_recv += rbuf[0];
 +            nat_recv += rbuf[1];
 +            nvr      += i;
 +        }
 +
 +        /* Process the received charge groups */
 +        buf_pos = 0;
 +        for (cg = 0; cg < ncg_recv; cg++)
 +        {
 +            flag = comm->buf_int[cg*DD_CGIBS+1];
 +
 +            if (dim >= npbcdim && dd->nc[dim] > 2)
 +            {
 +                /* No pbc in this dim and more than one domain boundary.
 +                 * We do a separate check if a charge group didn't move too far.
 +                 */
 +                if (((flag & DD_FLAG_FW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
 +                    ((flag & DD_FLAG_BW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
 +                {
 +                    cg_move_error(fplog, dd, step, cg, dim,
 +                                  (flag & DD_FLAG_FW(d)) ? 1 : 0,
 +                                  FALSE, 0,
 +                                  comm->vbuf.v[buf_pos],
 +                                  comm->vbuf.v[buf_pos],
 +                                  comm->vbuf.v[buf_pos][dim]);
 +                }
 +            }
 +
 +            mc = -1;
 +            if (d < dd->ndim-1)
 +            {
 +                /* Check which direction this cg should go */
 +                for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
 +                {
 +                    if (dd->bGridJump)
 +                    {
 +                        /* The cell boundaries for dimension d2 are not equal
 +                         * for each cell row of the lower dimension(s),
 +                         * therefore we might need to redetermine where
 +                         * this cg should go.
 +                         */
 +                        dim2 = dd->dim[d2];
 +                        /* If this cg crosses the box boundary in dimension d2
 +                         * we can use the communicated flag, so we do not
 +                         * have to worry about pbc.
 +                         */
 +                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
 +                               (flag & DD_FLAG_FW(d2))) ||
 +                              (dd->ci[dim2] == 0 &&
 +                               (flag & DD_FLAG_BW(d2)))))
 +                        {
 +                            /* Clear the two flags for this dimension */
 +                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
 +                            /* Determine the location of this cg
 +                             * in lattice coordinates
 +                             */
 +                            pos_d = comm->vbuf.v[buf_pos][dim2];
 +                            if (tric_dir[dim2])
 +                            {
 +                                for (d3 = dim2+1; d3 < DIM; d3++)
 +                                {
 +                                    pos_d +=
 +                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
 +                                }
 +                            }
 +                            /* Check of we are not at the box edge.
 +                             * pbc is only handled in the first step above,
 +                             * but this check could move over pbc while
 +                             * the first step did not due to different rounding.
 +                             */
 +                            if (pos_d >= cell_x1[dim2] &&
 +                                dd->ci[dim2] != dd->nc[dim2]-1)
 +                            {
 +                                flag |= DD_FLAG_FW(d2);
 +                            }
 +                            else if (pos_d < cell_x0[dim2] &&
 +                                     dd->ci[dim2] != 0)
 +                            {
 +                                flag |= DD_FLAG_BW(d2);
 +                            }
 +                            comm->buf_int[cg*DD_CGIBS+1] = flag;
 +                        }
 +                    }
 +                    /* Set to which neighboring cell this cg should go */
 +                    if (flag & DD_FLAG_FW(d2))
 +                    {
 +                        mc = d2*2;
 +                    }
 +                    else if (flag & DD_FLAG_BW(d2))
 +                    {
 +                        if (dd->nc[dd->dim[d2]] > 2)
 +                        {
 +                            mc = d2*2+1;
 +                        }
 +                        else
 +                        {
 +                            mc = d2*2;
 +                        }
 +                    }
 +                }
 +            }
 +
 +            nrcg = flag & DD_FLAG_NRCG;
 +            if (mc == -1)
 +            {
 +                if (home_pos_cg+1 > dd->cg_nalloc)
 +                {
 +                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
 +                    srenew(dd->index_gl, dd->cg_nalloc);
 +                    srenew(dd->cgindex, dd->cg_nalloc+1);
 +                }
 +                /* Set the global charge group index and size */
 +                dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
 +                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
 +                /* Copy the state from the buffer */
 +                dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
 +                if (fr->cutoff_scheme == ecutsGROUP)
 +                {
 +                    cg_cm = fr->cg_cm;
 +                    copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
 +                }
 +                buf_pos++;
 +
 +                /* Set the cginfo */
 +                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
 +                                                   dd->index_gl[home_pos_cg]);
 +                if (comm->bLocalCG)
 +                {
 +                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
 +                }
 +
 +                if (home_pos_at+nrcg > state->nalloc)
 +                {
 +                    dd_realloc_state(state, f, home_pos_at+nrcg);
 +                }
 +                for (i = 0; i < nrcg; i++)
 +                {
 +                    copy_rvec(comm->vbuf.v[buf_pos++],
 +                              state->x[home_pos_at+i]);
 +                }
 +                if (bV)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->v[home_pos_at+i]);
 +                    }
 +                }
 +                if (bSDX)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->sd_X[home_pos_at+i]);
 +                    }
 +                }
 +                if (bCGP)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->cg_p[home_pos_at+i]);
 +                    }
 +                }
 +                home_pos_cg += 1;
 +                home_pos_at += nrcg;
 +            }
 +            else
 +            {
 +                /* Reallocate the buffers if necessary  */
 +                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +                {
 +                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                    srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +                }
 +                nvr = ncg[mc] + nat[mc]*nvec;
 +                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
 +                {
 +                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
 +                    srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
 +                }
 +                /* Copy from the receive to the send buffers */
 +                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
 +                       comm->buf_int + cg*DD_CGIBS,
 +                       DD_CGIBS*sizeof(int));
 +                memcpy(comm->cgcm_state[mc][nvr],
 +                       comm->vbuf.v[buf_pos],
 +                       (1+nrcg*nvec)*sizeof(rvec));
 +                buf_pos += 1 + nrcg*nvec;
 +                ncg[mc] += 1;
 +                nat[mc] += nrcg;
 +            }
 +        }
 +    }
 +
 +    /* With sorting (!bCompact) the indices are now only partially up to date
 +     * and ncg_home and nat_home are not the real count, since there are
 +     * "holes" in the arrays for the charge groups that moved to neighbors.
 +     */
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        moved = get_moved(comm, home_pos_cg);
 +
 +        for (i = dd->ncg_home; i < home_pos_cg; i++)
 +        {
 +            moved[i] = 0;
 +        }
 +    }
 +    dd->ncg_home = home_pos_cg;
 +    dd->nat_home = home_pos_at;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Finished repartitioning: cgs moved out %d, new home %d\n",
 +                *ncg_moved, dd->ncg_home-*ncg_moved);
 +
 +    }
 +}
 +
 +void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
 +{
 +    dd->comm->cycl[ddCycl] += cycles;
 +    dd->comm->cycl_n[ddCycl]++;
 +    if (cycles > dd->comm->cycl_max[ddCycl])
 +    {
 +        dd->comm->cycl_max[ddCycl] = cycles;
 +    }
 +}
 +
 +static double force_flop_count(t_nrnb *nrnb)
 +{
 +    int         i;
 +    double      sum;
 +    const char *name;
 +
 +    sum = 0;
 +    for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
 +    {
 +        /* To get closer to the real timings, we half the count
 +         * for the normal loops and again half it for water loops.
 +         */
 +        name = nrnb_str(i);
 +        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*0.25*cost_nrnb(i);
 +        }
 +        else
 +        {
 +            sum += nrnb->n[i]*0.50*cost_nrnb(i);
 +        }
 +    }
 +    for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
 +    {
 +        name = nrnb_str(i);
 +        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*cost_nrnb(i);
 +        }
 +    }
 +    for (i = eNR_BONDS; i <= eNR_WALLS; i++)
 +    {
 +        sum += nrnb->n[i]*cost_nrnb(i);
 +    }
 +
 +    return sum;
 +}
 +
 +void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop -= force_flop_count(nrnb);
 +    }
 +}
 +void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop += force_flop_count(nrnb);
 +        dd->comm->flop_n++;
 +    }
 +}
 +
 +static void clear_dd_cycle_counts(gmx_domdec_t *dd)
 +{
 +    int i;
 +
 +    for (i = 0; i < ddCyclNr; i++)
 +    {
 +        dd->comm->cycl[i]     = 0;
 +        dd->comm->cycl_n[i]   = 0;
 +        dd->comm->cycl_max[i] = 0;
 +    }
 +    dd->comm->flop   = 0;
 +    dd->comm->flop_n = 0;
 +}
 +
 +static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_load_t *load;
 +    gmx_domdec_root_t *root = NULL;
 +    int                d, dim, cid, i, pos;
 +    float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
 +    gmx_bool           bSepPME;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "get_load_distribution start\n");
 +    }
 +
 +    wallcycle_start(wcycle, ewcDDCOMMLOAD);
 +
 +    comm = dd->comm;
 +
 +    bSepPME = (dd->pme_nodeid >= 0);
 +
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        dim = dd->dim[d];
 +        /* Check if we participate in the communication in this dimension */
 +        if (d == dd->ndim-1 ||
 +            (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
 +        {
 +            load = &comm->load[d];
 +            if (dd->bGridJump)
 +            {
 +                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
 +            }
 +            pos = 0;
 +            if (d == dd->ndim-1)
 +            {
 +                sbuf[pos++] = dd_force_load(comm);
 +                sbuf[pos++] = sbuf[0];
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = sbuf[0];
 +                    sbuf[pos++] = cell_frac;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 +                    sbuf[pos++] = comm->cycl[ddCyclPME];
 +                }
 +            }
 +            else
 +            {
 +                sbuf[pos++] = comm->load[d+1].sum;
 +                sbuf[pos++] = comm->load[d+1].max;
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].sum_m;
 +                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
 +                    sbuf[pos++] = comm->load[d+1].flags;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].mdf;
 +                    sbuf[pos++] = comm->load[d+1].pme;
 +                }
 +            }
 +            load->nload = pos;
 +            /* Communicate a row in DD direction d.
 +             * The communicators are setup such that the root always has rank 0.
 +             */
 +#ifdef GMX_MPI
 +            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
 +                       load->load, load->nload*sizeof(float), MPI_BYTE,
 +                       0, comm->mpi_comm_load[d]);
 +#endif
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* We are the root, process this row */
 +                if (comm->bDynLoadBal)
 +                {
 +                    root = comm->root[d];
 +                }
 +                load->sum      = 0;
 +                load->max      = 0;
 +                load->sum_m    = 0;
 +                load->cvol_min = 1;
 +                load->flags    = 0;
 +                load->mdf      = 0;
 +                load->pme      = 0;
 +                pos            = 0;
 +                for (i = 0; i < dd->nc[dim]; i++)
 +                {
 +                    load->sum += load->load[pos++];
 +                    load->max  = max(load->max, load->load[pos]);
 +                    pos++;
 +                    if (dd->bGridJump)
 +                    {
 +                        if (root->bLimited)
 +                        {
 +                            /* This direction could not be load balanced properly,
 +                             * therefore we need to use the maximum iso the average load.
 +                             */
 +                            load->sum_m = max(load->sum_m, load->load[pos]);
 +                        }
 +                        else
 +                        {
 +                            load->sum_m += load->load[pos];
 +                        }
 +                        pos++;
 +                        load->cvol_min = min(load->cvol_min, load->load[pos]);
 +                        pos++;
 +                        if (d < dd->ndim-1)
 +                        {
 +                            load->flags = (int)(load->load[pos++] + 0.5);
 +                        }
 +                        if (d > 0)
 +                        {
 +                            root->cell_f_max0[i] = load->load[pos++];
 +                            root->cell_f_min1[i] = load->load[pos++];
 +                        }
 +                    }
 +                    if (bSepPME)
 +                    {
 +                        load->mdf = max(load->mdf, load->load[pos]);
 +                        pos++;
 +                        load->pme = max(load->pme, load->load[pos]);
 +                        pos++;
 +                    }
 +                }
 +                if (comm->bDynLoadBal && root->bLimited)
 +                {
 +                    load->sum_m *= dd->nc[dim];
 +                    load->flags |= (1<<d);
 +                }
 +            }
 +        }
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        comm->nload      += dd_load_count(comm);
 +        comm->load_step  += comm->cycl[ddCyclStep];
 +        comm->load_sum   += comm->load[0].sum;
 +        comm->load_max   += comm->load[0].max;
 +        if (comm->bDynLoadBal)
 +        {
 +            for (d = 0; d < dd->ndim; d++)
 +            {
 +                if (comm->load[0].flags & (1<<d))
 +                {
 +                    comm->load_lim[d]++;
 +                }
 +            }
 +        }
 +        if (bSepPME)
 +        {
 +            comm->load_mdf += comm->load[0].mdf;
 +            comm->load_pme += comm->load[0].pme;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle, ewcDDCOMMLOAD);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "get_load_distribution finished\n");
 +    }
 +}
 +
 +static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
 +{
 +    /* Return the relative performance loss on the total run time
 +     * due to the force calculation load imbalance.
 +     */
 +    if (dd->comm->nload > 0)
 +    {
 +        return
 +            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
 +            (dd->comm->load_step*dd->nnodes);
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    char               buf[STRLEN];
 +    int                npp, npme, nnodes, d, limp;
 +    float              imbal, pme_f_ratio, lossf, lossp = 0;
 +    gmx_bool           bLim;
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = dd->comm;
 +    if (DDMASTER(dd) && comm->nload > 0)
 +    {
 +        npp    = dd->nnodes;
 +        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
 +        nnodes = npp + npme;
 +        imbal  = comm->load_max*npp/comm->load_sum - 1;
 +        lossf  = dd_force_imb_perf_loss(dd);
 +        sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
 +        fprintf(fplog, "%s", buf);
 +        fprintf(stderr, "\n");
 +        fprintf(stderr, "%s", buf);
 +        sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
 +        fprintf(fplog, "%s", buf);
 +        fprintf(stderr, "%s", buf);
 +        bLim = FALSE;
 +        if (comm->bDynLoadBal)
 +        {
 +            sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
 +            for (d = 0; d < dd->ndim; d++)
 +            {
 +                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
 +                sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
 +                if (limp >= 50)
 +                {
 +                    bLim = TRUE;
 +                }
 +            }
 +            sprintf(buf+strlen(buf), "\n");
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +        }
 +        if (npme > 0)
 +        {
 +            pme_f_ratio = comm->load_pme/comm->load_mdf;
 +            lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
 +            if (lossp <= 0)
 +            {
 +                lossp *= (float)npme/(float)nnodes;
 +            }
 +            else
 +            {
 +                lossp *= (float)npp/(float)nnodes;
 +            }
 +            sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +            sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(stderr, "\n");
 +
 +        if (lossf >= DD_PERF_LOSS_WARN)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
 +                    "      in the domain decomposition.\n", lossf*100);
 +            if (!comm->bDynLoadBal)
 +            {
 +                sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
 +            }
 +            else if (bLim)
 +            {
 +                sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
 +            }
 +            fprintf(fplog, "%s\n", buf);
 +            fprintf(stderr, "%s\n", buf);
 +        }
 +        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% performance was lost because the PME ranks\n"
 +                    "      had %s work to do than the PP ranks.\n"
 +                    "      You might want to %s the number of PME ranks\n"
 +                    "      or %s the cut-off and the grid spacing.\n",
 +                    fabs(lossp*100),
 +                    (lossp < 0) ? "less"     : "more",
 +                    (lossp < 0) ? "decrease" : "increase",
 +                    (lossp < 0) ? "decrease" : "increase");
 +            fprintf(fplog, "%s\n", buf);
 +            fprintf(stderr, "%s\n", buf);
 +        }
 +    }
 +}
 +
 +static float dd_vol_min(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].cvol_min*dd->nnodes;
 +}
 +
 +static gmx_bool dd_load_flags(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].flags;
 +}
 +
 +static float dd_f_imbal(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
 +}
 +
 +float dd_pme_f_ratio(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->cycl_n[ddCyclPME] > 0)
 +    {
 +        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
 +    }
 +    else
 +    {
 +        return -1.0;
 +    }
 +}
 +
 +static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
 +{
 +    int  flags, d;
 +    char buf[22];
 +
 +    flags = dd_load_flags(dd);
 +    if (flags)
 +    {
 +        fprintf(fplog,
 +                "DD  load balancing is limited by minimum cell size in dimension");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            if (flags & (1<<d))
 +            {
 +                fprintf(fplog, " %c", dim2char(dd->dim[d]));
 +            }
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +    fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(fplog, "  vol min/aver %5.3f%c",
 +                dd_vol_min(dd), flags ? '!' : ' ');
 +    }
 +    fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
 +    }
 +    fprintf(fplog, "\n\n");
 +}
 +
 +static void dd_print_load_verbose(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(stderr, "vol %4.2f%c ",
 +                dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
 +    }
 +    fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
 +{
 +    MPI_Comm           c_row;
 +    int                dim, i, rank;
 +    ivec               loc_c;
 +    gmx_domdec_root_t *root;
 +    gmx_bool           bPartOfGroup = FALSE;
 +
 +    dim = dd->dim[dim_ind];
 +    copy_ivec(loc, loc_c);
 +    for (i = 0; i < dd->nc[dim]; i++)
 +    {
 +        loc_c[dim] = i;
 +        rank       = dd_index(dd->nc, loc_c);
 +        if (rank == dd->rank)
 +        {
 +            /* This process is part of the group */
 +            bPartOfGroup = TRUE;
 +        }
 +    }
 +    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
 +                   &c_row);
 +    if (bPartOfGroup)
 +    {
 +        dd->comm->mpi_comm_load[dim_ind] = c_row;
 +        if (dd->comm->eDLB != edlbNO)
 +        {
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* This is the root process of this row */
 +                snew(dd->comm->root[dim_ind], 1);
 +                root = dd->comm->root[dim_ind];
 +                snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
 +                snew(root->old_cell_f, dd->nc[dim]+1);
 +                snew(root->bCellMin, dd->nc[dim]);
 +                if (dim_ind > 0)
 +                {
 +                    snew(root->cell_f_max0, dd->nc[dim]);
 +                    snew(root->cell_f_min1, dd->nc[dim]);
 +                    snew(root->bound_min, dd->nc[dim]);
 +                    snew(root->bound_max, dd->nc[dim]);
 +                }
 +                snew(root->buf_ncd, dd->nc[dim]);
 +            }
 +            else
 +            {
 +                /* This is not a root process, we only need to receive cell_f */
 +                snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
 +            }
 +        }
 +        if (dd->ci[dim] == dd->master_ci[dim])
 +        {
 +            snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
 +        }
 +    }
 +}
 +#endif
 +
 +void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
 +                                   const gmx_hw_info_t gmx_unused *hwinfo,
 +                                   const gmx_hw_opt_t  gmx_unused *hw_opt)
 +{
 +#ifdef GMX_MPI
 +    int           physicalnode_id_hash;
 +    int           gpu_id;
 +    gmx_domdec_t *dd;
 +    MPI_Comm      mpi_comm_pp_physicalnode;
 +
 +    if (!(cr->duty & DUTY_PP) ||
 +        hw_opt->gpu_opt.ncuda_dev_use == 0)
 +    {
 +        /* Only PP nodes (currently) use GPUs.
 +         * If we don't have GPUs, there are no resources to share.
 +         */
 +        return;
 +    }
 +
 +    physicalnode_id_hash = gmx_physicalnode_id_hash();
 +
 +    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
 +
 +    dd = cr->dd;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
 +        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
 +                dd->rank, physicalnode_id_hash, gpu_id);
 +    }
 +    /* Split the PP communicator over the physical nodes */
 +    /* TODO: See if we should store this (before), as it's also used for
 +     * for the nodecomm summution.
 +     */
 +    MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
 +                   &mpi_comm_pp_physicalnode);
 +    MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
 +                   &dd->comm->mpi_comm_gpu_shared);
 +    MPI_Comm_free(&mpi_comm_pp_physicalnode);
 +    MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
 +    }
 +
 +    /* Note that some ranks could share a GPU, while others don't */
 +
 +    if (dd->comm->nrank_gpu_shared == 1)
 +    {
 +        MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
 +    }
 +#endif
 +}
 +
 +static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
 +{
 +#ifdef GMX_MPI
 +    int  dim0, dim1, i, j;
 +    ivec loc;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Making load communicators\n");
 +    }
 +
 +    snew(dd->comm->load, dd->ndim);
 +    snew(dd->comm->mpi_comm_load, dd->ndim);
 +
 +    clear_ivec(loc);
 +    make_load_communicator(dd, 0, loc);
 +    if (dd->ndim > 1)
 +    {
 +        dim0 = dd->dim[0];
 +        for (i = 0; i < dd->nc[dim0]; i++)
 +        {
 +            loc[dim0] = i;
 +            make_load_communicator(dd, 1, loc);
 +        }
 +    }
 +    if (dd->ndim > 2)
 +    {
 +        dim0 = dd->dim[0];
 +        for (i = 0; i < dd->nc[dim0]; i++)
 +        {
 +            loc[dim0] = i;
 +            dim1      = dd->dim[1];
 +            for (j = 0; j < dd->nc[dim1]; j++)
 +            {
 +                loc[dim1] = j;
 +                make_load_communicator(dd, 2, loc);
 +            }
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Finished making load communicators\n");
 +    }
 +#endif
 +}
 +
 +void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    gmx_bool                bZYX;
 +    int                     d, dim, i, j, m;
 +    ivec                    tmp, s;
 +    int                     nzone, nzonep;
 +    ivec                    dd_zp[DD_MAXIZONE];
 +    gmx_domdec_zones_t     *zones;
 +    gmx_domdec_ns_ranges_t *izone;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        copy_ivec(dd->ci, tmp);
 +        tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
 +        dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
 +        copy_ivec(dd->ci, tmp);
 +        tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
 +        dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
 +        if (debug)
 +        {
 +            fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
 +                    dd->rank, dim,
 +                    dd->neighbor[d][0],
 +                    dd->neighbor[d][1]);
 +        }
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
 +                dd->ndim,
 +                dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
 +                dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +    switch (dd->ndim)
 +    {
 +        case 3:
 +            nzone  = dd_z3n;
 +            nzonep = dd_zp3n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp3[i], dd_zp[i]);
 +            }
 +            break;
 +        case 2:
 +            nzone  = dd_z2n;
 +            nzonep = dd_zp2n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp2[i], dd_zp[i]);
 +            }
 +            break;
 +        case 1:
 +            nzone  = dd_z1n;
 +            nzonep = dd_zp1n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp1[i], dd_zp[i]);
 +            }
 +            break;
 +        default:
 +            gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
 +            nzone  = 0;
 +            nzonep = 0;
 +    }
 +
 +    zones = &dd->comm->zones;
 +
 +    for (i = 0; i < nzone; i++)
 +    {
 +        m = 0;
 +        clear_ivec(zones->shift[i]);
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
 +        }
 +    }
 +
 +    zones->n = nzone;
 +    for (i = 0; i < nzone; i++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            s[d] = dd->ci[d] - zones->shift[i][d];
 +            if (s[d] < 0)
 +            {
 +                s[d] += dd->nc[d];
 +            }
 +            else if (s[d] >= dd->nc[d])
 +            {
 +                s[d] -= dd->nc[d];
 +            }
 +        }
 +    }
 +    zones->nizone = nzonep;
 +    for (i = 0; i < zones->nizone; i++)
 +    {
 +        if (dd_zp[i][0] != i)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
 +        }
 +        izone     = &zones->izone[i];
 +        izone->j0 = dd_zp[i][1];
 +        izone->j1 = dd_zp[i][2];
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            if (dd->nc[dim] == 1)
 +            {
 +                /* All shifts should be allowed */
 +                izone->shift0[dim] = -1;
 +                izone->shift1[dim] = 1;
 +            }
 +            else
 +            {
 +                /*
 +                   izone->shift0[d] = 0;
 +                   izone->shift1[d] = 0;
 +                   for(j=izone->j0; j<izone->j1; j++) {
 +                   if (dd->shift[j][d] > dd->shift[i][d])
 +                   izone->shift0[d] = -1;
 +                   if (dd->shift[j][d] < dd->shift[i][d])
 +                   izone->shift1[d] = 1;
 +                   }
 +                 */
 +
 +                int shift_diff;
 +
 +                /* Assume the shift are not more than 1 cell */
 +                izone->shift0[dim] = 1;
 +                izone->shift1[dim] = -1;
 +                for (j = izone->j0; j < izone->j1; j++)
 +                {
 +                    shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
 +                    if (shift_diff < izone->shift0[dim])
 +                    {
 +                        izone->shift0[dim] = shift_diff;
 +                    }
 +                    if (shift_diff > izone->shift1[dim])
 +                    {
 +                        izone->shift1[dim] = shift_diff;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        snew(dd->comm->root, dd->ndim);
 +    }
 +
 +    if (dd->comm->bRecordLoad)
 +    {
 +        make_load_communicators(dd);
 +    }
 +}
 +
 +static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                i, rank, *buf;
 +    ivec               periods;
 +#ifdef GMX_MPI
 +    MPI_Comm           comm_cart;
 +#endif
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP)
 +    {
 +        /* Set up cartesian communication for the particle-particle part */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
 +                    dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
 +        }
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
 +                        &comm_cart);
 +        /* We overwrite the old communicator with the new cartesian one */
 +        cr->mpi_comm_mygroup = comm_cart;
 +    }
 +
 +    dd->mpi_comm_all = cr->mpi_comm_mygroup;
 +    MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
 +
 +    if (comm->bCartesianPP_PME)
 +    {
 +        /* Since we want to use the original cartesian setup for sim,
 +         * and not the one after split, we need to make an index.
 +         */
 +        snew(comm->ddindex2ddnodeid, dd->nnodes);
 +        comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
 +        gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
 +        /* Get the rank of the DD master,
 +         * above we made sure that the master node is a PP node.
 +         */
 +        if (MASTER(cr))
 +        {
 +            rank = dd->rank;
 +        }
 +        else
 +        {
 +            rank = 0;
 +        }
 +        MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (cr->npmenodes == 0)
 +        {
 +            /* The PP communicator is also
 +             * the communicator for this simulation
 +             */
 +            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
 +        }
 +        cr->nodeid = dd->rank;
 +
 +        MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
 +
 +        /* We need to make an index to go from the coordinates
 +         * to the nodeid of this simulation.
 +         */
 +        snew(comm->ddindex2simnodeid, dd->nnodes);
 +        snew(buf, dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
 +        }
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        sfree(buf);
 +
 +        /* Determine the master coordinates and rank.
 +         * The DD master should be the same node as the master of this sim.
 +         */
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            if (comm->ddindex2simnodeid[i] == 0)
 +            {
 +                ddindex2xyz(dd->nc, i, dd->master_ci);
 +                MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "The master rank is %d\n", dd->masterrank);
 +        }
 +    }
 +    else
 +    {
 +        /* No Cartesian communicators */
 +        /* We use the rank in dd->comm->all as DD index */
 +        ddindex2xyz(dd->nc, dd->rank, dd->ci);
 +        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
 +        dd->masterrank = 0;
 +        clear_ivec(dd->master_ci);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition rank %d, coordinates %d %d %d\n\n",
 +                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Domain decomposition rank %d, coordinates %d %d %d\n\n",
 +                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +}
 +
 +static void receive_ddindex2simnodeid(t_commrec *cr)
 +{
 +    gmx_domdec_t      *dd;
 +
 +    gmx_domdec_comm_t *comm;
 +    int               *buf;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
 +    {
 +        snew(comm->ddindex2simnodeid, dd->nnodes);
 +        snew(buf, dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
 +        }
 +#ifdef GMX_MPI
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +#endif
 +        sfree(buf);
 +    }
 +#endif
 +}
 +
 +static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
 +                                                     int ncg, int natoms)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  i;
 +
 +    snew(ma, 1);
 +
 +    snew(ma->ncg, dd->nnodes);
 +    snew(ma->index, dd->nnodes+1);
 +    snew(ma->cg, ncg);
 +    snew(ma->nat, dd->nnodes);
 +    snew(ma->ibuf, dd->nnodes*2);
 +    snew(ma->cell_x, DIM);
 +    for (i = 0; i < DIM; i++)
 +    {
 +        snew(ma->cell_x[i], dd->nc[i]+1);
 +    }
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        ma->vbuf = NULL;
 +    }
 +    else
 +    {
 +        snew(ma->vbuf, natoms);
 +    }
 +
 +    return ma;
 +}
 +
 +static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
 +                               int gmx_unused reorder)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                i, rank;
 +    gmx_bool           bDiv[DIM];
 +    ivec               periods;
 +#ifdef GMX_MPI
 +    MPI_Comm           comm_cart;
 +#endif
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    if (comm->bCartesianPP)
 +    {
 +        for (i = 1; i < DIM; i++)
 +        {
 +            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
 +        }
 +        if (bDiv[YY] || bDiv[ZZ])
 +        {
 +            comm->bCartesianPP_PME = TRUE;
 +            /* If we have 2D PME decomposition, which is always in x+y,
 +             * we stack the PME only nodes in z.
 +             * Otherwise we choose the direction that provides the thinnest slab
 +             * of PME only nodes as this will have the least effect
 +             * on the PP communication.
 +             * But for the PME communication the opposite might be better.
 +             */
 +            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
 +                             !bDiv[YY] ||
 +                             dd->nc[YY] > dd->nc[ZZ]))
 +            {
 +                comm->cartpmedim = ZZ;
 +            }
 +            else
 +            {
 +                comm->cartpmedim = YY;
 +            }
 +            comm->ntot[comm->cartpmedim]
 +                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
 +        }
 +        else if (fplog)
 +        {
 +            fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
 +            fprintf(fplog,
 +                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
 +        }
 +    }
 +
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP_PME)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
 +        }
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
 +                        &comm_cart);
 +
 +        MPI_Comm_rank(comm_cart, &rank);
 +        if (MASTERNODE(cr) && rank != 0)
 +        {
 +            gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
 +        }
 +
 +        /* With this assigment we loose the link to the original communicator
 +         * which will usually be MPI_COMM_WORLD, unless have multisim.
 +         */
 +        cr->mpi_comm_mysim = comm_cart;
 +        cr->sim_nodeid     = rank;
 +
 +        MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
 +
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
 +                    cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +        }
 +
 +        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +        if (cr->npmenodes == 0 ||
 +            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +
 +        /* Split the sim communicator into PP and PME only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       dd_index(comm->ntot, dd->ci),
 +                       &cr->mpi_comm_mygroup);
 +    }
 +    else
 +    {
 +        switch (dd_node_order)
 +        {
 +            case ddnoPP_PME:
 +                if (fplog)
 +                {
 +                    fprintf(fplog, "Order of the ranks: PP first, PME last\n");
 +                }
 +                break;
 +            case ddnoINTERLEAVE:
 +                /* Interleave the PP-only and PME-only nodes,
 +                 * as on clusters with dual-core machines this will double
 +                 * the communication bandwidth of the PME processes
 +                 * and thus speed up the PP <-> PME and inter PME communication.
 +                 */
 +                if (fplog)
 +                {
 +                    fprintf(fplog, "Interleaving PP and PME ranks\n");
 +                }
 +                comm->pmenodes = dd_pmenodes(cr);
 +                break;
 +            case ddnoCARTESIAN:
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
 +        }
 +
 +        if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +        else
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +
 +        /* Split the sim communicator into PP and PME only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       cr->nodeid,
 +                       &cr->mpi_comm_mygroup);
 +        MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "This rank does only %s work.\n\n",
 +                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
 +    }
 +}
 +
 +void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                CartReorder;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    copy_ivec(dd->nc, comm->ntot);
 +
 +    comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
 +    comm->bCartesianPP_PME = FALSE;
 +
 +    /* Reorder the nodes by default. This might change the MPI ranks.
 +     * Real reordering is only supported on very few architectures,
 +     * Blue Gene is one of them.
 +     */
 +    CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
 +
 +    if (cr->npmenodes > 0)
 +    {
 +        /* Split the communicator into a PP and PME part */
 +        split_communicator(fplog, cr, dd_node_order, CartReorder);
 +        if (comm->bCartesianPP_PME)
 +        {
 +            /* We (possibly) reordered the nodes in split_communicator,
 +             * so it is no longer required in make_pp_communicator.
 +             */
 +            CartReorder = FALSE;
 +        }
 +    }
 +    else
 +    {
 +        /* All nodes do PP and PME */
 +#ifdef GMX_MPI
 +        /* We do not require separate communicators */
 +        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +#endif
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* Copy or make a new PP communicator */
 +        make_pp_communicator(fplog, cr, CartReorder);
 +    }
 +    else
 +    {
 +        receive_ddindex2simnodeid(cr);
 +    }
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Set up the commnuication to our PME node */
 +        dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +        dd->pme_receive_vir_ener = receive_vir_ener(cr);
 +        if (debug)
 +        {
 +            fprintf(debug, "My pme_nodeid %d receive ener %d\n",
 +                    dd->pme_nodeid, dd->pme_receive_vir_ener);
 +        }
 +    }
 +    else
 +    {
 +        dd->pme_nodeid = -1;
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        dd->ma = init_gmx_domdec_master_t(dd,
 +                                          comm->cgs_gl.nr,
 +                                          comm->cgs_gl.index[comm->cgs_gl.nr]);
 +    }
 +}
 +
 +static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
 +{
 +    real  *slb_frac, tot;
 +    int    i, n;
 +    double dbl;
 +
 +    slb_frac = NULL;
 +    if (nc > 1 && size_string != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Using static load balancing for the %s direction\n",
 +                    dir);
 +        }
 +        snew(slb_frac, nc);
 +        tot = 0;
 +        for (i = 0; i < nc; i++)
 +        {
 +            dbl = 0;
 +            sscanf(size_string, "%lf%n", &dbl, &n);
 +            if (dbl == 0)
 +            {
 +                gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
 +            }
 +            slb_frac[i]  = dbl;
 +            size_string += n;
 +            tot         += slb_frac[i];
 +        }
 +        /* Normalize */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Relative cell sizes:");
 +        }
 +        for (i = 0; i < nc; i++)
 +        {
 +            slb_frac[i] /= tot;
 +            if (fplog)
 +            {
 +                fprintf(fplog, " %5.3f", slb_frac[i]);
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "\n");
 +        }
 +    }
 +
 +    return slb_frac;
 +}
 +
 +static int multi_body_bondeds_count(gmx_mtop_t *mtop)
 +{
 +    int                  n, nmol, ftype;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist             *il;
 +
 +    n     = 0;
 +    iloop = gmx_mtop_ilistloop_init(mtop);
 +    while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
 +    {
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            if ((interaction_function[ftype].flags & IF_BOND) &&
 +                NRAL(ftype) >  2)
 +            {
 +                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
 +            }
 +        }
 +    }
 +
 +    return n;
 +}
 +
 +static int dd_getenv(FILE *fplog, const char *env_var, int def)
 +{
 +    char *val;
 +    int   nst;
 +
 +    nst = def;
 +    val = getenv(env_var);
 +    if (val)
 +    {
 +        if (sscanf(val, "%d", &nst) <= 0)
 +        {
 +            nst = 1;
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
 +                    env_var, val, nst);
 +        }
 +    }
 +
 +    return nst;
 +}
 +
 +static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
 +{
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\n%s\n", warn_string);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog, "\n%s\n", warn_string);
 +    }
 +}
 +
 +static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
 +                                  t_inputrec *ir, FILE *fplog)
 +{
 +    if (ir->ePBC == epbcSCREW &&
 +        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
 +    {
 +        gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
 +    }
 +
 +    if (ir->ns_type == ensSIMPLE)
 +    {
 +        gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
 +    }
 +
 +    if (ir->nstlist == 0)
 +    {
 +        gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
 +    }
 +
 +    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
 +    {
 +        dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
 +    }
 +}
 +
 +static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int  di, d;
 +    real r;
 +
 +    r = ddbox->box_size[XX];
 +    for (di = 0; di < dd->ndim; di++)
 +    {
 +        d = dd->dim[di];
 +        /* Check using the initial average cell size */
 +        r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +    }
 +
 +    return r;
 +}
 +
 +static int check_dlb_support(FILE *fplog, t_commrec *cr,
 +                             const char *dlb_opt, gmx_bool bRecordLoad,
 +                             unsigned long Flags, t_inputrec *ir)
 +{
 +    gmx_domdec_t *dd;
 +    int           eDLB = -1;
 +    char          buf[STRLEN];
 +
 +    switch (dlb_opt[0])
 +    {
 +        case 'a': eDLB = edlbAUTO; break;
 +        case 'n': eDLB = edlbNO;   break;
 +        case 'y': eDLB = edlbYES;  break;
 +        default: gmx_incons("Unknown dlb_opt");
 +    }
 +
 +    if (Flags & MD_RERUN)
 +    {
 +        return edlbNO;
 +    }
 +
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        if (eDLB == edlbYES)
 +        {
 +            sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
 +            dd_warning(cr, fplog, buf);
 +        }
 +
 +        return edlbNO;
 +    }
 +
 +    if (!bRecordLoad)
 +    {
 +        dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
 +
 +        return edlbNO;
 +    }
 +
 +    if (Flags & MD_REPRODUCIBLE)
 +    {
 +        switch (eDLB)
 +        {
 +            case edlbNO:
 +                break;
 +            case edlbAUTO:
 +                dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
 +                eDLB = edlbNO;
 +                break;
 +            case edlbYES:
 +                dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
 +                break;
 +        }
 +    }
 +
 +    return eDLB;
 +}
 +
 +static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    int dim;
 +
 +    dd->ndim = 0;
 +    if (getenv("GMX_DD_ORDER_ZYX") != NULL)
 +    {
 +        /* Decomposition order z,y,x */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Using domain decomposition order z, y, x\n");
 +        }
 +        for (dim = DIM-1; dim >= 0; dim--)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* Decomposition order x,y,z */
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +}
 +
 +static gmx_domdec_comm_t *init_dd_comm()
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                i;
 +
 +    snew(comm, 1);
 +    snew(comm->cggl_flag, DIM*2);
 +    snew(comm->cgcm_state, DIM*2);
 +    for (i = 0; i < DIM*2; i++)
 +    {
 +        comm->cggl_flag_nalloc[i]  = 0;
 +        comm->cgcm_state_nalloc[i] = 0;
 +    }
 +
 +    comm->nalloc_int = 0;
 +    comm->buf_int    = NULL;
 +
 +    vec_rvec_init(&comm->vbuf);
 +
 +    comm->n_load_have    = 0;
 +    comm->n_load_collect = 0;
 +
 +    for (i = 0; i < ddnatNR-ddnatZONE; i++)
 +    {
 +        comm->sum_nat[i] = 0;
 +    }
 +    comm->ndecomp   = 0;
 +    comm->nload     = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf  = 0;
 +    comm->load_pme  = 0;
 +
 +    return comm;
 +}
 +
 +gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
 +                                        unsigned long Flags,
 +                                        ivec nc,
 +                                        real comm_distance_min, real rconstr,
 +                                        const char *dlb_opt, real dlb_scale,
 +                                        const char *sizex, const char *sizey, const char *sizez,
 +                                        gmx_mtop_t *mtop, t_inputrec *ir,
 +                                        matrix box, rvec *x,
 +                                        gmx_ddbox_t *ddbox,
 +                                        int *npme_x, int *npme_y)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                recload;
 +    int                d, i, j;
 +    real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
 +    gmx_bool           bC;
 +    char               buf[STRLEN];
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
 +    }
 +
 +    snew(dd, 1);
 +
 +    dd->comm = init_dd_comm();
 +    comm     = dd->comm;
 +    snew(comm->cggl_flag, DIM*2);
 +    snew(comm->cgcm_state, DIM*2);
 +
 +    dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
 +    dd->bScrewPBC = (ir->ePBC == epbcSCREW);
 +
 +    dd->bSendRecv2      = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
 +    comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
 +    comm->eFlop         = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
 +    recload             = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
 +    comm->nstSortCG     = dd_getenv(fplog, "GMX_DD_NST_SORT_CHARGE_GROUPS", 1);
 +    comm->nstDDDump     = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
 +    comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
 +    comm->DD_debug      = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
 +
 +    dd->pme_recv_f_alloc = 0;
 +    dd->pme_recv_f_buf   = NULL;
 +
 +    if (dd->bSendRecv2 && fplog)
 +    {
 +        fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
 +    }
 +    if (comm->eFlop)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will load balance based on FLOP count\n");
 +        }
 +        if (comm->eFlop > 1)
 +        {
 +            srand(1+cr->nodeid);
 +        }
 +        comm->bRecordLoad = TRUE;
 +    }
 +    else
 +    {
 +        comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
 +
 +    }
 +
 +    /* Initialize to GPU share count to 0, might change later */
 +    comm->nrank_gpu_shared = 0;
 +
 +    comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
 +
 +    comm->bDynLoadBal = (comm->eDLB == edlbYES);
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
 +    }
 +    dd->bGridJump              = comm->bDynLoadBal;
 +    comm->bPMELoadBalDLBLimits = FALSE;
 +
 +    if (comm->nstSortCG)
 +    {
 +        if (fplog)
 +        {
 +            if (comm->nstSortCG == 1)
 +            {
 +                fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
 +            }
 +            else
 +            {
 +                fprintf(fplog, "Will sort the charge groups every %d steps\n",
 +                        comm->nstSortCG);
 +            }
 +        }
 +        snew(comm->sort, 1);
 +    }
 +    else
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will not sort the charge groups\n");
 +        }
 +    }
 +
 +    comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
 +
 +    comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
 +    if (comm->bInterCGBondeds)
 +    {
 +        comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
 +    }
 +    else
 +    {
 +        comm->bInterCGMultiBody = FALSE;
 +    }
 +
 +    dd->bInterCGcons    = inter_charge_group_constraints(mtop);
 +    dd->bInterCGsettles = inter_charge_group_settles(mtop);
 +
 +    if (ir->rlistlong == 0)
 +    {
 +        /* Set the cut-off to some very large value,
 +         * so we don't need if statements everywhere in the code.
 +         * We use sqrt, since the cut-off is squared in some places.
 +         */
 +        comm->cutoff   = GMX_CUTOFF_INF;
 +    }
 +    else
 +    {
 +        comm->cutoff   = ir->rlistlong;
 +    }
 +    comm->cutoff_mbody = 0;
 +
 +    comm->cellsize_limit = 0;
 +    comm->bBondComm      = FALSE;
 +
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm_distance_min > 0)
 +        {
 +            comm->cutoff_mbody = comm_distance_min;
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
 +            }
 +            else
 +            {
 +                comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
 +            }
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else if (ir->bPeriodicMols)
 +        {
 +            /* Can not easily determine the required cut-off */
 +            dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
 +            comm->cutoff_mbody = comm->cutoff/2;
 +            r_bonded_limit     = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            if (MASTER(cr))
 +            {
 +                dd_bonded_cg_distance(fplog, mtop, ir, x, box,
 +                                      Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
 +            }
 +            gmx_bcast(sizeof(r_2b), &r_2b, cr);
 +            gmx_bcast(sizeof(r_mb), &r_mb, cr);
 +
 +            /* We use an initial margin of 10% for the minimum cell size,
 +             * except when we are just below the non-bonded cut-off.
 +             */
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                if (max(r_2b, r_mb) > comm->cutoff)
 +                {
 +                    r_bonded        = max(r_2b, r_mb);
 +                    r_bonded_limit  = 1.1*r_bonded;
 +                    comm->bBondComm = TRUE;
 +                }
 +                else
 +                {
 +                    r_bonded       = r_mb;
 +                    r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
 +                }
 +                /* We determine cutoff_mbody later */
 +            }
 +            else
 +            {
 +                /* No special bonded communication,
 +                 * simply increase the DD cut-off.
 +                 */
 +                r_bonded_limit     = 1.1*max(r_2b, r_mb);
 +                comm->cutoff_mbody = r_bonded_limit;
 +                comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
 +            }
 +        }
 +        comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Minimum cell size due to bonded interactions: %.3f nm\n",
 +                    comm->cellsize_limit);
 +        }
 +    }
 +
 +    if (dd->bInterCGcons && rconstr <= 0)
 +    {
 +        /* There is a cell size limit due to the constraints (P-LINCS) */
 +        rconstr = constr_r_max(fplog, mtop, ir);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Estimated maximum distance required for P-LINCS: %.3f nm\n",
 +                    rconstr);
 +            if (rconstr > comm->cellsize_limit)
 +            {
 +                fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
 +            }
 +        }
 +    }
 +    else if (rconstr > 0 && fplog)
 +    {
 +        /* Here we do not check for dd->bInterCGcons,
 +         * because one can also set a cell size limit for virtual sites only
 +         * and at this point we don't know yet if there are intercg v-sites.
 +         */
 +        fprintf(fplog,
 +                "User supplied maximum distance required for P-LINCS: %.3f nm\n",
 +                rconstr);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
 +
 +    comm->cgs_gl = gmx_mtop_global_cgs(mtop);
 +
 +    if (nc[XX] > 0)
 +    {
 +        copy_ivec(nc, dd->nc);
 +        set_dd_dim(fplog, dd);
 +        set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
 +
 +        if (cr->npmenodes == -1)
 +        {
 +            cr->npmenodes = 0;
 +        }
 +        acs = average_cellsize_min(dd, ddbox);
 +        if (acs < comm->cellsize_limit)
 +        {
 +            if (fplog)
 +            {
 +                fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
 +            }
 +            gmx_fatal_collective(FARGS, cr, NULL,
 +                                 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
 +                                 acs, comm->cellsize_limit);
 +        }
 +    }
 +    else
 +    {
 +        set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
 +
 +        /* We need to choose the optimal DD grid and possibly PME nodes */
 +        limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
 +                               comm->eDLB != edlbNO, dlb_scale,
 +                               comm->cellsize_limit, comm->cutoff,
 +                               comm->bInterCGBondeds);
 +
 +        if (dd->nc[XX] == 0)
 +        {
 +            bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
 +            sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
 +                    !bC ? "-rdd" : "-rcon",
 +                    comm->eDLB != edlbNO ? " or -dds" : "",
 +                    bC ? " or your LINCS settings" : "");
 +
 +            gmx_fatal_collective(FARGS, cr, NULL,
 +                                 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
 +                                 "%s\n"
 +                                 "Look in the log file for details on the domain decomposition",
 +                                 cr->nnodes-cr->npmenodes, limit, buf);
 +        }
 +        set_dd_dim(fplog, dd);
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
 +                dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
 +    }
 +
 +    dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
 +    if (cr->nnodes - dd->nnodes != cr->npmenodes)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL,
 +                             "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
 +                             dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
 +    }
 +    if (cr->npmenodes > dd->nnodes)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL,
 +                             "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
 +    }
 +    if (cr->npmenodes > 0)
 +    {
 +        comm->npmenodes = cr->npmenodes;
 +    }
 +    else
 +    {
 +        comm->npmenodes = dd->nnodes;
 +    }
 +
 +    if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
 +    {
 +        /* The following choices should match those
 +         * in comm_cost_est in domdec_setup.c.
 +         * Note that here the checks have to take into account
 +         * that the decomposition might occur in a different order than xyz
 +         * (for instance through the env.var. GMX_DD_ORDER_ZYX),
 +         * in which case they will not match those in comm_cost_est,
 +         * but since that is mainly for testing purposes that's fine.
 +         */
 +        if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
 +            comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
 +            getenv("GMX_PMEONEDD") == NULL)
 +        {
 +            comm->npmedecompdim = 2;
 +            comm->npmenodes_x   = dd->nc[XX];
 +            comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
 +        }
 +        else
 +        {
 +            /* In case nc is 1 in both x and y we could still choose to
 +             * decompose pme in y instead of x, but we use x for simplicity.
 +             */
 +            comm->npmedecompdim = 1;
 +            if (dd->dim[0] == YY)
 +            {
 +                comm->npmenodes_x = 1;
 +                comm->npmenodes_y = comm->npmenodes;
 +            }
 +            else
 +            {
 +                comm->npmenodes_x = comm->npmenodes;
 +                comm->npmenodes_y = 1;
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
 +                    comm->npmenodes_x, comm->npmenodes_y, 1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmedecompdim = 0;
 +        comm->npmenodes_x   = 0;
 +        comm->npmenodes_y   = 0;
 +    }
 +
 +    /* Technically we don't need both of these,
 +     * but it simplifies code not having to recalculate it.
 +     */
 +    *npme_x = comm->npmenodes_x;
 +    *npme_y = comm->npmenodes_y;
 +
 +    snew(comm->slb_frac, DIM);
 +    if (comm->eDLB == edlbNO)
 +    {
 +        comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
 +        comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
 +        comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
 +    }
 +
 +    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
 +    {
 +        if (comm->bBondComm || comm->eDLB != edlbNO)
 +        {
 +            /* Set the bonded communication distance to halfway
 +             * the minimum and the maximum,
 +             * since the extra communication cost is nearly zero.
 +             */
 +            acs                = average_cellsize_min(dd, ddbox);
 +            comm->cutoff_mbody = 0.5*(r_bonded + acs);
 +            if (comm->eDLB != edlbNO)
 +            {
 +                /* Check if this does not limit the scaling */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
 +            }
 +            if (!comm->bBondComm)
 +            {
 +                /* Without bBondComm do not go beyond the n.b. cut-off */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
 +                if (comm->cellsize_limit >= comm->cutoff)
 +                {
 +                    /* We don't loose a lot of efficieny
 +                     * when increasing it to the n.b. cut-off.
 +                     * It can even be slightly faster, because we need
 +                     * less checks for the communication setup.
 +                     */
 +                    comm->cutoff_mbody = comm->cutoff;
 +                }
 +            }
 +            /* Check if we did not end up below our original limit */
 +            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
 +
 +            if (comm->cutoff_mbody > comm->cellsize_limit)
 +            {
 +                comm->cellsize_limit = comm->cutoff_mbody;
 +            }
 +        }
 +        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
 +                "cellsize limit %f\n",
 +                comm->bBondComm, comm->cellsize_limit);
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        check_dd_restrictions(cr, dd, ir, fplog);
 +    }
 +
 +    comm->partition_step = INT_MIN;
 +    dd->ddp_count        = 0;
 +
 +    clear_dd_cycle_counts(dd);
 +
 +    return dd;
 +}
 +
 +static void set_dlb_limits(gmx_domdec_t *dd)
 +
 +{
 +    int d;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
 +        dd->comm->cellsize_min[dd->dim[d]] =
 +            dd->comm->cellsize_min_dlb[dd->dim[d]];
 +    }
 +}
 +
 +
 +static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    real               cellsize_min;
 +    int                d, nc, i;
 +    char               buf[STRLEN];
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
 +    }
 +
 +    cellsize_min = comm->cellsize_min[dd->dim[0]];
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
 +    }
 +
 +    if (cellsize_min < comm->cellsize_limit*1.05)
 +    {
 +        dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
 +
 +        /* Change DLB from "auto" to "no". */
 +        comm->eDLB = edlbNO;
 +
 +        return;
 +    }
 +
 +    dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
 +    comm->bDynLoadBal = TRUE;
 +    dd->bGridJump     = TRUE;
 +
 +    set_dlb_limits(dd);
 +
 +    /* We can set the required cell size info here,
 +     * so we do not need to communicate this.
 +     * The grid is completely uniform.
 +     */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        if (comm->root[d])
 +        {
 +            comm->load[d].sum_m = comm->load[d].sum;
 +
 +            nc = dd->nc[dd->dim[d]];
 +            for (i = 0; i < nc; i++)
 +            {
 +                comm->root[d]->cell_f[i]    = i/(real)nc;
 +                if (d > 0)
 +                {
 +                    comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
 +                    comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
 +                }
 +            }
 +            comm->root[d]->cell_f[nc] = 1.0;
 +        }
 +    }
 +}
 +
 +static char *init_bLocalCG(gmx_mtop_t *mtop)
 +{
 +    int   ncg, cg;
 +    char *bLocalCG;
 +
 +    ncg = ncg_mtop(mtop);
 +    snew(bLocalCG, ncg);
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        bLocalCG[cg] = FALSE;
 +    }
 +
 +    return bLocalCG;
 +}
 +
 +void dd_init_bondeds(FILE *fplog,
 +                     gmx_domdec_t *dd, gmx_mtop_t *mtop,
 +                     gmx_vsite_t *vsite,
 +                     t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bBondComm;
 +    int                d;
 +
 +    dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
 +
 +    comm = dd->comm;
 +
 +    if (comm->bBondComm)
 +    {
 +        /* Communicate atoms beyond the cut-off for bonded interactions */
 +        comm = dd->comm;
 +
 +        comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
 +
 +        comm->bLocalCG = init_bLocalCG(mtop);
 +    }
 +    else
 +    {
 +        /* Only communicate atoms based on cut-off */
 +        comm->cglink   = NULL;
 +        comm->bLocalCG = NULL;
 +    }
 +}
 +
 +static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
 +                              t_inputrec *ir,
 +                              gmx_bool bDynLoadBal, real dlb_scale,
 +                              gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d;
 +    ivec               np;
 +    real               limit, shrink;
 +    char               buf[64];
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    comm = dd->comm;
 +
 +    if (bDynLoadBal)
 +    {
 +        fprintf(fplog, "The maximum number of communication pulses is:");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
 +        fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
 +        fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                if (d >= ddbox->npbcdim && dd->nc[d] == 2)
 +                {
 +                    shrink = 0;
 +                }
 +                else
 +                {
 +                    shrink =
 +                        comm->cellsize_min_dlb[d]/
 +                        (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +                }
 +                fprintf(fplog, " %c %.2f", dim2char(d), shrink);
 +            }
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +    else
 +    {
 +        set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
 +        fprintf(fplog, "The initial number of communication pulses is:");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(fplog, "The initial domain decomposition cell size is:");
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                fprintf(fplog, " %c %.2f nm",
 +                        dim2char(d), dd->comm->cellsize_min[d]);
 +            }
 +        }
 +        fprintf(fplog, "\n\n");
 +    }
 +
 +    if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
 +    {
 +        fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
 +        fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                "non-bonded interactions", "", comm->cutoff);
 +
 +        if (bDynLoadBal)
 +        {
 +            limit = dd->comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            if (dynamic_dd_box(ddbox, ir))
 +            {
 +                fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
 +            }
 +            limit = dd->comm->cellsize_min[XX];
 +            for (d = 1; d < DIM; d++)
 +            {
 +                limit = min(limit, dd->comm->cellsize_min[d]);
 +            }
 +        }
 +
 +        if (comm->bInterCGBondeds)
 +        {
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "two-body bonded interactions", "(-rdd)",
 +                    max(comm->cutoff, comm->cutoff_mbody));
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "multi-body bonded interactions", "(-rdd)",
 +                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
 +        }
 +        if (dd->vsite_comm)
 +        {
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "virtual site constructions", "(-rcon)", limit);
 +        }
 +        if (dd->constraint_comm)
 +        {
 +            sprintf(buf, "atoms separated by up to %d constraints",
 +                    1+ir->nProjOrder);
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    buf, "(-rcon)", limit);
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +
 +    fflush(fplog);
 +}
 +
 +static void set_cell_limits_dlb(gmx_domdec_t      *dd,
 +                                real               dlb_scale,
 +                                const t_inputrec  *ir,
 +                                const gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim, npulse, npulse_d_max, npulse_d;
 +    gmx_bool           bNoCutOff;
 +
 +    comm = dd->comm;
 +
 +    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
 +
 +    /* Determine the maximum number of comm. pulses in one dimension */
 +
 +    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
 +
 +    /* Determine the maximum required number of grid pulses */
 +    if (comm->cellsize_limit >= comm->cutoff)
 +    {
 +        /* Only a single pulse is required */
 +        npulse = 1;
 +    }
 +    else if (!bNoCutOff && comm->cellsize_limit > 0)
 +    {
 +        /* We round down slightly here to avoid overhead due to the latency
 +         * of extra communication calls when the cut-off
 +         * would be only slightly longer than the cell size.
 +         * Later cellsize_limit is redetermined,
 +         * so we can not miss interactions due to this rounding.
 +         */
 +        npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
 +    }
 +    else
 +    {
 +        /* There is no cell size limit */
 +        npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
 +    }
 +
 +    if (!bNoCutOff && npulse > 1)
 +    {
 +        /* See if we can do with less pulses, based on dlb_scale */
 +        npulse_d_max = 0;
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            dim      = dd->dim[d];
 +            npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
 +                             /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
 +            npulse_d_max = max(npulse_d_max, npulse_d);
 +        }
 +        npulse = min(npulse, npulse_d_max);
 +    }
 +
 +    /* This env var can override npulse */
 +    d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
 +    if (d > 0)
 +    {
 +        npulse = d;
 +    }
 +
 +    comm->maxpulse       = 1;
 +    comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
 +        comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
 +        snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
 +        comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
 +        if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
 +        {
 +            comm->bVacDLBNoLimit = FALSE;
 +        }
 +    }
 +
 +    /* cellsize_limit is set for LINCS in init_domain_decomposition */
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        comm->cellsize_limit = max(comm->cellsize_limit,
 +                                   comm->cutoff/comm->maxpulse);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
 +    /* Set the minimum cell size for each DD dimension */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        if (comm->bVacDLBNoLimit ||
 +            comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] =
 +                comm->cutoff/comm->cd[d].np_dlb;
 +        }
 +    }
 +    if (comm->cutoff_mbody <= 0)
 +    {
 +        comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
 +    }
 +    if (comm->bDynLoadBal)
 +    {
 +        set_dlb_limits(dd);
 +    }
 +}
 +
 +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
 +{
 +    /* If each molecule is a single charge group
 +     * or we use domain decomposition for each periodic dimension,
 +     * we do not need to take pbc into account for the bonded interactions.
 +     */
 +    return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
 +            !(dd->nc[XX] > 1 &&
 +              dd->nc[YY] > 1 &&
 +              (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
 +}
 +
 +void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
 +                       t_inputrec *ir, gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                natoms_tot;
 +    real               vol_frac;
 +
 +    comm = dd->comm;
 +
 +    /* Initialize the thread data.
 +     * This can not be done in init_domain_decomposition,
 +     * as the numbers of threads is determined later.
 +     */
 +    comm->nth = gmx_omp_nthreads_get(emntDomdec);
 +    if (comm->nth > 1)
 +    {
 +        snew(comm->dth, comm->nth);
 +    }
 +
 +    if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
 +    {
 +        init_ddpme(dd, &comm->ddpme[0], 0);
 +        if (comm->npmedecompdim >= 2)
 +        {
 +            init_ddpme(dd, &comm->ddpme[1], 1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmenodes = 0;
 +        if (dd->pme_nodeid >= 0)
 +        {
 +            gmx_fatal_collective(FARGS, NULL, dd,
 +                                 "Can not have separate PME ranks without PME electrostatics");
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
 +    }
 +    if (comm->eDLB != edlbNO)
 +    {
 +        set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
 +    }
 +
 +    print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
 +    if (comm->eDLB == edlbAUTO)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
 +        }
 +        print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
 +    }
 +
 +    if (ir->ePBC == epbcNONE)
 +    {
 +        vol_frac = 1 - 1/(double)dd->nnodes;
 +    }
 +    else
 +    {
 +        vol_frac =
 +            (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
 +    }
 +    natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
 +
 +    dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
 +}
 +
 +static gmx_bool test_dd_cutoff(t_commrec *cr,
 +                               t_state *state, t_inputrec *ir,
 +                               real cutoff_req)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_ddbox_t   ddbox;
 +    int           d, dim, np;
 +    real          inv_cell_size;
 +    int           LocallyLimited;
 +
 +    dd = cr->dd;
 +
 +    set_ddbox(dd, FALSE, cr, ir, state->box,
 +              TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
 +
 +    LocallyLimited = 0;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
 +        if (dynamic_dd_box(&ddbox, ir))
 +        {
 +            inv_cell_size *= DD_PRES_SCALE_MARGIN;
 +        }
 +
 +        np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
 +
 +        if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
 +            dd->comm->cd[d].np_dlb > 0)
 +        {
 +            if (np > dd->comm->cd[d].np_dlb)
 +            {
 +                return FALSE;
 +            }
 +
 +            /* If a current local cell size is smaller than the requested
 +             * cut-off, we could still fix it, but this gets very complicated.
 +             * Without fixing here, we might actually need more checks.
 +             */
 +            if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
 +            {
 +                LocallyLimited = 1;
 +            }
 +        }
 +    }
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        /* If DLB is not active yet, we don't need to check the grid jumps.
 +         * Actually we shouldn't, because then the grid jump data is not set.
 +         */
 +        if (dd->comm->bDynLoadBal &&
 +            check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
 +        {
 +            LocallyLimited = 1;
 +        }
 +
 +        gmx_sumi(1, &LocallyLimited, cr);
 +
 +        if (LocallyLimited > 0)
 +        {
 +            return FALSE;
 +        }
 +    }
 +
 +    return TRUE;
 +}
 +
 +gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
 +                          real cutoff_req)
 +{
 +    gmx_bool bCutoffAllowed;
 +
 +    bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
 +
 +    if (bCutoffAllowed)
 +    {
 +        cr->dd->comm->cutoff = cutoff_req;
 +    }
 +
 +    return bCutoffAllowed;
 +}
 +
 +void change_dd_dlb_cutoff_limit(t_commrec *cr)
 +{
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = cr->dd->comm;
 +
 +    /* Turn on the DLB limiting (might have been on already) */
 +    comm->bPMELoadBalDLBLimits = TRUE;
 +
 +    /* Change the cut-off limit */
 +    comm->PMELoadBal_max_cutoff = comm->cutoff;
 +}
 +
 +static void merge_cg_buffers(int ncell,
 +                             gmx_domdec_comm_dim_t *cd, int pulse,
 +                             int  *ncg_cell,
 +                             int  *index_gl, int  *recv_i,
 +                             rvec *cg_cm,    rvec *recv_vr,
 +                             int *cgindex,
 +                             cginfo_mb_t *cginfo_mb, int *cginfo)
 +{
 +    gmx_domdec_ind_t *ind, *ind_p;
 +    int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
 +    int               shift, shift_at;
 +
 +    ind = &cd->ind[pulse];
 +
 +    /* First correct the already stored data */
 +    shift = ind->nrecv[ncell];
 +    for (cell = ncell-1; cell >= 0; cell--)
 +    {
 +        shift -= ind->nrecv[cell];
 +        if (shift > 0)
 +        {
 +            /* Move the cg's present from previous grid pulses */
 +            cg0                = ncg_cell[ncell+cell];
 +            cg1                = ncg_cell[ncell+cell+1];
 +            cgindex[cg1+shift] = cgindex[cg1];
 +            for (cg = cg1-1; cg >= cg0; cg--)
 +            {
 +                index_gl[cg+shift] = index_gl[cg];
 +                copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
 +                cgindex[cg+shift] = cgindex[cg];
 +                cginfo[cg+shift]  = cginfo[cg];
 +            }
 +            /* Correct the already stored send indices for the shift */
 +            for (p = 1; p <= pulse; p++)
 +            {
 +                ind_p = &cd->ind[p];
 +                cg0   = 0;
 +                for (c = 0; c < cell; c++)
 +                {
 +                    cg0 += ind_p->nsend[c];
 +                }
 +                cg1 = cg0 + ind_p->nsend[cell];
 +                for (cg = cg0; cg < cg1; cg++)
 +                {
 +                    ind_p->index[cg] += shift;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Merge in the communicated buffers */
 +    shift    = 0;
 +    shift_at = 0;
 +    cg0      = 0;
 +    for (cell = 0; cell < ncell; cell++)
 +    {
 +        cg1 = ncg_cell[ncell+cell+1] + shift;
 +        if (shift_at > 0)
 +        {
 +            /* Correct the old cg indices */
 +            for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
 +            {
 +                cgindex[cg+1] += shift_at;
 +            }
 +        }
 +        for (cg = 0; cg < ind->nrecv[cell]; cg++)
 +        {
 +            /* Copy this charge group from the buffer */
 +            index_gl[cg1] = recv_i[cg0];
 +            copy_rvec(recv_vr[cg0], cg_cm[cg1]);
 +            /* Add it to the cgindex */
 +            cg_gl          = index_gl[cg1];
 +            cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
 +            nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
 +            cgindex[cg1+1] = cgindex[cg1] + nat;
 +            cg0++;
 +            cg1++;
 +            shift_at += nat;
 +        }
 +        shift                 += ind->nrecv[cell];
 +        ncg_cell[ncell+cell+1] = cg1;
 +    }
 +}
 +
 +static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
 +                               int nzone, int cg0, const int *cgindex)
 +{
 +    int cg, zone, p;
 +
 +    /* Store the atom block boundaries for easy copying of communication buffers
 +     */
 +    cg = cg0;
 +    for (zone = 0; zone < nzone; zone++)
 +    {
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            cd->ind[p].cell2at0[zone] = cgindex[cg];
 +            cg += cd->ind[p].nrecv[zone];
 +            cd->ind[p].cell2at1[zone] = cgindex[cg];
 +        }
 +    }
 +}
 +
 +static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
 +{
 +    int      i;
 +    gmx_bool bMiss;
 +
 +    bMiss = FALSE;
 +    for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
 +    {
 +        if (!bLocalCG[link->a[i]])
 +        {
 +            bMiss = TRUE;
 +        }
 +    }
 +
 +    return bMiss;
 +}
 +
 +/* Domain corners for communication, a maximum of 4 i-zones see a j domain */
 +typedef struct {
 +    real c[DIM][4]; /* the corners for the non-bonded communication */
 +    real cr0;       /* corner for rounding */
 +    real cr1[4];    /* corners for rounding */
 +    real bc[DIM];   /* corners for bounded communication */
 +    real bcr1;      /* corner for rounding for bonded communication */
 +} dd_corners_t;
 +
 +/* Determine the corners of the domain(s) we are communicating with */
 +static void
 +set_dd_corners(const gmx_domdec_t *dd,
 +               int dim0, int dim1, int dim2,
 +               gmx_bool bDistMB,
 +               dd_corners_t *c)
 +{
 +    const gmx_domdec_comm_t  *comm;
 +    const gmx_domdec_zones_t *zones;
 +    int i, j;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Keep the compiler happy */
 +    c->cr0  = 0;
 +    c->bcr1 = 0;
 +
 +    /* The first dimension is equal for all cells */
 +    c->c[0][0] = comm->cell_x0[dim0];
 +    if (bDistMB)
 +    {
 +        c->bc[0] = c->c[0][0];
 +    }
 +    if (dd->ndim >= 2)
 +    {
 +        dim1 = dd->dim[1];
 +        /* This cell row is only seen from the first row */
 +        c->c[1][0] = comm->cell_x0[dim1];
 +        /* All rows can see this row */
 +        c->c[1][1] = comm->cell_x0[dim1];
 +        if (dd->bGridJump)
 +        {
 +            c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
 +            if (bDistMB)
 +            {
 +                /* For the multi-body distance we need the maximum */
 +                c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
 +            }
 +        }
 +        /* Set the upper-right corner for rounding */
 +        c->cr0 = comm->cell_x1[dim0];
 +
 +        if (dd->ndim >= 3)
 +        {
 +            dim2 = dd->dim[2];
 +            for (j = 0; j < 4; j++)
 +            {
 +                c->c[2][j] = comm->cell_x0[dim2];
 +            }
 +            if (dd->bGridJump)
 +            {
 +                /* Use the maximum of the i-cells that see a j-cell */
 +                for (i = 0; i < zones->nizone; i++)
 +                {
 +                    for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
 +                    {
 +                        if (j >= 4)
 +                        {
 +                            c->c[2][j-4] =
 +                                max(c->c[2][j-4],
 +                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
 +                        }
 +                    }
 +                }
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bc[2] = comm->cell_x0[dim2];
 +                    for (i = 0; i < 2; i++)
 +                    {
 +                        for (j = 0; j < 2; j++)
 +                        {
 +                            c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
 +                        }
 +                    }
 +                }
 +            }
 +
 +            /* Set the upper-right corner for rounding */
 +            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
 +             * Only cell (0,0,0) can see cell 7 (1,1,1)
 +             */
 +            c->cr1[0] = comm->cell_x1[dim1];
 +            c->cr1[3] = comm->cell_x1[dim1];
 +            if (dd->bGridJump)
 +            {
 +                c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Determine which cg's we need to send in this pulse from this zone */
 +static void
 +get_zone_pulse_cgs(gmx_domdec_t *dd,
 +                   int zonei, int zone,
 +                   int cg0, int cg1,
 +                   const int *index_gl,
 +                   const int *cgindex,
 +                   int dim, int dim_ind,
 +                   int dim0, int dim1, int dim2,
 +                   real r_comm2, real r_bcomm2,
 +                   matrix box,
 +                   ivec tric_dist,
 +                   rvec *normal,
 +                   real skew_fac2_d, real skew_fac_01,
 +                   rvec *v_d, rvec *v_0, rvec *v_1,
 +                   const dd_corners_t *c,
 +                   rvec sf2_round,
 +                   gmx_bool bDistBonded,
 +                   gmx_bool bBondComm,
 +                   gmx_bool bDist2B,
 +                   gmx_bool bDistMB,
 +                   rvec *cg_cm,
 +                   int *cginfo,
 +                   gmx_domdec_ind_t *ind,
 +                   int **ibuf, int *ibuf_nalloc,
 +                   vec_rvec_t *vbuf,
 +                   int *nsend_ptr,
 +                   int *nat_ptr,
 +                   int *nsend_z_ptr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bScrew;
 +    gmx_bool           bDistMB_pulse;
 +    int                cg, i;
 +    real               r2, rb2, r, tric_sh;
 +    rvec               rn, rb;
 +    int                dimd;
 +    int                nsend_z, nsend, nat;
 +
 +    comm = dd->comm;
 +
 +    bScrew = (dd->bScrewPBC && dim == XX);
 +
 +    bDistMB_pulse = (bDistMB && bDistBonded);
 +
 +    nsend_z = 0;
 +    nsend   = *nsend_ptr;
 +    nat     = *nat_ptr;
 +
 +    for (cg = cg0; cg < cg1; cg++)
 +    {
 +        r2  = 0;
 +        rb2 = 0;
 +        if (tric_dist[dim_ind] == 0)
 +        {
 +            /* Rectangular direction, easy */
 +            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            if (r > 0)
 +            {
 +                r2 += r*r;
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                r = cg_cm[cg][dim] - c->bc[dim_ind];
 +                if (r > 0)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            /* Rounding gives at most a 16% reduction
 +             * in communicated atoms
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                r = cg_cm[cg][dim0] - c->cr0;
 +                /* This is the first dimension, so always r >= 0 */
 +                r2 += r*r;
 +                if (bDistMB_pulse)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                r = cg_cm[cg][dim1] - c->cr1[zone];
 +                if (r > 0)
 +                {
 +                    r2 += r*r;
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    r = cg_cm[cg][dim1] - c->bcr1;
 +                    if (r > 0)
 +                    {
 +                        rb2 += r*r;
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Triclinic direction, more complicated */
 +            clear_rvec(rn);
 +            clear_rvec(rb);
 +            /* Rounding, conservative as the skew_fac multiplication
 +             * will slightly underestimate the distance.
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
 +                for (i = dim0+1; i < DIM; i++)
 +                {
 +                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
 +                }
 +                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim0] = rn[dim0];
 +                    rb2      = r2;
 +                }
 +                /* Take care that the cell planes along dim0 might not
 +                 * be orthogonal to those along dim1 and dim2.
 +                 */
 +                for (i = 1; i <= dim_ind; i++)
 +                {
 +                    dimd = dd->dim[i];
 +                    if (normal[dim0][dimd] > 0)
 +                    {
 +                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
 +                        if (bDistMB_pulse)
 +                        {
 +                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
 +                        }
 +                    }
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
 +                tric_sh   = 0;
 +                for (i = dim1+1; i < DIM; i++)
 +                {
 +                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
 +                }
 +                rn[dim1] += tric_sh;
 +                if (rn[dim1] > 0)
 +                {
 +                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
 +                    /* Take care that the cell planes along dim1
 +                     * might not be orthogonal to that along dim2.
 +                     */
 +                    if (normal[dim1][dim2] > 0)
 +                    {
 +                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
 +                    }
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim1] +=
 +                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
 +                    if (rb[dim1] > 0)
 +                    {
 +                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
 +                        /* Take care of coupling of the distances
 +                         * to the planes along dim0 and dim1 through dim2.
 +                         */
 +                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
 +                        /* Take care that the cell planes along dim1
 +                         * might not be orthogonal to that along dim2.
 +                         */
 +                        if (normal[dim1][dim2] > 0)
 +                        {
 +                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
 +                        }
 +                    }
 +                }
 +            }
 +            /* The distance along the communication direction */
 +            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            tric_sh  = 0;
 +            for (i = dim+1; i < DIM; i++)
 +            {
 +                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
 +            }
 +            rn[dim] += tric_sh;
 +            if (rn[dim] > 0)
 +            {
 +                r2 += rn[dim]*rn[dim]*skew_fac2_d;
 +                /* Take care of coupling of the distances
 +                 * to the planes along dim0 and dim1 through dim2.
 +                 */
 +                if (dim_ind == 1 && zonei == 1)
 +                {
 +                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
 +                }
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                clear_rvec(rb);
 +                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
 +                if (rb[dim] > 0)
 +                {
 +                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    if (dim_ind == 1 && zonei == 1)
 +                    {
 +                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (r2 < r_comm2 ||
 +            (bDistBonded &&
 +             ((bDistMB && rb2 < r_bcomm2) ||
 +              (bDist2B && r2  < r_bcomm2)) &&
 +             (!bBondComm ||
 +              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
 +               missing_link(comm->cglink, index_gl[cg],
 +                            comm->bLocalCG)))))
 +        {
 +            /* Make an index to the local charge groups */
 +            if (nsend+1 > ind->nalloc)
 +            {
 +                ind->nalloc = over_alloc_large(nsend+1);
 +                srenew(ind->index, ind->nalloc);
 +            }
 +            if (nsend+1 > *ibuf_nalloc)
 +            {
 +                *ibuf_nalloc = over_alloc_large(nsend+1);
 +                srenew(*ibuf, *ibuf_nalloc);
 +            }
 +            ind->index[nsend] = cg;
 +            (*ibuf)[nsend]    = index_gl[cg];
 +            nsend_z++;
 +            vec_rvec_check_alloc(vbuf, nsend+1);
 +
 +            if (dd->ci[dim] == 0)
 +            {
 +                /* Correct cg_cm for pbc */
 +                rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
 +                if (bScrew)
 +                {
 +                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
 +                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
 +                }
 +            }
 +            else
 +            {
 +                copy_rvec(cg_cm[cg], vbuf->v[nsend]);
 +            }
 +            nsend++;
 +            nat += cgindex[cg+1] - cgindex[cg];
 +        }
 +    }
 +
 +    *nsend_ptr   = nsend;
 +    *nat_ptr     = nat;
 +    *nsend_z_ptr = nsend_z;
 +}
 +
 +static void setup_dd_communication(gmx_domdec_t *dd,
 +                                   matrix box, gmx_ddbox_t *ddbox,
 +                                   t_forcerec *fr, t_state *state, rvec **f)
 +{
 +    int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
 +    int                    nzone, nzone_send, zone, zonei, cg0, cg1;
 +    int                    c, i, j, cg, cg_gl, nrcg;
 +    int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_zones_t    *zones;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    cginfo_mb_t           *cginfo_mb;
 +    gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
 +    real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
 +    dd_corners_t           corners;
 +    ivec                   tric_dist;
 +    rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
 +    real                   skew_fac2_d, skew_fac_01;
 +    rvec                   sf2_round;
 +    int                    nsend, nat;
 +    int                    th;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Setting up DD communication\n");
 +    }
 +
 +    comm  = dd->comm;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            cg_cm = fr->cg_cm;
 +            break;
 +        case ecutsVERLET:
 +            cg_cm = state->x;
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            cg_cm = NULL;
 +    }
 +
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Check if we need to use triclinic distances */
 +        tric_dist[dim_ind] = 0;
 +        for (i = 0; i <= dim_ind; i++)
 +        {
 +            if (ddbox->tric_dir[dd->dim[i]])
 +            {
 +                tric_dist[dim_ind] = 1;
 +            }
 +        }
 +    }
 +
 +    bBondComm = comm->bBondComm;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    /* Do we need to determine extra distances for only two-body bondeds? */
 +    bDist2B = (bBondComm && !bDistMB);
 +
 +    r_comm2  = sqr(comm->cutoff);
 +    r_bcomm2 = sqr(comm->cutoff_mbody);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
 +    }
 +
 +    zones = &comm->zones;
 +
 +    dim0 = dd->dim[0];
 +    dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
 +    dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
 +
 +    set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
 +
 +    /* Triclinic stuff */
 +    normal      = ddbox->normal;
 +    skew_fac_01 = 0;
 +    if (dd->ndim >= 2)
 +    {
 +        v_0 = ddbox->v[dim0];
 +        if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
 +        {
 +            /* Determine the coupling coefficient for the distances
 +             * to the cell planes along dim0 and dim1 through dim2.
 +             * This is required for correct rounding.
 +             */
 +            skew_fac_01 =
 +                ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
 +            if (debug)
 +            {
 +                fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
 +            }
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        v_1 = ddbox->v[dim1];
 +    }
 +
 +    zone_cg_range = zones->cg_range;
 +    index_gl      = dd->index_gl;
 +    cgindex       = dd->cgindex;
 +    cginfo_mb     = fr->cginfo_mb;
 +
 +    zone_cg_range[0]   = 0;
 +    zone_cg_range[1]   = dd->ncg_home;
 +    comm->zone_ncg1[0] = dd->ncg_home;
 +    pos_cg             = dd->ncg_home;
 +
 +    nat_tot = dd->nat_home;
 +    nzone   = 1;
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        cd  = &comm->cd[dim_ind];
 +
 +        if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
 +        {
 +            /* No pbc in this dimension, the first node should not comm. */
 +            nzone_send = 0;
 +        }
 +        else
 +        {
 +            nzone_send = nzone;
 +        }
 +
 +        v_d         = ddbox->v[dim];
 +        skew_fac2_d = sqr(ddbox->skew_fac[dim]);
 +
 +        cd->bInPlace = TRUE;
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            /* Only atoms communicated in the first pulse are used
 +             * for multi-body bonded interactions or for bBondComm.
 +             */
 +            bDistBonded = ((bDistMB || bDist2B) && p == 0);
 +
 +            ind   = &cd->ind[p];
 +            nsend = 0;
 +            nat   = 0;
 +            for (zone = 0; zone < nzone_send; zone++)
 +            {
 +                if (tric_dist[dim_ind] && dim_ind > 0)
 +                {
 +                    /* Determine slightly more optimized skew_fac's
 +                     * for rounding.
 +                     * This reduces the number of communicated atoms
 +                     * by about 10% for 3D DD of rhombic dodecahedra.
 +                     */
 +                    for (dimd = 0; dimd < dim; dimd++)
 +                    {
 +                        sf2_round[dimd] = 1;
 +                        if (ddbox->tric_dir[dimd])
 +                        {
 +                            for (i = dd->dim[dimd]+1; i < DIM; i++)
 +                            {
 +                                /* If we are shifted in dimension i
 +                                 * and the cell plane is tilted forward
 +                                 * in dimension i, skip this coupling.
 +                                 */
 +                                if (!(zones->shift[nzone+zone][i] &&
 +                                      ddbox->v[dimd][i][dimd] >= 0))
 +                                {
 +                                    sf2_round[dimd] +=
 +                                        sqr(ddbox->v[dimd][i][dimd]);
 +                                }
 +                            }
 +                            sf2_round[dimd] = 1/sf2_round[dimd];
 +                        }
 +                    }
 +                }
 +
 +                zonei = zone_perm[dim_ind][zone];
 +                if (p == 0)
 +                {
 +                    /* Here we permutate the zones to obtain a convenient order
 +                     * for neighbor searching
 +                     */
 +                    cg0 = zone_cg_range[zonei];
 +                    cg1 = zone_cg_range[zonei+1];
 +                }
 +                else
 +                {
 +                    /* Look only at the cg's received in the previous grid pulse
 +                     */
 +                    cg1 = zone_cg_range[nzone+zone+1];
 +                    cg0 = cg1 - cd->ind[p-1].nrecv[zone];
 +                }
 +
 +#pragma omp parallel for num_threads(comm->nth) schedule(static)
 +                for (th = 0; th < comm->nth; th++)
 +                {
 +                    gmx_domdec_ind_t *ind_p;
 +                    int             **ibuf_p, *ibuf_nalloc_p;
 +                    vec_rvec_t       *vbuf_p;
 +                    int              *nsend_p, *nat_p;
 +                    int              *nsend_zone_p;
 +                    int               cg0_th, cg1_th;
 +
 +                    if (th == 0)
 +                    {
 +                        /* Thread 0 writes in the comm buffers */
 +                        ind_p         = ind;
 +                        ibuf_p        = &comm->buf_int;
 +                        ibuf_nalloc_p = &comm->nalloc_int;
 +                        vbuf_p        = &comm->vbuf;
 +                        nsend_p       = &nsend;
 +                        nat_p         = &nat;
 +                        nsend_zone_p  = &ind->nsend[zone];
 +                    }
 +                    else
 +                    {
 +                        /* Other threads write into temp buffers */
 +                        ind_p         = &comm->dth[th].ind;
 +                        ibuf_p        = &comm->dth[th].ibuf;
 +                        ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
 +                        vbuf_p        = &comm->dth[th].vbuf;
 +                        nsend_p       = &comm->dth[th].nsend;
 +                        nat_p         = &comm->dth[th].nat;
 +                        nsend_zone_p  = &comm->dth[th].nsend_zone;
 +
 +                        comm->dth[th].nsend      = 0;
 +                        comm->dth[th].nat        = 0;
 +                        comm->dth[th].nsend_zone = 0;
 +                    }
 +
 +                    if (comm->nth == 1)
 +                    {
 +                        cg0_th = cg0;
 +                        cg1_th = cg1;
 +                    }
 +                    else
 +                    {
 +                        cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
 +                        cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
 +                    }
 +
 +                    /* Get the cg's for this pulse in this zone */
 +                    get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
 +                                       index_gl, cgindex,
 +                                       dim, dim_ind, dim0, dim1, dim2,
 +                                       r_comm2, r_bcomm2,
 +                                       box, tric_dist,
 +                                       normal, skew_fac2_d, skew_fac_01,
 +                                       v_d, v_0, v_1, &corners, sf2_round,
 +                                       bDistBonded, bBondComm,
 +                                       bDist2B, bDistMB,
 +                                       cg_cm, fr->cginfo,
 +                                       ind_p,
 +                                       ibuf_p, ibuf_nalloc_p,
 +                                       vbuf_p,
 +                                       nsend_p, nat_p,
 +                                       nsend_zone_p);
 +                }
 +
 +                /* Append data of threads>=1 to the communication buffers */
 +                for (th = 1; th < comm->nth; th++)
 +                {
 +                    dd_comm_setup_work_t *dth;
 +                    int                   i, ns1;
 +
 +                    dth = &comm->dth[th];
 +
 +                    ns1 = nsend + dth->nsend_zone;
 +                    if (ns1 > ind->nalloc)
 +                    {
 +                        ind->nalloc = over_alloc_dd(ns1);
 +                        srenew(ind->index, ind->nalloc);
 +                    }
 +                    if (ns1 > comm->nalloc_int)
 +                    {
 +                        comm->nalloc_int = over_alloc_dd(ns1);
 +                        srenew(comm->buf_int, comm->nalloc_int);
 +                    }
 +                    if (ns1 > comm->vbuf.nalloc)
 +                    {
 +                        comm->vbuf.nalloc = over_alloc_dd(ns1);
 +                        srenew(comm->vbuf.v, comm->vbuf.nalloc);
 +                    }
 +
 +                    for (i = 0; i < dth->nsend_zone; i++)
 +                    {
 +                        ind->index[nsend]    = dth->ind.index[i];
 +                        comm->buf_int[nsend] = dth->ibuf[i];
 +                        copy_rvec(dth->vbuf.v[i],
 +                                  comm->vbuf.v[nsend]);
 +                        nsend++;
 +                    }
 +                    nat              += dth->nat;
 +                    ind->nsend[zone] += dth->nsend_zone;
 +                }
 +            }
 +            /* Clear the counts in case we do not have pbc */
 +            for (zone = nzone_send; zone < nzone; zone++)
 +            {
 +                ind->nsend[zone] = 0;
 +            }
 +            ind->nsend[nzone]   = nsend;
 +            ind->nsend[nzone+1] = nat;
 +            /* Communicate the number of cg's and atoms to receive */
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            ind->nsend, nzone+2,
 +                            ind->nrecv, nzone+2);
 +
 +            /* The rvec buffer is also required for atom buffers of size nsend
 +             * in dd_move_x and dd_move_f.
 +             */
 +            vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
 +
 +            if (p > 0)
 +            {
 +                /* We can receive in place if only the last zone is not empty */
 +                for (zone = 0; zone < nzone-1; zone++)
 +                {
 +                    if (ind->nrecv[zone] > 0)
 +                    {
 +                        cd->bInPlace = FALSE;
 +                    }
 +                }
 +                if (!cd->bInPlace)
 +                {
 +                    /* The int buffer is only required here for the cg indices */
 +                    if (ind->nrecv[nzone] > comm->nalloc_int2)
 +                    {
 +                        comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
 +                        srenew(comm->buf_int2, comm->nalloc_int2);
 +                    }
 +                    /* The rvec buffer is also required for atom buffers
 +                     * of size nrecv in dd_move_x and dd_move_f.
 +                     */
 +                    i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
 +                    vec_rvec_check_alloc(&comm->vbuf2, i);
 +                }
 +            }
 +
 +            /* Make space for the global cg indices */
 +            if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
 +                || dd->cg_nalloc == 0)
 +            {
 +                dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
 +                srenew(index_gl, dd->cg_nalloc);
 +                srenew(cgindex, dd->cg_nalloc+1);
 +            }
 +            /* Communicate the global cg indices */
 +            if (cd->bInPlace)
 +            {
 +                recv_i = index_gl + pos_cg;
 +            }
 +            else
 +            {
 +                recv_i = comm->buf_int2;
 +            }
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            comm->buf_int, nsend,
 +                            recv_i,        ind->nrecv[nzone]);
 +
 +            /* Make space for cg_cm */
 +            dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
 +            if (fr->cutoff_scheme == ecutsGROUP)
 +            {
 +                cg_cm = fr->cg_cm;
 +            }
 +            else
 +            {
 +                cg_cm = state->x;
 +            }
 +            /* Communicate cg_cm */
 +            if (cd->bInPlace)
 +            {
 +                recv_vr = cg_cm + pos_cg;
 +            }
 +            else
 +            {
 +                recv_vr = comm->vbuf2.v;
 +            }
 +            dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
 +                             comm->vbuf.v, nsend,
 +                             recv_vr,      ind->nrecv[nzone]);
 +
 +            /* Make the charge group index */
 +            if (cd->bInPlace)
 +            {
 +                zone = (p == 0 ? 0 : nzone - 1);
 +                while (zone < nzone)
 +                {
 +                    for (cg = 0; cg < ind->nrecv[zone]; cg++)
 +                    {
 +                        cg_gl              = index_gl[pos_cg];
 +                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
 +                        nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
 +                        cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
 +                        if (bBondComm)
 +                        {
 +                            /* Update the charge group presence,
 +                             * so we can use it in the next pass of the loop.
 +                             */
 +                            comm->bLocalCG[cg_gl] = TRUE;
 +                        }
 +                        pos_cg++;
 +                    }
 +                    if (p == 0)
 +                    {
 +                        comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
 +                    }
 +                    zone++;
 +                    zone_cg_range[nzone+zone] = pos_cg;
 +                }
 +            }
 +            else
 +            {
 +                /* This part of the code is never executed with bBondComm. */
 +                merge_cg_buffers(nzone, cd, p, zone_cg_range,
 +                                 index_gl, recv_i, cg_cm, recv_vr,
 +                                 cgindex, fr->cginfo_mb, fr->cginfo);
 +                pos_cg += ind->nrecv[nzone];
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        if (!cd->bInPlace)
 +        {
 +            /* Store the atom block for easy copying of communication buffers */
 +            make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
 +        }
 +        nzone += nzone;
 +    }
 +    dd->index_gl = index_gl;
 +    dd->cgindex  = cgindex;
 +
 +    dd->ncg_tot          = zone_cg_range[zones->n];
 +    dd->nat_tot          = nat_tot;
 +    comm->nat[ddnatHOME] = dd->nat_home;
 +    for (i = ddnatZONE; i < ddnatNR; i++)
 +    {
 +        comm->nat[i] = dd->nat_tot;
 +    }
 +
 +    if (!bBondComm)
 +    {
 +        /* We don't need to update cginfo, since that was alrady done above.
 +         * So we pass NULL for the forcerec.
 +         */
 +        dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
 +                      NULL, comm->bLocalCG);
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Finished setting up DD communication, zones:");
 +        for (c = 0; c < zones->n; c++)
 +        {
 +            fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static void set_cg_boundaries(gmx_domdec_zones_t *zones)
 +{
 +    int c;
 +
 +    for (c = 0; c < zones->nizone; c++)
 +    {
 +        zones->izone[c].cg1  = zones->cg_range[c+1];
 +        zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
 +        zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
 +    }
 +}
 +
 +static void set_zones_size(gmx_domdec_t *dd,
 +                           matrix box, const gmx_ddbox_t *ddbox,
 +                           int zone_start, int zone_end)
 +{
 +    gmx_domdec_comm_t  *comm;
 +    gmx_domdec_zones_t *zones;
 +    gmx_bool            bDistMB;
 +    int                 z, zi, zj0, zj1, d, dim;
 +    real                rcs, rcmbs;
 +    int                 i, j;
 +    real                size_j, add_tric;
 +    real                vol;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    for (z = zone_start; z < zone_end; z++)
 +    {
 +        /* Copy cell limits to zone limits.
 +         * Valid for non-DD dims and non-shifted dims.
 +         */
 +        copy_rvec(comm->cell_x0, zones->size[z].x0);
 +        copy_rvec(comm->cell_x1, zones->size[z].x1);
 +    }
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        for (z = 0; z < zones->n; z++)
 +        {
 +            /* With a staggered grid we have different sizes
 +             * for non-shifted dimensions.
 +             */
 +            if (dd->bGridJump && zones->shift[z][dim] == 0)
 +            {
 +                if (d == 1)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +                else if (d == 2)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +            }
 +        }
 +
 +        rcs   = comm->cutoff;
 +        rcmbs = comm->cutoff_mbody;
 +        if (ddbox->tric_dir[dim])
 +        {
 +            rcs   /= ddbox->skew_fac[dim];
 +            rcmbs /= ddbox->skew_fac[dim];
 +        }
 +
 +        /* Set the lower limit for the shifted zone dimensions */
 +        for (z = zone_start; z < zone_end; z++)
 +        {
 +            if (zones->shift[z][dim] > 0)
 +            {
 +                dim = dd->dim[d];
 +                if (!dd->bGridJump || d == 0)
 +                {
 +                    zones->size[z].x0[dim] = comm->cell_x1[dim];
 +                    zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
 +                }
 +                else
 +                {
 +                    /* Here we take the lower limit of the zone from
 +                     * the lowest domain of the zone below.
 +                     */
 +                    if (z < 4)
 +                    {
 +                        zones->size[z].x0[dim] =
 +                            comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
 +                    }
 +                    else
 +                    {
 +                        if (d == 1)
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                zones->size[zone_perm[2][z-4]].x0[dim];
 +                        }
 +                        else
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
 +                        }
 +                    }
 +                    /* A temporary limit, is updated below */
 +                    zones->size[z].x1[dim] = zones->size[z].x0[dim];
 +
 +                    if (bDistMB)
 +                    {
 +                        for (zi = 0; zi < zones->nizone; zi++)
 +                        {
 +                            if (zones->shift[zi][dim] == 0)
 +                            {
 +                                /* This takes the whole zone into account.
 +                                 * With multiple pulses this will lead
 +                                 * to a larger zone then strictly necessary.
 +                                 */
 +                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                             zones->size[zi].x1[dim]+rcmbs);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        /* Loop over the i-zones to set the upper limit of each
 +         * j-zone they see.
 +         */
 +        for (zi = 0; zi < zones->nizone; zi++)
 +        {
 +            if (zones->shift[zi][dim] == 0)
 +            {
 +                for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
 +                {
 +                    if (zones->shift[z][dim] > 0)
 +                    {
 +                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                     zones->size[zi].x1[dim]+rcs);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    for (z = zone_start; z < zone_end; z++)
 +    {
 +        /* Initialization only required to keep the compiler happy */
 +        rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
 +        int  nc, c;
 +
 +        /* To determine the bounding box for a zone we need to find
 +         * the extreme corners of 4, 2 or 1 corners.
 +         */
 +        nc = 1 << (ddbox->npbcdim - 1);
 +
 +        for (c = 0; c < nc; c++)
 +        {
 +            /* Set up a zone corner at x=0, ignoring trilinic couplings */
 +            corner[XX] = 0;
 +            if ((c & 1) == 0)
 +            {
 +                corner[YY] = zones->size[z].x0[YY];
 +            }
 +            else
 +            {
 +                corner[YY] = zones->size[z].x1[YY];
 +            }
 +            if ((c & 2) == 0)
 +            {
 +                corner[ZZ] = zones->size[z].x0[ZZ];
 +            }
 +            else
 +            {
 +                corner[ZZ] = zones->size[z].x1[ZZ];
 +            }
 +            if (dd->ndim == 1 && box[ZZ][YY] != 0)
 +            {
 +                /* With 1D domain decomposition the cg's are not in
 +                 * the triclinic box, but triclinic x-y and rectangular y-z.
 +                 * Shift y back, so it will later end up at 0.
 +                 */
 +                corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
 +            }
 +            /* Apply the triclinic couplings */
 +            assert(ddbox->npbcdim <= DIM);
 +            for (i = YY; i < ddbox->npbcdim; i++)
 +            {
 +                for (j = XX; j < i; j++)
 +                {
 +                    corner[j] += corner[i]*box[i][j]/box[i][i];
 +                }
 +            }
 +            if (c == 0)
 +            {
 +                copy_rvec(corner, corner_min);
 +                copy_rvec(corner, corner_max);
 +            }
 +            else
 +            {
 +                for (i = 0; i < DIM; i++)
 +                {
 +                    corner_min[i] = min(corner_min[i], corner[i]);
 +                    corner_max[i] = max(corner_max[i], corner[i]);
 +                }
 +            }
 +        }
 +        /* Copy the extreme cornes without offset along x */
 +        for (i = 0; i < DIM; i++)
 +        {
 +            zones->size[z].bb_x0[i] = corner_min[i];
 +            zones->size[z].bb_x1[i] = corner_max[i];
 +        }
 +        /* Add the offset along x */
 +        zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
 +        zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
 +    }
 +
 +    if (zone_start == 0)
 +    {
 +        vol = 1;
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
 +        }
 +        zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
 +    }
 +
 +    if (debug)
 +    {
 +        for (z = zone_start; z < zone_end; z++)
 +        {
 +            fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].x0[XX], zones->size[z].x1[XX],
 +                    zones->size[z].x0[YY], zones->size[z].x1[YY],
 +                    zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
 +            fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
 +                    zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
 +                    zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
 +        }
 +    }
 +}
 +
 +static int comp_cgsort(const void *a, const void *b)
 +{
 +    int           comp;
 +
 +    gmx_cgsort_t *cga, *cgb;
 +    cga = (gmx_cgsort_t *)a;
 +    cgb = (gmx_cgsort_t *)b;
 +
 +    comp = cga->nsc - cgb->nsc;
 +    if (comp == 0)
 +    {
 +        comp = cga->ind_gl - cgb->ind_gl;
 +    }
 +
 +    return comp;
 +}
 +
 +static void order_int_cg(int n, const gmx_cgsort_t *sort,
 +                         int *a, int *buf)
 +{
 +    int i;
 +
 +    /* Order the data */
 +    for (i = 0; i < n; i++)
 +    {
 +        buf[i] = a[sort[i].ind];
 +    }
 +
 +    /* Copy back to the original array */
 +    for (i = 0; i < n; i++)
 +    {
 +        a[i] = buf[i];
 +    }
 +}
 +
 +static void order_vec_cg(int n, const gmx_cgsort_t *sort,
 +                         rvec *v, rvec *buf)
 +{
 +    int i;
 +
 +    /* Order the data */
 +    for (i = 0; i < n; i++)
 +    {
 +        copy_rvec(v[sort[i].ind], buf[i]);
 +    }
 +
 +    /* Copy back to the original array */
 +    for (i = 0; i < n; i++)
 +    {
 +        copy_rvec(buf[i], v[i]);
 +    }
 +}
 +
 +static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
 +                           rvec *v, rvec *buf)
 +{
 +    int a, atot, cg, cg0, cg1, i;
 +
 +    if (cgindex == NULL)
 +    {
 +        /* Avoid the useless loop of the atoms within a cg */
 +        order_vec_cg(ncg, sort, v, buf);
 +
 +        return;
 +    }
 +
 +    /* Order the data */
 +    a = 0;
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        cg0 = cgindex[sort[cg].ind];
 +        cg1 = cgindex[sort[cg].ind+1];
 +        for (i = cg0; i < cg1; i++)
 +        {
 +            copy_rvec(v[i], buf[a]);
 +            a++;
 +        }
 +    }
 +    atot = a;
 +
 +    /* Copy back to the original array */
 +    for (a = 0; a < atot; a++)
 +    {
 +        copy_rvec(buf[a], v[a]);
 +    }
 +}
 +
 +static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
 +                         int nsort_new, gmx_cgsort_t *sort_new,
 +                         gmx_cgsort_t *sort1)
 +{
 +    int i1, i2, i_new;
 +
 +    /* The new indices are not very ordered, so we qsort them */
 +    gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
 +
 +    /* sort2 is already ordered, so now we can merge the two arrays */
 +    i1    = 0;
 +    i2    = 0;
 +    i_new = 0;
 +    while (i2 < nsort2 || i_new < nsort_new)
 +    {
 +        if (i2 == nsort2)
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +        else if (i_new == nsort_new)
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
 +                 (sort2[i2].nsc == sort_new[i_new].nsc &&
 +                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +    }
 +}
 +
 +static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t      *cgsort, *sort_i;
 +    int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
 +    int                sort_last, sort_skip;
 +
 +    sort = dd->comm->sort;
 +
 +    a = fr->ns.grid->cell_index;
 +
 +    moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
 +
 +    if (ncg_home_old >= 0)
 +    {
 +        /* The charge groups that remained in the same ns grid cell
 +         * are completely ordered. So we can sort efficiently by sorting
 +         * the charge groups that did move into the stationary list.
 +         */
 +        ncg_new   = 0;
 +        nsort2    = 0;
 +        nsort_new = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            /* Check if this cg did not move to another node */
 +            if (a[i] < moved)
 +            {
 +                if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
 +                {
 +                    /* This cg is new on this node or moved ns grid cell */
 +                    if (nsort_new >= sort->sort_new_nalloc)
 +                    {
 +                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
 +                        srenew(sort->sort_new, sort->sort_new_nalloc);
 +                    }
 +                    sort_i = &(sort->sort_new[nsort_new++]);
 +                }
 +                else
 +                {
 +                    /* This cg did not move */
 +                    sort_i = &(sort->sort2[nsort2++]);
 +                }
 +                /* Sort on the ns grid cell indices
 +                 * and the global topology index.
 +                 * index_gl is irrelevant with cell ns,
 +                 * but we set it here anyhow to avoid a conditional.
 +                 */
 +                sort_i->nsc    = a[i];
 +                sort_i->ind_gl = dd->index_gl[i];
 +                sort_i->ind    = i;
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
 +                    nsort2, nsort_new);
 +        }
 +        /* Sort efficiently */
 +        ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
 +                     sort->sort);
 +    }
 +    else
 +    {
 +        cgsort  = sort->sort;
 +        ncg_new = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            /* Sort on the ns grid cell indices
 +             * and the global topology index
 +             */
 +            cgsort[i].nsc    = a[i];
 +            cgsort[i].ind_gl = dd->index_gl[i];
 +            cgsort[i].ind    = i;
 +            if (cgsort[i].nsc < moved)
 +            {
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
 +        }
 +        /* Determine the order of the charge groups using qsort */
 +        gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
 +    }
 +
 +    return ncg_new;
 +}
 +
 +static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
 +{
 +    gmx_cgsort_t *sort;
 +    int           ncg_new, i, *a, na;
 +
 +    sort = dd->comm->sort->sort;
 +
 +    nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
 +
 +    ncg_new = 0;
 +    for (i = 0; i < na; i++)
 +    {
 +        if (a[i] >= 0)
 +        {
 +            sort[ncg_new].ind = a[i];
 +            ncg_new++;
 +        }
 +    }
 +
 +    return ncg_new;
 +}
 +
 +static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
 +                          int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t      *cgsort, *sort_i;
 +    int               *cgindex;
 +    int                ncg_new, i, *ibuf, cgsize;
 +    rvec              *vbuf;
 +
 +    sort = dd->comm->sort;
 +
 +    if (dd->ncg_home > sort->sort_nalloc)
 +    {
 +        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(sort->sort, sort->sort_nalloc);
 +        srenew(sort->sort2, sort->sort_nalloc);
 +    }
 +    cgsort = sort->sort;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            ncg_new = dd_sort_order(dd, fr, ncg_home_old);
 +            break;
 +        case ecutsVERLET:
 +            ncg_new = dd_sort_order_nbnxn(dd, fr);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            ncg_new = 0;
 +    }
 +
 +    /* We alloc with the old size, since cgindex is still old */
 +    vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
 +    vbuf = dd->comm->vbuf.v;
 +
 +    if (dd->comm->bCGs)
 +    {
 +        cgindex = dd->cgindex;
 +    }
 +    else
 +    {
 +        cgindex = NULL;
 +    }
 +
 +    /* Remove the charge groups which are no longer at home here */
 +    dd->ncg_home = ncg_new;
 +    if (debug)
 +    {
 +        fprintf(debug, "Set the new home charge group count to %d\n",
 +                dd->ncg_home);
 +    }
 +
 +    /* Reorder the state */
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +                case estX:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
 +                    break;
 +                case estV:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
 +                    break;
 +                case estSDX:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
 +                    break;
 +                case estCGP:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
 +                    break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No ordering required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_sort_state");
 +                    break;
 +            }
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* Reorder cgcm */
 +        order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
 +    }
 +
 +    if (dd->ncg_home+1 > sort->ibuf_nalloc)
 +    {
 +        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
 +        srenew(sort->ibuf, sort->ibuf_nalloc);
 +    }
 +    ibuf = sort->ibuf;
 +    /* Reorder the global cg index */
 +    order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
 +    /* Reorder the cginfo */
 +    order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
 +    /* Rebuild the local cg index */
 +    if (dd->comm->bCGs)
 +    {
 +        ibuf[0] = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
 +            ibuf[i+1] = ibuf[i] + cgsize;
 +        }
 +        for (i = 0; i < dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = ibuf[i];
 +        }
 +    }
 +    else
 +    {
 +        for (i = 0; i < dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = i;
 +        }
 +    }
 +    /* Set the home atom number */
 +    dd->nat_home = dd->cgindex[dd->ncg_home];
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        /* The atoms are now exactly in grid order, update the grid order */
 +        nbnxn_set_atomorder(fr->nbv->nbs);
 +    }
 +    else
 +    {
 +        /* Copy the sorted ns cell indices back to the ns grid struct */
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            fr->ns.grid->cell_index[i] = cgsort[i].nsc;
 +        }
 +        fr->ns.grid->nr = dd->ncg_home;
 +    }
 +}
 +
 +static void add_dd_statistics(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +
 +    comm = dd->comm;
 +
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] +=
 +            comm->nat[ddnat] - comm->nat[ddnat-1];
 +    }
 +    comm->ndecomp++;
 +}
 +
 +void reset_dd_statistics_counters(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +
 +    comm = dd->comm;
 +
 +    /* Reset all the statistics and counters for total run counting */
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] = 0;
 +    }
 +    comm->ndecomp   = 0;
 +    comm->nload     = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf = 0;
 +    comm->load_pme = 0;
 +}
 +
 +void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +    double             av;
 +
 +    comm = cr->dd->comm;
 +
 +    gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
 +
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
 +        switch (ddnat)
 +        {
 +            case ddnatZONE:
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for force:  %d x %.1f\n",
 +                        2, av);
 +                break;
 +            case ddnatVSITE:
 +                if (cr->dd->vsite_comm)
 +                {
 +                    fprintf(fplog,
 +                            " av. #atoms communicated per step for vsites: %d x %.1f\n",
 +                            (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
 +                            av);
 +                }
 +                break;
 +            case ddnatCON:
 +                if (cr->dd->constraint_comm)
 +                {
 +                    fprintf(fplog,
 +                            " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
 +                            1 + ir->nLincsIter, av);
 +                }
 +                break;
 +            default:
 +                gmx_incons(" Unknown type for DD statistics");
 +        }
 +    }
 +    fprintf(fplog, "\n");
 +
 +    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
 +    {
 +        print_dd_load_av(fplog, cr->dd);
 +    }
 +}
 +
 +void dd_partition_system(FILE                *fplog,
 +                         gmx_int64_t          step,
 +                         t_commrec           *cr,
 +                         gmx_bool             bMasterState,
 +                         int                  nstglobalcomm,
 +                         t_state             *state_global,
 +                         gmx_mtop_t          *top_global,
 +                         t_inputrec          *ir,
 +                         t_state             *state_local,
 +                         rvec               **f,
 +                         t_mdatoms           *mdatoms,
 +                         gmx_localtop_t      *top_local,
 +                         t_forcerec          *fr,
 +                         gmx_vsite_t         *vsite,
 +                         gmx_shellfc_t        shellfc,
 +                         gmx_constr_t         constr,
 +                         t_nrnb              *nrnb,
 +                         gmx_wallcycle_t      wcycle,
 +                         gmx_bool             bVerbose)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    gmx_ddbox_t        ddbox = {0};
 +    t_block           *cgs_gl;
 +    gmx_int64_t        step_pcoupl;
 +    rvec               cell_ns_x0, cell_ns_x1;
 +    int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
 +    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
 +    gmx_bool           bRedist, bSortCG, bResortAll;
 +    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
 +    real               grid_density;
 +    char               sbuf[22];
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    bBoxChanged = (bMasterState || DEFORM(*ir));
 +    if (ir->epc != epcNO)
 +    {
 +        /* With nstpcouple > 1 pressure coupling happens.
 +         * one step after calculating the pressure.
 +         * Box scaling happens at the end of the MD step,
 +         * after the DD partitioning.
 +         * We therefore have to do DLB in the first partitioning
 +         * after an MD step where P-coupling occured.
 +         * We need to determine the last step in which p-coupling occurred.
 +         * MRS -- need to validate this for vv?
 +         */
 +        n = ir->nstpcouple;
 +        if (n == 1)
 +        {
 +            step_pcoupl = step - 1;
 +        }
 +        else
 +        {
 +            step_pcoupl = ((step - 1)/n)*n + 1;
 +        }
 +        if (step_pcoupl >= comm->partition_step)
 +        {
 +            bBoxChanged = TRUE;
 +        }
 +    }
 +
 +    bNStGlobalComm = (step % nstglobalcomm == 0);
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        bDoDLB = FALSE;
 +    }
 +    else
 +    {
 +        /* Should we do dynamic load balacing this step?
 +         * Since it requires (possibly expensive) global communication,
 +         * we might want to do DLB less frequently.
 +         */
 +        if (bBoxChanged || ir->epc != epcNO)
 +        {
 +            bDoDLB = bBoxChanged;
 +        }
 +        else
 +        {
 +            bDoDLB = bNStGlobalComm;
 +        }
 +    }
 +
 +    /* Check if we have recorded loads on the nodes */
 +    if (comm->bRecordLoad && dd_load_count(comm))
 +    {
 +        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
 +        {
 +            /* Check if we should use DLB at the second partitioning
 +             * and every 100 partitionings,
 +             * so the extra communication cost is negligible.
 +             */
 +            n         = max(100, nstglobalcomm);
 +            bCheckDLB = (comm->n_load_collect == 0 ||
 +                         comm->n_load_have % n == n-1);
 +        }
 +        else
 +        {
 +            bCheckDLB = FALSE;
 +        }
 +
 +        /* Print load every nstlog, first and last step to the log file */
 +        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
 +                    comm->n_load_collect == 0 ||
 +                    (ir->nsteps >= 0 &&
 +                     (step + ir->nstlist > ir->init_step + ir->nsteps)));
 +
 +        /* Avoid extra communication due to verbose screen output
 +         * when nstglobalcomm is set.
 +         */
 +        if (bDoDLB || bLogLoad || bCheckDLB ||
 +            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
 +        {
 +            get_load_distribution(dd, wcycle);
 +            if (DDMASTER(dd))
 +            {
 +                if (bLogLoad)
 +                {
 +                    dd_print_load(fplog, dd, step-1);
 +                }
 +                if (bVerbose)
 +                {
 +                    dd_print_load_verbose(dd);
 +                }
 +            }
 +            comm->n_load_collect++;
 +
 +            if (bCheckDLB)
 +            {
 +                /* Since the timings are node dependent, the master decides */
 +                if (DDMASTER(dd))
 +                {
 +                    bTurnOnDLB =
 +                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
 +                    if (debug)
 +                    {
 +                        fprintf(debug, "step %s, imb loss %f\n",
 +                                gmx_step_str(step, sbuf),
 +                                dd_force_imb_perf_loss(dd));
 +                    }
 +                }
 +                dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
 +                if (bTurnOnDLB)
 +                {
 +                    turn_on_dlb(fplog, cr, step);
 +                    bDoDLB = TRUE;
 +                }
 +            }
 +        }
 +        comm->n_load_have++;
 +    }
 +
 +    cgs_gl = &comm->cgs_gl;
 +
 +    bRedist = FALSE;
 +    if (bMasterState)
 +    {
 +        /* Clear the old state */
 +        clear_dd_indices(dd, 0, 0);
 +        ncgindex_set = 0;
 +
 +        set_ddbox(dd, bMasterState, cr, ir, state_global->box,
 +                  TRUE, cgs_gl, state_global->x, &ddbox);
 +
 +        get_cg_distribution(fplog, step, dd, cgs_gl,
 +                            state_global->box, &ddbox, state_global->x);
 +
 +        dd_distribute_state(dd, cgs_gl,
 +                            state_global, state_local, f);
 +
 +        dd_make_local_cgs(dd, &top_local->cgs);
 +
 +        /* Ensure that we have space for the new distribution */
 +        dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            calc_cgcm(fplog, 0, dd->ncg_home,
 +                      &top_local->cgs, state_local->x, fr->cg_cm);
 +        }
 +
 +        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
 +    }
 +    else if (state_local->ddp_count != dd->ddp_count)
 +    {
 +        if (state_local->ddp_count > dd->ddp_count)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
 +        }
 +
 +        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
 +        }
 +
 +        /* Clear the old state */
 +        clear_dd_indices(dd, 0, 0);
 +
 +        /* Build the new indices */
 +        rebuild_cgindex(dd, cgs_gl->index, state_local);
 +        make_dd_indices(dd, cgs_gl->index, 0);
 +        ncgindex_set = dd->ncg_home;
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            /* Redetermine the cg COMs */
 +            calc_cgcm(fplog, 0, dd->ncg_home,
 +                      &top_local->cgs, state_local->x, fr->cg_cm);
 +        }
 +
 +        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
 +
 +        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
 +                  TRUE, &top_local->cgs, state_local->x, &ddbox);
 +
 +        bRedist = comm->bDynLoadBal;
 +    }
 +    else
 +    {
 +        /* We have the full state, only redistribute the cgs */
 +
 +        /* Clear the non-home indices */
 +        clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
 +        ncgindex_set = 0;
 +
 +        /* Avoid global communication for dim's without pbc and -gcom */
 +        if (!bNStGlobalComm)
 +        {
 +            copy_rvec(comm->box0, ddbox.box0    );
 +            copy_rvec(comm->box_size, ddbox.box_size);
 +        }
 +        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
 +                  bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
 +
 +        bBoxChanged = TRUE;
 +        bRedist     = TRUE;
 +    }
 +    /* For dim's without pbc and -gcom */
 +    copy_rvec(ddbox.box0, comm->box0    );
 +    copy_rvec(ddbox.box_size, comm->box_size);
 +
 +    set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
 +                      step, wcycle);
 +
 +    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
 +    {
 +        write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
 +    }
 +
 +    /* Check if we should sort the charge groups */
 +    if (comm->nstSortCG > 0)
 +    {
 +        bSortCG = (bMasterState ||
 +                   (bRedist && (step % comm->nstSortCG == 0)));
 +    }
 +    else
 +    {
 +        bSortCG = FALSE;
 +    }
 +
 +    ncg_home_old = dd->ncg_home;
 +
 +    ncg_moved = 0;
 +    if (bRedist)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsDD_REDIST);
 +
 +        dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
 +                           state_local, f, fr,
 +                           !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
 +
 +        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
 +    }
 +
 +    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
 +                          dd, &ddbox,
 +                          &comm->cell_x0, &comm->cell_x1,
 +                          dd->ncg_home, fr->cg_cm,
 +                          cell_ns_x0, cell_ns_x1, &grid_density);
 +
 +    if (bBoxChanged)
 +    {
 +        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
 +    }
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            copy_ivec(fr->ns.grid->n, ncells_old);
 +            grid_first(fplog, fr->ns.grid, dd, &ddbox,
 +                       state_local->box, cell_ns_x0, cell_ns_x1,
 +                       fr->rlistlong, grid_density);
 +            break;
 +        case ecutsVERLET:
 +            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +    }
 +    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
 +    copy_ivec(ddbox.tric_dir, comm->tric_dir);
 +
 +    if (bSortCG)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsDD_GRID);
 +
 +        /* Sort the state on charge group position.
 +         * This enables exact restarts from this step.
 +         * It also improves performance by about 15% with larger numbers
 +         * of atoms per node.
 +         */
 +
 +        /* Fill the ns grid with the home cell,
 +         * so we can sort with the indices.
 +         */
 +        set_zones_ncg_home(dd);
 +
 +        switch (fr->cutoff_scheme)
 +        {
 +            case ecutsVERLET:
 +                set_zones_size(dd, state_local->box, &ddbox, 0, 1);
 +
 +                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
 +                                  0,
 +                                  comm->zones.size[0].bb_x0,
 +                                  comm->zones.size[0].bb_x1,
 +                                  0, dd->ncg_home,
 +                                  comm->zones.dens_zone0,
 +                                  fr->cginfo,
 +                                  state_local->x,
 +                                  ncg_moved, bRedist ? comm->moved : NULL,
 +                                  fr->nbv->grp[eintLocal].kernel_type,
 +                                  fr->nbv->grp[eintLocal].nbat);
 +
 +                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
 +                break;
 +            case ecutsGROUP:
 +                fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
 +                          0, dd->ncg_home, fr->cg_cm);
 +
 +                copy_ivec(fr->ns.grid->n, ncells_new);
 +                break;
 +            default:
 +                gmx_incons("unimplemented");
 +        }
 +
 +        bResortAll = bMasterState;
 +
 +        /* Check if we can user the old order and ns grid cell indices
 +         * of the charge groups to sort the charge groups efficiently.
 +         */
 +        if (ncells_new[XX] != ncells_old[XX] ||
 +            ncells_new[YY] != ncells_old[YY] ||
 +            ncells_new[ZZ] != ncells_old[ZZ])
 +        {
 +            bResortAll = TRUE;
 +        }
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
 +                    gmx_step_str(step, sbuf), dd->ncg_home);
 +        }
 +        dd_sort_state(dd, fr->cg_cm, fr, state_local,
 +                      bResortAll ? -1 : ncg_home_old);
 +        /* Rebuild all the indices */
 +        ga2la_clear(dd->ga2la);
 +        ncgindex_set = 0;
 +
 +        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
 +    }
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
 +
 +    /* Setup up the communication and communicate the coordinates */
 +    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
 +
 +    /* Set the indices */
 +    make_dd_indices(dd, cgs_gl->index, ncgindex_set);
 +
 +    /* Set the charge group boundaries for neighbor searching */
 +    set_cg_boundaries(&comm->zones);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        set_zones_size(dd, state_local->box, &ddbox,
 +                       bSortCG ? 1 : 0, comm->zones.n);
 +    }
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
 +
 +    /*
 +       write_dd_pdb("dd_home",step,"dump",top_global,cr,
 +                 -1,state_local->x,state_local->box);
 +     */
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
 +
 +    /* Extract a local topology from the global topology */
 +    for (i = 0; i < dd->ndim; i++)
 +    {
 +        np[dd->dim[i]] = comm->cd[i].np;
 +    }
 +    dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
 +                      comm->cellsize_min, np,
 +                      fr,
 +                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
 +                      vsite, top_global, top_local);
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
 +
 +    /* Set up the special atom communication */
 +    n = comm->nat[ddnatZONE];
 +    for (i = ddnatZONE+1; i < ddnatNR; i++)
 +    {
 +        switch (i)
 +        {
 +            case ddnatVSITE:
 +                if (vsite && vsite->n_intercg_vsite)
 +                {
 +                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
 +                }
 +                break;
 +            case ddnatCON:
 +                if (dd->bInterCGcons || dd->bInterCGsettles)
 +                {
 +                    /* Only for inter-cg constraints we need special code */
 +                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
 +                                                  constr, ir->nProjOrder,
 +                                                  top_local->idef.il);
 +                }
 +                break;
 +            default:
 +                gmx_incons("Unknown special atom type setup");
 +        }
 +        comm->nat[i] = n;
 +    }
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
 +
 +    /* Make space for the extra coordinates for virtual site
 +     * or constraint communication.
 +     */
 +    state_local->natoms = comm->nat[ddnatNR-1];
 +    if (state_local->natoms > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local, f, state_local->natoms);
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        if (vsite && vsite->n_intercg_vsite)
 +        {
 +            nat_f_novirsum = comm->nat[ddnatVSITE];
 +        }
 +        else
 +        {
 +            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
 +            {
 +                nat_f_novirsum = dd->nat_tot;
 +            }
 +            else
 +            {
 +                nat_f_novirsum = dd->nat_home;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nat_f_novirsum = 0;
 +    }
 +
 +    /* Set the number of atoms required for the force calculation.
 +     * Forces need to be constrained when using a twin-range setup
 +     * or with energy minimization. For simple simulations we could
 +     * avoid some allocation, zeroing and copying, but this is
 +     * probably not worth the complications ande checking.
 +     */
 +    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
 +                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
 +
 +    /* We make the all mdatoms up to nat_tot_con.
 +     * We could save some work by only setting invmass
 +     * between nat_tot and nat_tot_con.
 +     */
 +    /* This call also sets the new number of home particles to dd->nat_home */
 +    atoms2md(top_global, ir,
 +             comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);
 +
 +    /* Now we have the charges we can sort the FE interactions */
 +    dd_sort_local_top(dd, mdatoms, top_local);
 +
 +    if (vsite != NULL)
 +    {
 +        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
++        split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
++                                  mdatoms, FALSE, vsite);
 +    }
 +
 +    if (shellfc)
 +    {
 +        /* Make the local shell stuff, currently no communication is done */
 +        make_local_shells(cr, mdatoms, shellfc);
 +    }
 +
 +    if (ir->implicit_solvent)
 +    {
 +        make_local_gb(cr, fr->born, ir->gb_algorithm);
 +    }
 +
 +    setup_bonded_threading(fr, &top_local->idef);
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Send the charges and/or c6/sigmas to our PME only node */
 +        gmx_pme_send_parameters(cr,
 +                                fr->ic,
 +                                mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
 +                                mdatoms->chargeA, mdatoms->chargeB,
 +                                mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
 +                                mdatoms->sigmaA, mdatoms->sigmaB,
 +                                dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
 +    }
 +
 +    if (constr)
 +    {
 +        set_constraints(constr, top_local, ir, mdatoms, cr);
 +    }
 +
 +    if (ir->ePull != epullNO)
 +    {
 +        /* Update the local pull groups */
 +        dd_make_local_pull_groups(dd, ir->pull, mdatoms);
 +    }
 +
 +    if (ir->bRot)
 +    {
 +        /* Update the local rotation groups */
 +        dd_make_local_rotation_groups(dd, ir->rot);
 +    }
 +
 +    if (ir->eSwapCoords != eswapNO)
 +    {
 +        /* Update the local groups needed for ion swapping */
 +        dd_make_local_swap_groups(dd, ir->swap);
 +    }
 +
 +    /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
 +    dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
 +
 +    add_dd_statistics(dd);
 +
 +    /* Make sure we only count the cycles for this DD partitioning */
 +    clear_dd_cycle_counts(dd);
 +
 +    /* Because the order of the atoms might have changed since
 +     * the last vsite construction, we need to communicate the constructing
 +     * atom coordinates again (for spreading the forces this MD step).
 +     */
 +    dd_move_x_vsites(dd, state_local->box, state_local->x);
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
 +
 +    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
 +    {
 +        dd_move_x(dd, state_local->box, state_local->x);
 +        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
 +                     -1, state_local->x, state_local->box);
 +    }
 +
 +    /* Store the partitioning step */
 +    comm->partition_step = step;
 +
 +    /* Increase the DD partitioning counter */
 +    dd->ddp_count++;
 +    /* The state currently matches this DD partitioning count, store it */
 +    state_local->ddp_count = dd->ddp_count;
 +    if (bMasterState)
 +    {
 +        /* The DD master node knows the complete cg distribution,
 +         * store the count so we can possibly skip the cg info communication.
 +         */
 +        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
 +    }
 +
 +    if (comm->DD_debug > 0)
 +    {
 +        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
 +        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
 +                                "after partitioning");
 +    }
 +}
index 532fa3a170e91d0fbca6bfd5b7d3179e16ca2edc,0000000000000000000000000000000000000000..d070abdb597bccfcfa30ad19309601697d7766d1
mode 100644,000000..100644
--- /dev/null
@@@ -1,5348 -1,0 +1,5384 @@@
-          * Because grid overlap communication only goes forward,
-          * the grid the slabs for fft's should be rounded down.
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team.
 + * Copyright (c) 2013,2014, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +/* IMPORTANT FOR DEVELOPERS:
 + *
 + * Triclinic pme stuff isn't entirely trivial, and we've experienced
 + * some bugs during development (many of them due to me). To avoid
 + * this in the future, please check the following things if you make
 + * changes in this file:
 + *
 + * 1. You should obtain identical (at least to the PME precision)
 + *    energies, forces, and virial for
 + *    a rectangular box and a triclinic one where the z (or y) axis is
 + *    tilted a whole box side. For instance you could use these boxes:
 + *
 + *    rectangular       triclinic
 + *     2  0  0           2  0  0
 + *     0  2  0           0  2  0
 + *     0  0  6           2  2  6
 + *
 + * 2. You should check the energy conservation in a triclinic box.
 + *
 + * It might seem an overkill, but better safe than sorry.
 + * /Erik 001109
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <string.h>
 +#include <math.h>
 +#include <assert.h>
 +#include "typedefs.h"
 +#include "txtdump.h"
 +#include "vec.h"
 +#include "gromacs/utility/smalloc.h"
 +#include "coulomb.h"
 +#include "gmx_fatal.h"
 +#include "pme.h"
 +#include "network.h"
 +#include "physics.h"
 +#include "nrnb.h"
 +#include "macros.h"
 +
 +#include "gromacs/fft/parallel_3dfft.h"
 +#include "gromacs/fileio/futil.h"
 +#include "gromacs/fileio/pdbio.h"
 +#include "gromacs/math/gmxcomplex.h"
 +#include "gromacs/timing/cyclecounter.h"
 +#include "gromacs/timing/wallcycle.h"
 +#include "gromacs/utility/gmxmpi.h"
 +#include "gromacs/utility/gmxomp.h"
 +
 +/* Include the SIMD macro file and then check for support */
 +#include "gromacs/simd/simd.h"
 +#include "gromacs/simd/simd_math.h"
 +#ifdef GMX_SIMD_HAVE_REAL
 +/* Turn on arbitrary width SIMD intrinsics for PME solve */
 +#    define PME_SIMD_SOLVE
 +#endif
 +
 +#define PME_GRID_QA    0 /* Gridindex for A-state for Q */
 +#define PME_GRID_C6A   2 /* Gridindex for A-state for LJ */
 +#define DO_Q           2 /* Electrostatic grids have index q<2 */
 +#define DO_Q_AND_LJ    4 /* non-LB LJ grids have index 2 <= q < 4 */
 +#define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */
 +
 +/* Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */
 +const real lb_scale_factor[] = {
 +    1.0/64, 6.0/64, 15.0/64, 20.0/64,
 +    15.0/64, 6.0/64, 1.0/64
 +};
 +
 +/* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
 +const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
 +
 +/* Check if we have 4-wide SIMD macro support */
 +#if (defined GMX_SIMD4_HAVE_REAL)
 +/* Do PME spread and gather with 4-wide SIMD.
 + * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
 + */
 +#    define PME_SIMD4_SPREAD_GATHER
 +
 +#    if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
 +/* With PME-order=4 on x86, unaligned load+store is slightly faster
 + * than doubling all SIMD operations when using aligned load+store.
 + */
 +#        define PME_SIMD4_UNALIGNED
 +#    endif
 +#endif
 +
 +#define DFT_TOL 1e-7
 +/* #define PRT_FORCE */
 +/* conditions for on the fly time-measurement */
 +/* #define TAKETIME (step > 1 && timesteps < 10) */
 +#define TAKETIME FALSE
 +
 +/* #define PME_TIME_THREADS */
 +
 +#ifdef GMX_DOUBLE
 +#define mpi_type MPI_DOUBLE
 +#else
 +#define mpi_type MPI_FLOAT
 +#endif
 +
 +#ifdef PME_SIMD4_SPREAD_GATHER
 +#    define SIMD4_ALIGNMENT  (GMX_SIMD4_WIDTH*sizeof(real))
 +#else
 +/* We can use any alignment, apart from 0, so we use 4 reals */
 +#    define SIMD4_ALIGNMENT  (4*sizeof(real))
 +#endif
 +
 +/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
 + * to preserve alignment.
 + */
 +#define GMX_CACHE_SEP 64
 +
 +/* We only define a maximum to be able to use local arrays without allocation.
 + * An order larger than 12 should never be needed, even for test cases.
 + * If needed it can be changed here.
 + */
 +#define PME_ORDER_MAX 12
 +
 +/* Internal datastructures */
 +typedef struct {
 +    int send_index0;
 +    int send_nindex;
 +    int recv_index0;
 +    int recv_nindex;
 +    int recv_size;   /* Receive buffer width, used with OpenMP */
 +} pme_grid_comm_t;
 +
 +typedef struct {
 +#ifdef GMX_MPI
 +    MPI_Comm         mpi_comm;
 +#endif
 +    int              nnodes, nodeid;
 +    int             *s2g0;
 +    int             *s2g1;
 +    int              noverlap_nodes;
 +    int             *send_id, *recv_id;
 +    int              send_size; /* Send buffer width, used with OpenMP */
 +    pme_grid_comm_t *comm_data;
 +    real            *sendbuf;
 +    real            *recvbuf;
 +} pme_overlap_t;
 +
 +typedef struct {
 +    int *n;      /* Cumulative counts of the number of particles per thread */
 +    int  nalloc; /* Allocation size of i */
 +    int *i;      /* Particle indices ordered on thread index (n) */
 +} thread_plist_t;
 +
 +typedef struct {
 +    int      *thread_one;
 +    int       n;
 +    int      *ind;
 +    splinevec theta;
 +    real     *ptr_theta_z;
 +    splinevec dtheta;
 +    real     *ptr_dtheta_z;
 +} splinedata_t;
 +
 +typedef struct {
 +    int      dimind;        /* The index of the dimension, 0=x, 1=y */
 +    int      nslab;
 +    int      nodeid;
 +#ifdef GMX_MPI
 +    MPI_Comm mpi_comm;
 +#endif
 +
 +    int     *node_dest;     /* The nodes to send x and q to with DD */
 +    int     *node_src;      /* The nodes to receive x and q from with DD */
 +    int     *buf_index;     /* Index for commnode into the buffers */
 +
 +    int      maxshift;
 +
 +    int      npd;
 +    int      pd_nalloc;
 +    int     *pd;
 +    int     *count;         /* The number of atoms to send to each node */
 +    int    **count_thread;
 +    int     *rcount;        /* The number of atoms to receive */
 +
 +    int      n;
 +    int      nalloc;
 +    rvec    *x;
 +    real    *coefficient;
 +    rvec    *f;
 +    gmx_bool bSpread;       /* These coordinates are used for spreading */
 +    int      pme_order;
 +    ivec    *idx;
 +    rvec    *fractx;            /* Fractional coordinate relative to
 +                                 * the lower cell boundary
 +                                 */
 +    int             nthread;
 +    int            *thread_idx; /* Which thread should spread which coefficient */
 +    thread_plist_t *thread_plist;
 +    splinedata_t   *spline;
 +} pme_atomcomm_t;
 +
 +#define FLBS  3
 +#define FLBSZ 4
 +
 +typedef struct {
 +    ivec  ci;     /* The spatial location of this grid         */
 +    ivec  n;      /* The used size of *grid, including order-1 */
 +    ivec  offset; /* The grid offset from the full node grid   */
 +    int   order;  /* PME spreading order                       */
 +    ivec  s;      /* The allocated size of *grid, s >= n       */
 +    real *grid;   /* The grid local thread, size n             */
 +} pmegrid_t;
 +
 +typedef struct {
 +    pmegrid_t  grid;         /* The full node grid (non thread-local)            */
 +    int        nthread;      /* The number of threads operating on this grid     */
 +    ivec       nc;           /* The local spatial decomposition over the threads */
 +    pmegrid_t *grid_th;      /* Array of grids for each thread                   */
 +    real      *grid_all;     /* Allocated array for the grids in *grid_th        */
 +    int      **g2t;          /* The grid to thread index                         */
 +    ivec       nthread_comm; /* The number of threads to communicate with        */
 +} pmegrids_t;
 +
 +typedef struct {
 +#ifdef PME_SIMD4_SPREAD_GATHER
 +    /* Masks for 4-wide SIMD aligned spreading and gathering */
 +    gmx_simd4_bool_t mask_S0[6], mask_S1[6];
 +#else
 +    int              dummy; /* C89 requires that struct has at least one member */
 +#endif
 +} pme_spline_work_t;
 +
 +typedef struct {
 +    /* work data for solve_pme */
 +    int      nalloc;
 +    real *   mhx;
 +    real *   mhy;
 +    real *   mhz;
 +    real *   m2;
 +    real *   denom;
 +    real *   tmp1_alloc;
 +    real *   tmp1;
 +    real *   tmp2;
 +    real *   eterm;
 +    real *   m2inv;
 +
 +    real     energy_q;
 +    matrix   vir_q;
 +    real     energy_lj;
 +    matrix   vir_lj;
 +} pme_work_t;
 +
 +typedef struct gmx_pme {
 +    int           ndecompdim; /* The number of decomposition dimensions */
 +    int           nodeid;     /* Our nodeid in mpi->mpi_comm */
 +    int           nodeid_major;
 +    int           nodeid_minor;
 +    int           nnodes;    /* The number of nodes doing PME */
 +    int           nnodes_major;
 +    int           nnodes_minor;
 +
 +    MPI_Comm      mpi_comm;
 +    MPI_Comm      mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
 +#ifdef GMX_MPI
 +    MPI_Datatype  rvec_mpi;      /* the pme vector's MPI type */
 +#endif
 +
 +    gmx_bool   bUseThreads;   /* Does any of the PME ranks have nthread>1 ?  */
 +    int        nthread;       /* The number of threads doing PME on our rank */
 +
 +    gmx_bool   bPPnode;       /* Node also does particle-particle forces */
 +    gmx_bool   bFEP;          /* Compute Free energy contribution */
 +    gmx_bool   bFEP_q;
 +    gmx_bool   bFEP_lj;
 +    int        nkx, nky, nkz; /* Grid dimensions */
 +    gmx_bool   bP3M;          /* Do P3M: optimize the influence function */
 +    int        pme_order;
 +    real       epsilon_r;
 +
 +    int        ljpme_combination_rule;  /* Type of combination rule in LJ-PME */
 +
 +    int        ngrids;                  /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups*/
 +
 +    pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
 +                                         * includes overlap Grid indices are ordered as
 +                                         * follows:
 +                                         * 0: Coloumb PME, state A
 +                                         * 1: Coloumb PME, state B
 +                                         * 2-8: LJ-PME
 +                                         * This can probably be done in a better way
 +                                         * but this simple hack works for now
 +                                         */
 +    /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
 +    int        pmegrid_nx, pmegrid_ny, pmegrid_nz;
 +    /* pmegrid_nz might be larger than strictly necessary to ensure
 +     * memory alignment, pmegrid_nz_base gives the real base size.
 +     */
 +    int     pmegrid_nz_base;
 +    /* The local PME grid starting indices */
 +    int     pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
 +
 +    /* Work data for spreading and gathering */
 +    pme_spline_work_t     *spline_work;
 +
 +    real                 **fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
 +    /* inside the interpolation grid, but separate for 2D PME decomp. */
 +    int                    fftgrid_nx, fftgrid_ny, fftgrid_nz;
 +
 +    t_complex            **cfftgrid;  /* Grids for complex FFT data */
 +
 +    int                    cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
 +
 +    gmx_parallel_3dfft_t  *pfft_setup;
 +
 +    int                   *nnx, *nny, *nnz;
 +    real                  *fshx, *fshy, *fshz;
 +
 +    pme_atomcomm_t         atc[2]; /* Indexed on decomposition index */
 +    matrix                 recipbox;
 +    splinevec              bsp_mod;
 +    /* Buffers to store data for local atoms for L-B combination rule
 +     * calculations in LJ-PME. lb_buf1 stores either the coefficients
 +     * for spreading/gathering (in serial), or the C6 coefficient for
 +     * local atoms (in parallel).  lb_buf2 is only used in parallel,
 +     * and stores the sigma values for local atoms. */
 +    real                 *lb_buf1, *lb_buf2;
 +    int                   lb_buf_nalloc; /* Allocation size for the above buffers. */
 +
 +    pme_overlap_t         overlap[2];    /* Indexed on dimension, 0=x, 1=y */
 +
 +    pme_atomcomm_t        atc_energy;    /* Only for gmx_pme_calc_energy */
 +
 +    rvec                 *bufv;          /* Communication buffer */
 +    real                 *bufr;          /* Communication buffer */
 +    int                   buf_nalloc;    /* The communication buffer size */
 +
 +    /* thread local work data for solve_pme */
 +    pme_work_t *work;
 +
 +    /* Work data for sum_qgrid */
 +    real *   sum_qgrid_tmp;
 +    real *   sum_qgrid_dd_tmp;
 +} t_gmx_pme;
 +
 +static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
 +                                   int start, int grid_index, int end, int thread)
 +{
 +    int             i;
 +    int            *idxptr, tix, tiy, tiz;
 +    real           *xptr, *fptr, tx, ty, tz;
 +    real            rxx, ryx, ryy, rzx, rzy, rzz;
 +    int             nx, ny, nz;
 +    int             start_ix, start_iy, start_iz;
 +    int            *g2tx, *g2ty, *g2tz;
 +    gmx_bool        bThreads;
 +    int            *thread_idx = NULL;
 +    thread_plist_t *tpl        = NULL;
 +    int            *tpl_n      = NULL;
 +    int             thread_i;
 +
 +    nx  = pme->nkx;
 +    ny  = pme->nky;
 +    nz  = pme->nkz;
 +
 +    start_ix = pme->pmegrid_start_ix;
 +    start_iy = pme->pmegrid_start_iy;
 +    start_iz = pme->pmegrid_start_iz;
 +
 +    rxx = pme->recipbox[XX][XX];
 +    ryx = pme->recipbox[YY][XX];
 +    ryy = pme->recipbox[YY][YY];
 +    rzx = pme->recipbox[ZZ][XX];
 +    rzy = pme->recipbox[ZZ][YY];
 +    rzz = pme->recipbox[ZZ][ZZ];
 +
 +    g2tx = pme->pmegrid[grid_index].g2t[XX];
 +    g2ty = pme->pmegrid[grid_index].g2t[YY];
 +    g2tz = pme->pmegrid[grid_index].g2t[ZZ];
 +
 +    bThreads = (atc->nthread > 1);
 +    if (bThreads)
 +    {
 +        thread_idx = atc->thread_idx;
 +
 +        tpl   = &atc->thread_plist[thread];
 +        tpl_n = tpl->n;
 +        for (i = 0; i < atc->nthread; i++)
 +        {
 +            tpl_n[i] = 0;
 +        }
 +    }
 +
 +    for (i = start; i < end; i++)
 +    {
 +        xptr   = atc->x[i];
 +        idxptr = atc->idx[i];
 +        fptr   = atc->fractx[i];
 +
 +        /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
 +        tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
 +        ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
 +        tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
 +
 +        tix = (int)(tx);
 +        tiy = (int)(ty);
 +        tiz = (int)(tz);
 +
 +        /* Because decomposition only occurs in x and y,
 +         * we never have a fraction correction in z.
 +         */
 +        fptr[XX] = tx - tix + pme->fshx[tix];
 +        fptr[YY] = ty - tiy + pme->fshy[tiy];
 +        fptr[ZZ] = tz - tiz;
 +
 +        idxptr[XX] = pme->nnx[tix];
 +        idxptr[YY] = pme->nny[tiy];
 +        idxptr[ZZ] = pme->nnz[tiz];
 +
 +#ifdef DEBUG
 +        range_check(idxptr[XX], 0, pme->pmegrid_nx);
 +        range_check(idxptr[YY], 0, pme->pmegrid_ny);
 +        range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
 +#endif
 +
 +        if (bThreads)
 +        {
 +            thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
 +            thread_idx[i] = thread_i;
 +            tpl_n[thread_i]++;
 +        }
 +    }
 +
 +    if (bThreads)
 +    {
 +        /* Make a list of particle indices sorted on thread */
 +
 +        /* Get the cumulative count */
 +        for (i = 1; i < atc->nthread; i++)
 +        {
 +            tpl_n[i] += tpl_n[i-1];
 +        }
 +        /* The current implementation distributes particles equally
 +         * over the threads, so we could actually allocate for that
 +         * in pme_realloc_atomcomm_things.
 +         */
 +        if (tpl_n[atc->nthread-1] > tpl->nalloc)
 +        {
 +            tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
 +            srenew(tpl->i, tpl->nalloc);
 +        }
 +        /* Set tpl_n to the cumulative start */
 +        for (i = atc->nthread-1; i >= 1; i--)
 +        {
 +            tpl_n[i] = tpl_n[i-1];
 +        }
 +        tpl_n[0] = 0;
 +
 +        /* Fill our thread local array with indices sorted on thread */
 +        for (i = start; i < end; i++)
 +        {
 +            tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
 +        }
 +        /* Now tpl_n contains the cummulative count again */
 +    }
 +}
 +
 +static void make_thread_local_ind(pme_atomcomm_t *atc,
 +                                  int thread, splinedata_t *spline)
 +{
 +    int             n, t, i, start, end;
 +    thread_plist_t *tpl;
 +
 +    /* Combine the indices made by each thread into one index */
 +
 +    n     = 0;
 +    start = 0;
 +    for (t = 0; t < atc->nthread; t++)
 +    {
 +        tpl = &atc->thread_plist[t];
 +        /* Copy our part (start - end) from the list of thread t */
 +        if (thread > 0)
 +        {
 +            start = tpl->n[thread-1];
 +        }
 +        end = tpl->n[thread];
 +        for (i = start; i < end; i++)
 +        {
 +            spline->ind[n++] = tpl->i[i];
 +        }
 +    }
 +
 +    spline->n = n;
 +}
 +
 +
 +static void pme_calc_pidx(int start, int end,
 +                          matrix recipbox, rvec x[],
 +                          pme_atomcomm_t *atc, int *count)
 +{
 +    int   nslab, i;
 +    int   si;
 +    real *xptr, s;
 +    real  rxx, ryx, rzx, ryy, rzy;
 +    int  *pd;
 +
 +    /* Calculate PME task index (pidx) for each grid index.
 +     * Here we always assign equally sized slabs to each node
 +     * for load balancing reasons (the PME grid spacing is not used).
 +     */
 +
 +    nslab = atc->nslab;
 +    pd    = atc->pd;
 +
 +    /* Reset the count */
 +    for (i = 0; i < nslab; i++)
 +    {
 +        count[i] = 0;
 +    }
 +
 +    if (atc->dimind == 0)
 +    {
 +        rxx = recipbox[XX][XX];
 +        ryx = recipbox[YY][XX];
 +        rzx = recipbox[ZZ][XX];
 +        /* Calculate the node index in x-dimension */
 +        for (i = start; i < end; i++)
 +        {
 +            xptr   = x[i];
 +            /* Fractional coordinates along box vectors */
 +            s     = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
 +            si    = (int)(s + 2*nslab) % nslab;
 +            pd[i] = si;
 +            count[si]++;
 +        }
 +    }
 +    else
 +    {
 +        ryy = recipbox[YY][YY];
 +        rzy = recipbox[ZZ][YY];
 +        /* Calculate the node index in y-dimension */
 +        for (i = start; i < end; i++)
 +        {
 +            xptr   = x[i];
 +            /* Fractional coordinates along box vectors */
 +            s     = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
 +            si    = (int)(s + 2*nslab) % nslab;
 +            pd[i] = si;
 +            count[si]++;
 +        }
 +    }
 +}
 +
 +static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
 +                                  pme_atomcomm_t *atc)
 +{
 +    int nthread, thread, slab;
 +
 +    nthread = atc->nthread;
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        pme_calc_pidx(natoms* thread   /nthread,
 +                      natoms*(thread+1)/nthread,
 +                      recipbox, x, atc, atc->count_thread[thread]);
 +    }
 +    /* Non-parallel reduction, since nslab is small */
 +
 +    for (thread = 1; thread < nthread; thread++)
 +    {
 +        for (slab = 0; slab < atc->nslab; slab++)
 +        {
 +            atc->count_thread[0][slab] += atc->count_thread[thread][slab];
 +        }
 +    }
 +}
 +
 +static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
 +{
 +    const int padding = 4;
 +    int       i;
 +
 +    srenew(th[XX], nalloc);
 +    srenew(th[YY], nalloc);
 +    /* In z we add padding, this is only required for the aligned SIMD code */
 +    sfree_aligned(*ptr_z);
 +    snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT);
 +    th[ZZ] = *ptr_z + padding;
 +
 +    for (i = 0; i < padding; i++)
 +    {
 +        (*ptr_z)[               i] = 0;
 +        (*ptr_z)[padding+nalloc+i] = 0;
 +    }
 +}
 +
 +static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
 +{
 +    int i, d;
 +
 +    srenew(spline->ind, atc->nalloc);
 +    /* Initialize the index to identity so it works without threads */
 +    for (i = 0; i < atc->nalloc; i++)
 +    {
 +        spline->ind[i] = i;
 +    }
 +
 +    realloc_splinevec(spline->theta, &spline->ptr_theta_z,
 +                      atc->pme_order*atc->nalloc);
 +    realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
 +                      atc->pme_order*atc->nalloc);
 +}
 +
 +static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
 +{
 +    int nalloc_old, i, j, nalloc_tpl;
 +
 +    /* We have to avoid a NULL pointer for atc->x to avoid
 +     * possible fatal errors in MPI routines.
 +     */
 +    if (atc->n > atc->nalloc || atc->nalloc == 0)
 +    {
 +        nalloc_old  = atc->nalloc;
 +        atc->nalloc = over_alloc_dd(max(atc->n, 1));
 +
 +        if (atc->nslab > 1)
 +        {
 +            srenew(atc->x, atc->nalloc);
 +            srenew(atc->coefficient, atc->nalloc);
 +            srenew(atc->f, atc->nalloc);
 +            for (i = nalloc_old; i < atc->nalloc; i++)
 +            {
 +                clear_rvec(atc->f[i]);
 +            }
 +        }
 +        if (atc->bSpread)
 +        {
 +            srenew(atc->fractx, atc->nalloc);
 +            srenew(atc->idx, atc->nalloc);
 +
 +            if (atc->nthread > 1)
 +            {
 +                srenew(atc->thread_idx, atc->nalloc);
 +            }
 +
 +            for (i = 0; i < atc->nthread; i++)
 +            {
 +                pme_realloc_splinedata(&atc->spline[i], atc);
 +            }
 +        }
 +    }
 +}
 +
 +static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
 +                            gmx_bool gmx_unused bBackward, int gmx_unused shift,
 +                            void gmx_unused *buf_s, int gmx_unused nbyte_s,
 +                            void gmx_unused *buf_r, int gmx_unused nbyte_r)
 +{
 +#ifdef GMX_MPI
 +    int        dest, src;
 +    MPI_Status stat;
 +
 +    if (bBackward == FALSE)
 +    {
 +        dest = atc->node_dest[shift];
 +        src  = atc->node_src[shift];
 +    }
 +    else
 +    {
 +        dest = atc->node_src[shift];
 +        src  = atc->node_dest[shift];
 +    }
 +
 +    if (nbyte_s > 0 && nbyte_r > 0)
 +    {
 +        MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
 +                     dest, shift,
 +                     buf_r, nbyte_r, MPI_BYTE,
 +                     src, shift,
 +                     atc->mpi_comm, &stat);
 +    }
 +    else if (nbyte_s > 0)
 +    {
 +        MPI_Send(buf_s, nbyte_s, MPI_BYTE,
 +                 dest, shift,
 +                 atc->mpi_comm);
 +    }
 +    else if (nbyte_r > 0)
 +    {
 +        MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
 +                 src, shift,
 +                 atc->mpi_comm, &stat);
 +    }
 +#endif
 +}
 +
 +static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
 +                                    int n, gmx_bool bX, rvec *x, real *data,
 +                                    pme_atomcomm_t *atc)
 +{
 +    int *commnode, *buf_index;
 +    int  nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
 +
 +    commnode  = atc->node_dest;
 +    buf_index = atc->buf_index;
 +
 +    nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
 +
 +    nsend = 0;
 +    for (i = 0; i < nnodes_comm; i++)
 +    {
 +        buf_index[commnode[i]] = nsend;
 +        nsend                 += atc->count[commnode[i]];
 +    }
 +    if (bX)
 +    {
 +        if (atc->count[atc->nodeid] + nsend != n)
 +        {
 +            gmx_fatal(FARGS, "%d particles communicated to PME rank %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
 +                      "This usually means that your system is not well equilibrated.",
 +                      n - (atc->count[atc->nodeid] + nsend),
 +                      pme->nodeid, 'x'+atc->dimind);
 +        }
 +
 +        if (nsend > pme->buf_nalloc)
 +        {
 +            pme->buf_nalloc = over_alloc_dd(nsend);
 +            srenew(pme->bufv, pme->buf_nalloc);
 +            srenew(pme->bufr, pme->buf_nalloc);
 +        }
 +
 +        atc->n = atc->count[atc->nodeid];
 +        for (i = 0; i < nnodes_comm; i++)
 +        {
 +            scount = atc->count[commnode[i]];
 +            /* Communicate the count */
 +            if (debug)
 +            {
 +                fprintf(debug, "dimind %d PME rank %d send to rank %d: %d\n",
 +                        atc->dimind, atc->nodeid, commnode[i], scount);
 +            }
 +            pme_dd_sendrecv(atc, FALSE, i,
 +                            &scount, sizeof(int),
 +                            &atc->rcount[i], sizeof(int));
 +            atc->n += atc->rcount[i];
 +        }
 +
 +        pme_realloc_atomcomm_things(atc);
 +    }
 +
 +    local_pos = 0;
 +    for (i = 0; i < n; i++)
 +    {
 +        node = atc->pd[i];
 +        if (node == atc->nodeid)
 +        {
 +            /* Copy direct to the receive buffer */
 +            if (bX)
 +            {
 +                copy_rvec(x[i], atc->x[local_pos]);
 +            }
 +            atc->coefficient[local_pos] = data[i];
 +            local_pos++;
 +        }
 +        else
 +        {
 +            /* Copy to the send buffer */
 +            if (bX)
 +            {
 +                copy_rvec(x[i], pme->bufv[buf_index[node]]);
 +            }
 +            pme->bufr[buf_index[node]] = data[i];
 +            buf_index[node]++;
 +        }
 +    }
 +
 +    buf_pos = 0;
 +    for (i = 0; i < nnodes_comm; i++)
 +    {
 +        scount = atc->count[commnode[i]];
 +        rcount = atc->rcount[i];
 +        if (scount > 0 || rcount > 0)
 +        {
 +            if (bX)
 +            {
 +                /* Communicate the coordinates */
 +                pme_dd_sendrecv(atc, FALSE, i,
 +                                pme->bufv[buf_pos], scount*sizeof(rvec),
 +                                atc->x[local_pos], rcount*sizeof(rvec));
 +            }
 +            /* Communicate the coefficients */
 +            pme_dd_sendrecv(atc, FALSE, i,
 +                            pme->bufr+buf_pos, scount*sizeof(real),
 +                            atc->coefficient+local_pos, rcount*sizeof(real));
 +            buf_pos   += scount;
 +            local_pos += atc->rcount[i];
 +        }
 +    }
 +}
 +
 +static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
 +                           int n, rvec *f,
 +                           gmx_bool bAddF)
 +{
 +    int *commnode, *buf_index;
 +    int  nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
 +
 +    commnode  = atc->node_dest;
 +    buf_index = atc->buf_index;
 +
 +    nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
 +
 +    local_pos = atc->count[atc->nodeid];
 +    buf_pos   = 0;
 +    for (i = 0; i < nnodes_comm; i++)
 +    {
 +        scount = atc->rcount[i];
 +        rcount = atc->count[commnode[i]];
 +        if (scount > 0 || rcount > 0)
 +        {
 +            /* Communicate the forces */
 +            pme_dd_sendrecv(atc, TRUE, i,
 +                            atc->f[local_pos], scount*sizeof(rvec),
 +                            pme->bufv[buf_pos], rcount*sizeof(rvec));
 +            local_pos += scount;
 +        }
 +        buf_index[commnode[i]] = buf_pos;
 +        buf_pos               += rcount;
 +    }
 +
 +    local_pos = 0;
 +    if (bAddF)
 +    {
 +        for (i = 0; i < n; i++)
 +        {
 +            node = atc->pd[i];
 +            if (node == atc->nodeid)
 +            {
 +                /* Add from the local force array */
 +                rvec_inc(f[i], atc->f[local_pos]);
 +                local_pos++;
 +            }
 +            else
 +            {
 +                /* Add from the receive buffer */
 +                rvec_inc(f[i], pme->bufv[buf_index[node]]);
 +                buf_index[node]++;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for (i = 0; i < n; i++)
 +        {
 +            node = atc->pd[i];
 +            if (node == atc->nodeid)
 +            {
 +                /* Copy from the local force array */
 +                copy_rvec(atc->f[local_pos], f[i]);
 +                local_pos++;
 +            }
 +            else
 +            {
 +                /* Copy from the receive buffer */
 +                copy_rvec(pme->bufv[buf_index[node]], f[i]);
 +                buf_index[node]++;
 +            }
 +        }
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 +{
 +    pme_overlap_t *overlap;
 +    int            send_index0, send_nindex;
 +    int            recv_index0, recv_nindex;
 +    MPI_Status     stat;
 +    int            i, j, k, ix, iy, iz, icnt;
 +    int            ipulse, send_id, recv_id, datasize;
 +    real          *p;
 +    real          *sendptr, *recvptr;
 +
 +    /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
 +    overlap = &pme->overlap[1];
 +
 +    for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
 +    {
 +        /* Since we have already (un)wrapped the overlap in the z-dimension,
 +         * we only have to communicate 0 to nkz (not pmegrid_nz).
 +         */
 +        if (direction == GMX_SUM_GRID_FORWARD)
 +        {
 +            send_id       = overlap->send_id[ipulse];
 +            recv_id       = overlap->recv_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].send_index0;
 +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].recv_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +        }
 +        else
 +        {
 +            send_id       = overlap->recv_id[ipulse];
 +            recv_id       = overlap->send_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].recv_index0;
 +            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].send_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        }
 +
 +        /* Copy data to contiguous send buffer */
 +        if (debug)
 +        {
 +            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid, overlap->nodeid, send_id,
 +                    pme->pmegrid_start_iy,
 +                    send_index0-pme->pmegrid_start_iy,
 +                    send_index0-pme->pmegrid_start_iy+send_nindex);
 +        }
 +        icnt = 0;
 +        for (i = 0; i < pme->pmegrid_nx; i++)
 +        {
 +            ix = i;
 +            for (j = 0; j < send_nindex; j++)
 +            {
 +                iy = j + send_index0 - pme->pmegrid_start_iy;
 +                for (k = 0; k < pme->nkz; k++)
 +                {
 +                    iz = k;
 +                    overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
 +                }
 +            }
 +        }
 +
 +        datasize      = pme->pmegrid_nx * pme->nkz;
 +
 +        MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
 +                     send_id, ipulse,
 +                     overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
 +                     recv_id, ipulse,
 +                     overlap->mpi_comm, &stat);
 +
 +        /* Get data from contiguous recv buffer */
 +        if (debug)
 +        {
 +            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid, overlap->nodeid, recv_id,
 +                    pme->pmegrid_start_iy,
 +                    recv_index0-pme->pmegrid_start_iy,
 +                    recv_index0-pme->pmegrid_start_iy+recv_nindex);
 +        }
 +        icnt = 0;
 +        for (i = 0; i < pme->pmegrid_nx; i++)
 +        {
 +            ix = i;
 +            for (j = 0; j < recv_nindex; j++)
 +            {
 +                iy = j + recv_index0 - pme->pmegrid_start_iy;
 +                for (k = 0; k < pme->nkz; k++)
 +                {
 +                    iz = k;
 +                    if (direction == GMX_SUM_GRID_FORWARD)
 +                    {
 +                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
 +                    }
 +                    else
 +                    {
 +                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Major dimension is easier, no copying required,
 +     * but we might have to sum to separate array.
 +     * Since we don't copy, we have to communicate up to pmegrid_nz,
 +     * not nkz as for the minor direction.
 +     */
 +    overlap = &pme->overlap[0];
 +
 +    for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
 +    {
 +        if (direction == GMX_SUM_GRID_FORWARD)
 +        {
 +            send_id       = overlap->send_id[ipulse];
 +            recv_id       = overlap->recv_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].send_index0;
 +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].recv_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recvptr       = overlap->recvbuf;
 +        }
 +        else
 +        {
 +            send_id       = overlap->recv_id[ipulse];
 +            recv_id       = overlap->send_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].recv_index0;
 +            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].send_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recvptr       = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +        }
 +
 +        sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +        datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid, overlap->nodeid, send_id,
 +                    pme->pmegrid_start_ix,
 +                    send_index0-pme->pmegrid_start_ix,
 +                    send_index0-pme->pmegrid_start_ix+send_nindex);
 +            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid, overlap->nodeid, recv_id,
 +                    pme->pmegrid_start_ix,
 +                    recv_index0-pme->pmegrid_start_ix,
 +                    recv_index0-pme->pmegrid_start_ix+recv_nindex);
 +        }
 +
 +        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
 +                     send_id, ipulse,
 +                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
 +                     recv_id, ipulse,
 +                     overlap->mpi_comm, &stat);
 +
 +        /* ADD data from contiguous recv buffer */
 +        if (direction == GMX_SUM_GRID_FORWARD)
 +        {
 +            p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +            for (i = 0; i < recv_nindex*datasize; i++)
 +            {
 +                p[i] += overlap->recvbuf[i];
 +            }
 +        }
 +    }
 +}
 +#endif
 +
 +
 +static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid, int grid_index)
 +{
 +    ivec    local_fft_ndata, local_fft_offset, local_fft_size;
 +    ivec    local_pme_size;
 +    int     i, ix, iy, iz;
 +    int     pmeidx, fftidx;
 +
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    local_pme_size[0] = pme->pmegrid_nx;
 +    local_pme_size[1] = pme->pmegrid_ny;
 +    local_pme_size[2] = pme->pmegrid_nz;
 +
 +    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 +       the offset is identical, and the PME grid always has more data (due to overlap)
 +     */
 +    {
 +#ifdef DEBUG_PME
 +        FILE *fp, *fp2;
 +        char  fn[STRLEN];
 +        real  val;
 +        sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
 +        fp = gmx_ffopen(fn, "w");
 +        sprintf(fn, "pmegrid%d.txt", pme->nodeid);
 +        fp2 = gmx_ffopen(fn, "w");
 +#endif
 +
 +        for (ix = 0; ix < local_fft_ndata[XX]; ix++)
 +        {
 +            for (iy = 0; iy < local_fft_ndata[YY]; iy++)
 +            {
 +                for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
 +                {
 +                    pmeidx          = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
 +                    fftidx          = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
 +                    fftgrid[fftidx] = pmegrid[pmeidx];
 +#ifdef DEBUG_PME
 +                    val = 100*pmegrid[pmeidx];
 +                    if (pmegrid[pmeidx] != 0)
 +                    {
 +                        gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
 +                                                 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
 +                    }
 +                    if (pmegrid[pmeidx] != 0)
 +                    {
 +                        fprintf(fp2, "%-12s  %5d  %5d  %5d  %12.5e\n",
 +                                "qgrid",
 +                                pme->pmegrid_start_ix + ix,
 +                                pme->pmegrid_start_iy + iy,
 +                                pme->pmegrid_start_iz + iz,
 +                                pmegrid[pmeidx]);
 +                    }
 +#endif
 +                }
 +            }
 +        }
 +#ifdef DEBUG_PME
 +        gmx_ffclose(fp);
 +        gmx_ffclose(fp2);
 +#endif
 +    }
 +    return 0;
 +}
 +
 +
 +static gmx_cycles_t omp_cyc_start()
 +{
 +    return gmx_cycles_read();
 +}
 +
 +static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
 +{
 +    return gmx_cycles_read() - c;
 +}
 +
 +
 +static int copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, int grid_index,
 +                                   int nthread, int thread)
 +{
 +    ivec          local_fft_ndata, local_fft_offset, local_fft_size;
 +    ivec          local_pme_size;
 +    int           ixy0, ixy1, ixy, ix, iy, iz;
 +    int           pmeidx, fftidx;
 +#ifdef PME_TIME_THREADS
 +    gmx_cycles_t  c1;
 +    static double cs1 = 0;
 +    static int    cnt = 0;
 +#endif
 +
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_start();
 +#endif
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    local_pme_size[0] = pme->pmegrid_nx;
 +    local_pme_size[1] = pme->pmegrid_ny;
 +    local_pme_size[2] = pme->pmegrid_nz;
 +
 +    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 +       the offset is identical, and the PME grid always has more data (due to overlap)
 +     */
 +    ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 +    ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 +
 +    for (ixy = ixy0; ixy < ixy1; ixy++)
 +    {
 +        ix = ixy/local_fft_ndata[YY];
 +        iy = ixy - ix*local_fft_ndata[YY];
 +
 +        pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
 +        fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
 +        for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
 +        {
 +            pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
 +        }
 +    }
 +
 +#ifdef PME_TIME_THREADS
 +    c1   = omp_cyc_end(c1);
 +    cs1 += (double)c1;
 +    cnt++;
 +    if (cnt % 20 == 0)
 +    {
 +        printf("copy %.2f\n", cs1*1e-9);
 +    }
 +#endif
 +
 +    return 0;
 +}
 +
 +
 +static void wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 +{
 +    int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
 +
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    pnx = pme->pmegrid_nx;
 +    pny = pme->pmegrid_ny;
 +    pnz = pme->pmegrid_nz;
 +
 +    overlap = pme->pme_order - 1;
 +
 +    /* Add periodic overlap in z */
 +    for (ix = 0; ix < pme->pmegrid_nx; ix++)
 +    {
 +        for (iy = 0; iy < pme->pmegrid_ny; iy++)
 +        {
 +            for (iz = 0; iz < overlap; iz++)
 +            {
 +                pmegrid[(ix*pny+iy)*pnz+iz] +=
 +                    pmegrid[(ix*pny+iy)*pnz+nz+iz];
 +            }
 +        }
 +    }
 +
 +    if (pme->nnodes_minor == 1)
 +    {
 +        for (ix = 0; ix < pme->pmegrid_nx; ix++)
 +        {
 +            for (iy = 0; iy < overlap; iy++)
 +            {
 +                for (iz = 0; iz < nz; iz++)
 +                {
 +                    pmegrid[(ix*pny+iy)*pnz+iz] +=
 +                        pmegrid[(ix*pny+ny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +
 +    if (pme->nnodes_major == 1)
 +    {
 +        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 +
 +        for (ix = 0; ix < overlap; ix++)
 +        {
 +            for (iy = 0; iy < ny_x; iy++)
 +            {
 +                for (iz = 0; iz < nz; iz++)
 +                {
 +                    pmegrid[(ix*pny+iy)*pnz+iz] +=
 +                        pmegrid[((nx+ix)*pny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 +{
 +    int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
 +
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    pnx = pme->pmegrid_nx;
 +    pny = pme->pmegrid_ny;
 +    pnz = pme->pmegrid_nz;
 +
 +    overlap = pme->pme_order - 1;
 +
 +    if (pme->nnodes_major == 1)
 +    {
 +        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 +
 +        for (ix = 0; ix < overlap; ix++)
 +        {
 +            int iy, iz;
 +
 +            for (iy = 0; iy < ny_x; iy++)
 +            {
 +                for (iz = 0; iz < nz; iz++)
 +                {
 +                    pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
 +                        pmegrid[(ix*pny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +
 +    if (pme->nnodes_minor == 1)
 +    {
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +        for (ix = 0; ix < pme->pmegrid_nx; ix++)
 +        {
 +            int iy, iz;
 +
 +            for (iy = 0; iy < overlap; iy++)
 +            {
 +                for (iz = 0; iz < nz; iz++)
 +                {
 +                    pmegrid[(ix*pny+ny+iy)*pnz+iz] =
 +                        pmegrid[(ix*pny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Copy periodic overlap in z */
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +    for (ix = 0; ix < pme->pmegrid_nx; ix++)
 +    {
 +        int iy, iz;
 +
 +        for (iy = 0; iy < pme->pmegrid_ny; iy++)
 +        {
 +            for (iz = 0; iz < overlap; iz++)
 +            {
 +                pmegrid[(ix*pny+iy)*pnz+nz+iz] =
 +                    pmegrid[(ix*pny+iy)*pnz+iz];
 +            }
 +        }
 +    }
 +}
 +
 +
 +/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
 +#define DO_BSPLINE(order)                            \
 +    for (ithx = 0; (ithx < order); ithx++)                    \
 +    {                                                    \
 +        index_x = (i0+ithx)*pny*pnz;                     \
 +        valx    = coefficient*thx[ithx];                          \
 +                                                     \
 +        for (ithy = 0; (ithy < order); ithy++)                \
 +        {                                                \
 +            valxy    = valx*thy[ithy];                   \
 +            index_xy = index_x+(j0+ithy)*pnz;            \
 +                                                     \
 +            for (ithz = 0; (ithz < order); ithz++)            \
 +            {                                            \
 +                index_xyz        = index_xy+(k0+ithz);   \
 +                grid[index_xyz] += valxy*thz[ithz];      \
 +            }                                            \
 +        }                                                \
 +    }
 +
 +
 +static void spread_coefficients_bsplines_thread(pmegrid_t                    *pmegrid,
 +                                                pme_atomcomm_t               *atc,
 +                                                splinedata_t                 *spline,
 +                                                pme_spline_work_t gmx_unused *work)
 +{
 +
 +    /* spread coefficients from home atoms to local grid */
 +    real          *grid;
 +    pme_overlap_t *ol;
 +    int            b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
 +    int       *    idxptr;
 +    int            order, norder, index_x, index_xy, index_xyz;
 +    real           valx, valxy, coefficient;
 +    real          *thx, *thy, *thz;
 +    int            localsize, bndsize;
 +    int            pnx, pny, pnz, ndatatot;
 +    int            offx, offy, offz;
 +
 +#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
 +    real           thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
 +
 +    thz_aligned = gmx_simd4_align_r(thz_buffer);
 +#endif
 +
 +    pnx = pmegrid->s[XX];
 +    pny = pmegrid->s[YY];
 +    pnz = pmegrid->s[ZZ];
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +    ndatatot = pnx*pny*pnz;
 +    grid     = pmegrid->grid;
 +    for (i = 0; i < ndatatot; i++)
 +    {
 +        grid[i] = 0;
 +    }
 +
 +    order = pmegrid->order;
 +
 +    for (nn = 0; nn < spline->n; nn++)
 +    {
 +        n           = spline->ind[nn];
 +        coefficient = atc->coefficient[n];
 +
 +        if (coefficient != 0)
 +        {
 +            idxptr = atc->idx[n];
 +            norder = nn*order;
 +
 +            i0   = idxptr[XX] - offx;
 +            j0   = idxptr[YY] - offy;
 +            k0   = idxptr[ZZ] - offz;
 +
 +            thx = spline->theta[XX] + norder;
 +            thy = spline->theta[YY] + norder;
 +            thz = spline->theta[ZZ] + norder;
 +
 +            switch (order)
 +            {
 +                case 4:
 +#ifdef PME_SIMD4_SPREAD_GATHER
 +#ifdef PME_SIMD4_UNALIGNED
 +#define PME_SPREAD_SIMD4_ORDER4
 +#else
 +#define PME_SPREAD_SIMD4_ALIGNED
 +#define PME_ORDER 4
 +#endif
 +#include "pme_simd4.h"
 +#else
 +                    DO_BSPLINE(4);
 +#endif
 +                    break;
 +                case 5:
 +#ifdef PME_SIMD4_SPREAD_GATHER
 +#define PME_SPREAD_SIMD4_ALIGNED
 +#define PME_ORDER 5
 +#include "pme_simd4.h"
 +#else
 +                    DO_BSPLINE(5);
 +#endif
 +                    break;
 +                default:
 +                    DO_BSPLINE(order);
 +                    break;
 +            }
 +        }
 +    }
 +}
 +
 +static void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
 +{
 +#ifdef PME_SIMD4_SPREAD_GATHER
 +    if (pme_order == 5
 +#ifndef PME_SIMD4_UNALIGNED
 +        || pme_order == 4
 +#endif
 +        )
 +    {
 +        /* Round nz up to a multiple of 4 to ensure alignment */
 +        *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
 +    }
 +#endif
 +}
 +
 +static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
 +{
 +#ifdef PME_SIMD4_SPREAD_GATHER
 +#ifndef PME_SIMD4_UNALIGNED
 +    if (pme_order == 4)
 +    {
 +        /* Add extra elements to ensured aligned operations do not go
 +         * beyond the allocated grid size.
 +         * Note that for pme_order=5, the pme grid z-size alignment
 +         * ensures that we will not go beyond the grid size.
 +         */
 +        *gridsize += 4;
 +    }
 +#endif
 +#endif
 +}
 +
 +static void pmegrid_init(pmegrid_t *grid,
 +                         int cx, int cy, int cz,
 +                         int x0, int y0, int z0,
 +                         int x1, int y1, int z1,
 +                         gmx_bool set_alignment,
 +                         int pme_order,
 +                         real *ptr)
 +{
 +    int nz, gridsize;
 +
 +    grid->ci[XX]     = cx;
 +    grid->ci[YY]     = cy;
 +    grid->ci[ZZ]     = cz;
 +    grid->offset[XX] = x0;
 +    grid->offset[YY] = y0;
 +    grid->offset[ZZ] = z0;
 +    grid->n[XX]      = x1 - x0 + pme_order - 1;
 +    grid->n[YY]      = y1 - y0 + pme_order - 1;
 +    grid->n[ZZ]      = z1 - z0 + pme_order - 1;
 +    copy_ivec(grid->n, grid->s);
 +
 +    nz = grid->s[ZZ];
 +    set_grid_alignment(&nz, pme_order);
 +    if (set_alignment)
 +    {
 +        grid->s[ZZ] = nz;
 +    }
 +    else if (nz != grid->s[ZZ])
 +    {
 +        gmx_incons("pmegrid_init call with an unaligned z size");
 +    }
 +
 +    grid->order = pme_order;
 +    if (ptr == NULL)
 +    {
 +        gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
 +        set_gridsize_alignment(&gridsize, pme_order);
 +        snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
 +    }
 +    else
 +    {
 +        grid->grid = ptr;
 +    }
 +}
 +
 +static int div_round_up(int enumerator, int denominator)
 +{
 +    return (enumerator + denominator - 1)/denominator;
 +}
 +
 +static void make_subgrid_division(const ivec n, int ovl, int nthread,
 +                                  ivec nsub)
 +{
 +    int gsize_opt, gsize;
 +    int nsx, nsy, nsz;
 +    char *env;
 +
 +    gsize_opt = -1;
 +    for (nsx = 1; nsx <= nthread; nsx++)
 +    {
 +        if (nthread % nsx == 0)
 +        {
 +            for (nsy = 1; nsy <= nthread; nsy++)
 +            {
 +                if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
 +                {
 +                    nsz = nthread/(nsx*nsy);
 +
 +                    /* Determine the number of grid points per thread */
 +                    gsize =
 +                        (div_round_up(n[XX], nsx) + ovl)*
 +                        (div_round_up(n[YY], nsy) + ovl)*
 +                        (div_round_up(n[ZZ], nsz) + ovl);
 +
 +                    /* Minimize the number of grids points per thread
 +                     * and, secondarily, the number of cuts in minor dimensions.
 +                     */
 +                    if (gsize_opt == -1 ||
 +                        gsize < gsize_opt ||
 +                        (gsize == gsize_opt &&
 +                         (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
 +                    {
 +                        nsub[XX]  = nsx;
 +                        nsub[YY]  = nsy;
 +                        nsub[ZZ]  = nsz;
 +                        gsize_opt = gsize;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    env = getenv("GMX_PME_THREAD_DIVISION");
 +    if (env != NULL)
 +    {
 +        sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
 +    }
 +
 +    if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
 +    {
 +        gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
 +    }
 +}
 +
 +static void pmegrids_init(pmegrids_t *grids,
 +                          int nx, int ny, int nz, int nz_base,
 +                          int pme_order,
 +                          gmx_bool bUseThreads,
 +                          int nthread,
 +                          int overlap_x,
 +                          int overlap_y)
 +{
 +    ivec n, n_base, g0, g1;
 +    int t, x, y, z, d, i, tfac;
 +    int max_comm_lines = -1;
 +
 +    n[XX] = nx - (pme_order - 1);
 +    n[YY] = ny - (pme_order - 1);
 +    n[ZZ] = nz - (pme_order - 1);
 +
 +    copy_ivec(n, n_base);
 +    n_base[ZZ] = nz_base;
 +
 +    pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
 +                 NULL);
 +
 +    grids->nthread = nthread;
 +
 +    make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
 +
 +    if (bUseThreads)
 +    {
 +        ivec nst;
 +        int gridsize;
 +
 +        for (d = 0; d < DIM; d++)
 +        {
 +            nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
 +        }
 +        set_grid_alignment(&nst[ZZ], pme_order);
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
 +                    grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
 +            fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
 +                    nx, ny, nz,
 +                    nst[XX], nst[YY], nst[ZZ]);
 +        }
 +
 +        snew(grids->grid_th, grids->nthread);
 +        t        = 0;
 +        gridsize = nst[XX]*nst[YY]*nst[ZZ];
 +        set_gridsize_alignment(&gridsize, pme_order);
 +        snew_aligned(grids->grid_all,
 +                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
 +                     SIMD4_ALIGNMENT);
 +
 +        for (x = 0; x < grids->nc[XX]; x++)
 +        {
 +            for (y = 0; y < grids->nc[YY]; y++)
 +            {
 +                for (z = 0; z < grids->nc[ZZ]; z++)
 +                {
 +                    pmegrid_init(&grids->grid_th[t],
 +                                 x, y, z,
 +                                 (n[XX]*(x  ))/grids->nc[XX],
 +                                 (n[YY]*(y  ))/grids->nc[YY],
 +                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
 +                                 (n[XX]*(x+1))/grids->nc[XX],
 +                                 (n[YY]*(y+1))/grids->nc[YY],
 +                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
 +                                 TRUE,
 +                                 pme_order,
 +                                 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
 +                    t++;
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        grids->grid_th = NULL;
 +    }
 +
 +    snew(grids->g2t, DIM);
 +    tfac = 1;
 +    for (d = DIM-1; d >= 0; d--)
 +    {
 +        snew(grids->g2t[d], n[d]);
 +        t = 0;
 +        for (i = 0; i < n[d]; i++)
 +        {
 +            /* The second check should match the parameters
 +             * of the pmegrid_init call above.
 +             */
 +            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
 +            {
 +                t++;
 +            }
 +            grids->g2t[d][i] = t*tfac;
 +        }
 +
 +        tfac *= grids->nc[d];
 +
 +        switch (d)
 +        {
 +            case XX: max_comm_lines = overlap_x;     break;
 +            case YY: max_comm_lines = overlap_y;     break;
 +            case ZZ: max_comm_lines = pme_order - 1; break;
 +        }
 +        grids->nthread_comm[d] = 0;
 +        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
 +               grids->nthread_comm[d] < grids->nc[d])
 +        {
 +            grids->nthread_comm[d]++;
 +        }
 +        if (debug != NULL)
 +        {
 +            fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
 +                    'x'+d, grids->nthread_comm[d]);
 +        }
 +        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
 +         * work, but this is not a problematic restriction.
 +         */
 +        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
 +        {
 +            gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
 +        }
 +    }
 +}
 +
 +
 +static void pmegrids_destroy(pmegrids_t *grids)
 +{
 +    int t;
 +
 +    if (grids->grid.grid != NULL)
 +    {
 +        sfree(grids->grid.grid);
 +
 +        if (grids->nthread > 0)
 +        {
 +            for (t = 0; t < grids->nthread; t++)
 +            {
 +                sfree(grids->grid_th[t].grid);
 +            }
 +            sfree(grids->grid_th);
 +        }
 +    }
 +}
 +
 +
 +static void realloc_work(pme_work_t *work, int nkx)
 +{
 +    int simd_width;
 +
 +    if (nkx > work->nalloc)
 +    {
 +        work->nalloc = nkx;
 +        srenew(work->mhx, work->nalloc);
 +        srenew(work->mhy, work->nalloc);
 +        srenew(work->mhz, work->nalloc);
 +        srenew(work->m2, work->nalloc);
 +        /* Allocate an aligned pointer for SIMD operations, including extra
 +         * elements at the end for padding.
 +         */
 +#ifdef PME_SIMD_SOLVE
 +        simd_width = GMX_SIMD_REAL_WIDTH;
 +#else
 +        /* We can use any alignment, apart from 0, so we use 4 */
 +        simd_width = 4;
 +#endif
 +        sfree_aligned(work->denom);
 +        sfree_aligned(work->tmp1);
 +        sfree_aligned(work->tmp2);
 +        sfree_aligned(work->eterm);
 +        snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
 +        snew_aligned(work->tmp1,  work->nalloc+simd_width, simd_width*sizeof(real));
 +        snew_aligned(work->tmp2,  work->nalloc+simd_width, simd_width*sizeof(real));
 +        snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
 +        srenew(work->m2inv, work->nalloc);
 +    }
 +}
 +
 +
 +static void free_work(pme_work_t *work)
 +{
 +    sfree(work->mhx);
 +    sfree(work->mhy);
 +    sfree(work->mhz);
 +    sfree(work->m2);
 +    sfree_aligned(work->denom);
 +    sfree_aligned(work->tmp1);
 +    sfree_aligned(work->tmp2);
 +    sfree_aligned(work->eterm);
 +    sfree(work->m2inv);
 +}
 +
 +
 +#if defined PME_SIMD_SOLVE
 +/* Calculate exponentials through SIMD */
 +gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
 +{
 +    {
 +        const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
 +        gmx_simd_real_t f_simd;
 +        gmx_simd_real_t lu;
 +        gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e;
 +        int kx;
 +        f_simd = gmx_simd_set1_r(f);
 +        /* We only need to calculate from start. But since start is 0 or 1
 +         * and we want to use aligned loads/stores, we always start from 0.
 +         */
 +        for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
 +        {
 +            tmp_d1   = gmx_simd_load_r(d_aligned+kx);
 +            d_inv    = gmx_simd_inv_r(tmp_d1);
 +            tmp_r    = gmx_simd_load_r(r_aligned+kx);
 +            tmp_r    = gmx_simd_exp_r(tmp_r);
 +            tmp_e    = gmx_simd_mul_r(f_simd, d_inv);
 +            tmp_e    = gmx_simd_mul_r(tmp_e, tmp_r);
 +            gmx_simd_store_r(e_aligned+kx, tmp_e);
 +        }
 +    }
 +}
 +#else
 +gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
 +{
 +    int kx;
 +    for (kx = start; kx < end; kx++)
 +    {
 +        d[kx] = 1.0/d[kx];
 +    }
 +    for (kx = start; kx < end; kx++)
 +    {
 +        r[kx] = exp(r[kx]);
 +    }
 +    for (kx = start; kx < end; kx++)
 +    {
 +        e[kx] = f*r[kx]*d[kx];
 +    }
 +}
 +#endif
 +
 +#if defined PME_SIMD_SOLVE
 +/* Calculate exponentials through SIMD */
 +gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
 +{
 +    gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
 +    const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI));
 +    int kx;
 +    for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
 +    {
 +        /* We only need to calculate from start. But since start is 0 or 1
 +         * and we want to use aligned loads/stores, we always start from 0.
 +         */
 +        tmp_d = gmx_simd_load_r(d_aligned+kx);
 +        d_inv = gmx_simd_inv_r(tmp_d);
 +        gmx_simd_store_r(d_aligned+kx, d_inv);
 +        tmp_r = gmx_simd_load_r(r_aligned+kx);
 +        tmp_r = gmx_simd_exp_r(tmp_r);
 +        gmx_simd_store_r(r_aligned+kx, tmp_r);
 +        tmp_mk  = gmx_simd_load_r(factor_aligned+kx);
 +        tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk)));
 +        gmx_simd_store_r(factor_aligned+kx, tmp_fac);
 +    }
 +}
 +#else
 +gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
 +{
 +    int kx;
 +    real mk;
 +    for (kx = start; kx < end; kx++)
 +    {
 +        d[kx] = 1.0/d[kx];
 +    }
 +
 +    for (kx = start; kx < end; kx++)
 +    {
 +        r[kx] = exp(r[kx]);
 +    }
 +
 +    for (kx = start; kx < end; kx++)
 +    {
 +        mk       = tmp2[kx];
 +        tmp2[kx] = sqrt(M_PI)*mk*gmx_erfc(mk);
 +    }
 +}
 +#endif
 +
 +static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
 +                         real ewaldcoeff, real vol,
 +                         gmx_bool bEnerVir,
 +                         int nthread, int thread)
 +{
 +    /* do recip sum over local cells in grid */
 +    /* y major, z middle, x minor or continuous */
 +    t_complex *p0;
 +    int     kx, ky, kz, maxkx, maxky, maxkz;
 +    int     nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
 +    real    mx, my, mz;
 +    real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
 +    real    ets2, struct2, vfactor, ets2vf;
 +    real    d1, d2, energy = 0;
 +    real    by, bz;
 +    real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
 +    real    rxx, ryx, ryy, rzx, rzy, rzz;
 +    pme_work_t *work;
 +    real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
 +    real    mhxk, mhyk, mhzk, m2k;
 +    real    corner_fac;
 +    ivec    complex_order;
 +    ivec    local_ndata, local_offset, local_size;
 +    real    elfac;
 +
 +    elfac = ONE_4PI_EPS0/pme->epsilon_r;
 +
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA],
 +                                      complex_order,
 +                                      local_ndata,
 +                                      local_offset,
 +                                      local_size);
 +
 +    rxx = pme->recipbox[XX][XX];
 +    ryx = pme->recipbox[YY][XX];
 +    ryy = pme->recipbox[YY][YY];
 +    rzx = pme->recipbox[ZZ][XX];
 +    rzy = pme->recipbox[ZZ][YY];
 +    rzz = pme->recipbox[ZZ][ZZ];
 +
 +    maxkx = (nx+1)/2;
 +    maxky = (ny+1)/2;
 +    maxkz = nz/2+1;
 +
 +    work  = &pme->work[thread];
 +    mhx   = work->mhx;
 +    mhy   = work->mhy;
 +    mhz   = work->mhz;
 +    m2    = work->m2;
 +    denom = work->denom;
 +    tmp1  = work->tmp1;
 +    eterm = work->eterm;
 +    m2inv = work->m2inv;
 +
 +    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
 +    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
 +
 +    for (iyz = iyz0; iyz < iyz1; iyz++)
 +    {
 +        iy = iyz/local_ndata[ZZ];
 +        iz = iyz - iy*local_ndata[ZZ];
 +
 +        ky = iy + local_offset[YY];
 +
 +        if (ky < maxky)
 +        {
 +            my = ky;
 +        }
 +        else
 +        {
 +            my = (ky - ny);
 +        }
 +
 +        by = M_PI*vol*pme->bsp_mod[YY][ky];
 +
 +        kz = iz + local_offset[ZZ];
 +
 +        mz = kz;
 +
 +        bz = pme->bsp_mod[ZZ][kz];
 +
 +        /* 0.5 correction for corner points */
 +        corner_fac = 1;
 +        if (kz == 0 || kz == (nz+1)/2)
 +        {
 +            corner_fac = 0.5;
 +        }
 +
 +        p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 +
 +        /* We should skip the k-space point (0,0,0) */
 +        /* Note that since here x is the minor index, local_offset[XX]=0 */
 +        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
 +        {
 +            kxstart = local_offset[XX];
 +        }
 +        else
 +        {
 +            kxstart = local_offset[XX] + 1;
 +            p0++;
 +        }
 +        kxend = local_offset[XX] + local_ndata[XX];
 +
 +        if (bEnerVir)
 +        {
 +            /* More expensive inner loop, especially because of the storage
 +             * of the mh elements in array's.
 +             * Because x is the minor grid index, all mh elements
 +             * depend on kx for triclinic unit cells.
 +             */
 +
 +            /* Two explicit loops to avoid a conditional inside the loop */
 +            for (kx = kxstart; kx < maxkx; kx++)
 +            {
 +                mx = kx;
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                mhx[kx]   = mhxk;
 +                mhy[kx]   = mhyk;
 +                mhz[kx]   = mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            for (kx = maxkx; kx < kxend; kx++)
 +            {
 +                mx = (kx - nx);
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                mhx[kx]   = mhxk;
 +                mhy[kx]   = mhyk;
 +                mhz[kx]   = mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            for (kx = kxstart; kx < kxend; kx++)
 +            {
 +                m2inv[kx] = 1.0/m2[kx];
 +            }
 +
 +            calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
 +
 +            for (kx = kxstart; kx < kxend; kx++, p0++)
 +            {
 +                d1      = p0->re;
 +                d2      = p0->im;
 +
 +                p0->re  = d1*eterm[kx];
 +                p0->im  = d2*eterm[kx];
 +
 +                struct2 = 2.0*(d1*d1+d2*d2);
 +
 +                tmp1[kx] = eterm[kx]*struct2;
 +            }
 +
 +            for (kx = kxstart; kx < kxend; kx++)
 +            {
 +                ets2     = corner_fac*tmp1[kx];
 +                vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
 +                energy  += ets2;
 +
 +                ets2vf   = ets2*vfactor;
 +                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
 +                virxy   += ets2vf*mhx[kx]*mhy[kx];
 +                virxz   += ets2vf*mhx[kx]*mhz[kx];
 +                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
 +                viryz   += ets2vf*mhy[kx]*mhz[kx];
 +                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
 +            }
 +        }
 +        else
 +        {
 +            /* We don't need to calculate the energy and the virial.
 +             * In this case the triclinic overhead is small.
 +             */
 +
 +            /* Two explicit loops to avoid a conditional inside the loop */
 +
 +            for (kx = kxstart; kx < maxkx; kx++)
 +            {
 +                mx = kx;
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            for (kx = maxkx; kx < kxend; kx++)
 +            {
 +                mx = (kx - nx);
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
 +
 +            for (kx = kxstart; kx < kxend; kx++, p0++)
 +            {
 +                d1      = p0->re;
 +                d2      = p0->im;
 +
 +                p0->re  = d1*eterm[kx];
 +                p0->im  = d2*eterm[kx];
 +            }
 +        }
 +    }
 +
 +    if (bEnerVir)
 +    {
 +        /* Update virial with local values.
 +         * The virial is symmetric by definition.
 +         * this virial seems ok for isotropic scaling, but I'm
 +         * experiencing problems on semiisotropic membranes.
 +         * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
 +         */
 +        work->vir_q[XX][XX] = 0.25*virxx;
 +        work->vir_q[YY][YY] = 0.25*viryy;
 +        work->vir_q[ZZ][ZZ] = 0.25*virzz;
 +        work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25*virxy;
 +        work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25*virxz;
 +        work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25*viryz;
 +
 +        /* This energy should be corrected for a charged system */
 +        work->energy_q = 0.5*energy;
 +    }
 +
 +    /* Return the loop count */
 +    return local_ndata[YY]*local_ndata[XX];
 +}
 +
 +static int solve_pme_lj_yzx(gmx_pme_t pme, t_complex **grid, gmx_bool bLB,
 +                            real ewaldcoeff, real vol,
 +                            gmx_bool bEnerVir, int nthread, int thread)
 +{
 +    /* do recip sum over local cells in grid */
 +    /* y major, z middle, x minor or continuous */
 +    int     ig, gcount;
 +    int     kx, ky, kz, maxkx, maxky, maxkz;
 +    int     nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
 +    real    mx, my, mz;
 +    real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
 +    real    ets2, ets2vf;
 +    real    eterm, vterm, d1, d2, energy = 0;
 +    real    by, bz;
 +    real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
 +    real    rxx, ryx, ryy, rzx, rzy, rzz;
 +    real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
 +    real    mhxk, mhyk, mhzk, m2k;
 +    real    mk;
 +    pme_work_t *work;
 +    real    corner_fac;
 +    ivec    complex_order;
 +    ivec    local_ndata, local_offset, local_size;
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A],
 +                                      complex_order,
 +                                      local_ndata,
 +                                      local_offset,
 +                                      local_size);
 +    rxx = pme->recipbox[XX][XX];
 +    ryx = pme->recipbox[YY][XX];
 +    ryy = pme->recipbox[YY][YY];
 +    rzx = pme->recipbox[ZZ][XX];
 +    rzy = pme->recipbox[ZZ][YY];
 +    rzz = pme->recipbox[ZZ][ZZ];
 +
 +    maxkx = (nx+1)/2;
 +    maxky = (ny+1)/2;
 +    maxkz = nz/2+1;
 +
 +    work  = &pme->work[thread];
 +    mhx   = work->mhx;
 +    mhy   = work->mhy;
 +    mhz   = work->mhz;
 +    m2    = work->m2;
 +    denom = work->denom;
 +    tmp1  = work->tmp1;
 +    tmp2  = work->tmp2;
 +
 +    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
 +    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
 +
 +    for (iyz = iyz0; iyz < iyz1; iyz++)
 +    {
 +        iy = iyz/local_ndata[ZZ];
 +        iz = iyz - iy*local_ndata[ZZ];
 +
 +        ky = iy + local_offset[YY];
 +
 +        if (ky < maxky)
 +        {
 +            my = ky;
 +        }
 +        else
 +        {
 +            my = (ky - ny);
 +        }
 +
 +        by = 3.0*vol*pme->bsp_mod[YY][ky]
 +            / (M_PI*sqrt(M_PI)*ewaldcoeff*ewaldcoeff*ewaldcoeff);
 +
 +        kz = iz + local_offset[ZZ];
 +
 +        mz = kz;
 +
 +        bz = pme->bsp_mod[ZZ][kz];
 +
 +        /* 0.5 correction for corner points */
 +        corner_fac = 1;
 +        if (kz == 0 || kz == (nz+1)/2)
 +        {
 +            corner_fac = 0.5;
 +        }
 +
 +        kxstart = local_offset[XX];
 +        kxend   = local_offset[XX] + local_ndata[XX];
 +        if (bEnerVir)
 +        {
 +            /* More expensive inner loop, especially because of the
 +             * storage of the mh elements in array's.  Because x is the
 +             * minor grid index, all mh elements depend on kx for
 +             * triclinic unit cells.
 +             */
 +
 +            /* Two explicit loops to avoid a conditional inside the loop */
 +            for (kx = kxstart; kx < maxkx; kx++)
 +            {
 +                mx = kx;
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                mhx[kx]   = mhxk;
 +                mhy[kx]   = mhyk;
 +                mhz[kx]   = mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +                tmp2[kx]  = sqrt(factor*m2k);
 +            }
 +
 +            for (kx = maxkx; kx < kxend; kx++)
 +            {
 +                mx = (kx - nx);
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                mhx[kx]   = mhxk;
 +                mhy[kx]   = mhyk;
 +                mhz[kx]   = mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +                tmp2[kx]  = sqrt(factor*m2k);
 +            }
 +
 +            calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
 +
 +            for (kx = kxstart; kx < kxend; kx++)
 +            {
 +                m2k   = factor*m2[kx];
 +                eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
 +                          + 2.0*m2k*tmp2[kx]);
 +                vterm    = 3.0*(-tmp1[kx] + tmp2[kx]);
 +                tmp1[kx] = eterm*denom[kx];
 +                tmp2[kx] = vterm*denom[kx];
 +            }
 +
 +            if (!bLB)
 +            {
 +                t_complex *p0;
 +                real       struct2;
 +
 +                p0 = grid[0] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 +                for (kx = kxstart; kx < kxend; kx++, p0++)
 +                {
 +                    d1      = p0->re;
 +                    d2      = p0->im;
 +
 +                    eterm   = tmp1[kx];
 +                    vterm   = tmp2[kx];
 +                    p0->re  = d1*eterm;
 +                    p0->im  = d2*eterm;
 +
 +                    struct2 = 2.0*(d1*d1+d2*d2);
 +
 +                    tmp1[kx] = eterm*struct2;
 +                    tmp2[kx] = vterm*struct2;
 +                }
 +            }
 +            else
 +            {
 +                real *struct2 = denom;
 +                real  str2;
 +
 +                for (kx = kxstart; kx < kxend; kx++)
 +                {
 +                    struct2[kx] = 0.0;
 +                }
 +                /* Due to symmetry we only need to calculate 4 of the 7 terms */
 +                for (ig = 0; ig <= 3; ++ig)
 +                {
 +                    t_complex *p0, *p1;
 +                    real       scale;
 +
 +                    p0    = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 +                    p1    = grid[6-ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 +                    scale = 2.0*lb_scale_factor_symm[ig];
 +                    for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
 +                    {
 +                        struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im);
 +                    }
 +
 +                }
 +                for (ig = 0; ig <= 6; ++ig)
 +                {
 +                    t_complex *p0;
 +
 +                    p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 +                    for (kx = kxstart; kx < kxend; kx++, p0++)
 +                    {
 +                        d1     = p0->re;
 +                        d2     = p0->im;
 +
 +                        eterm  = tmp1[kx];
 +                        p0->re = d1*eterm;
 +                        p0->im = d2*eterm;
 +                    }
 +                }
 +                for (kx = kxstart; kx < kxend; kx++)
 +                {
 +                    eterm    = tmp1[kx];
 +                    vterm    = tmp2[kx];
 +                    str2     = struct2[kx];
 +                    tmp1[kx] = eterm*str2;
 +                    tmp2[kx] = vterm*str2;
 +                }
 +            }
 +
 +            for (kx = kxstart; kx < kxend; kx++)
 +            {
 +                ets2     = corner_fac*tmp1[kx];
 +                vterm    = 2.0*factor*tmp2[kx];
 +                energy  += ets2;
 +                ets2vf   = corner_fac*vterm;
 +                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
 +                virxy   += ets2vf*mhx[kx]*mhy[kx];
 +                virxz   += ets2vf*mhx[kx]*mhz[kx];
 +                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
 +                viryz   += ets2vf*mhy[kx]*mhz[kx];
 +                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
 +            }
 +        }
 +        else
 +        {
 +            /* We don't need to calculate the energy and the virial.
 +             *  In this case the triclinic overhead is small.
 +             */
 +
 +            /* Two explicit loops to avoid a conditional inside the loop */
 +
 +            for (kx = kxstart; kx < maxkx; kx++)
 +            {
 +                mx = kx;
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +                tmp2[kx]  = sqrt(factor*m2k);
 +            }
 +
 +            for (kx = maxkx; kx < kxend; kx++)
 +            {
 +                mx = (kx - nx);
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +                tmp2[kx]  = sqrt(factor*m2k);
 +            }
 +
 +            calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
 +
 +            for (kx = kxstart; kx < kxend; kx++)
 +            {
 +                m2k    = factor*m2[kx];
 +                eterm  = -((1.0 - 2.0*m2k)*tmp1[kx]
 +                           + 2.0*m2k*tmp2[kx]);
 +                tmp1[kx] = eterm*denom[kx];
 +            }
 +            gcount = (bLB ? 7 : 1);
 +            for (ig = 0; ig < gcount; ++ig)
 +            {
 +                t_complex *p0;
 +
 +                p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 +                for (kx = kxstart; kx < kxend; kx++, p0++)
 +                {
 +                    d1      = p0->re;
 +                    d2      = p0->im;
 +
 +                    eterm   = tmp1[kx];
 +
 +                    p0->re  = d1*eterm;
 +                    p0->im  = d2*eterm;
 +                }
 +            }
 +        }
 +    }
 +    if (bEnerVir)
 +    {
 +        work->vir_lj[XX][XX] = 0.25*virxx;
 +        work->vir_lj[YY][YY] = 0.25*viryy;
 +        work->vir_lj[ZZ][ZZ] = 0.25*virzz;
 +        work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25*virxy;
 +        work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25*virxz;
 +        work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25*viryz;
 +
 +        /* This energy should be corrected for a charged system */
 +        work->energy_lj = 0.5*energy;
 +    }
 +    /* Return the loop count */
 +    return local_ndata[YY]*local_ndata[XX];
 +}
 +
 +static void get_pme_ener_vir_q(const gmx_pme_t pme, int nthread,
 +                               real *mesh_energy, matrix vir)
 +{
 +    /* This function sums output over threads and should therefore
 +     * only be called after thread synchronization.
 +     */
 +    int thread;
 +
 +    *mesh_energy = pme->work[0].energy_q;
 +    copy_mat(pme->work[0].vir_q, vir);
 +
 +    for (thread = 1; thread < nthread; thread++)
 +    {
 +        *mesh_energy += pme->work[thread].energy_q;
 +        m_add(vir, pme->work[thread].vir_q, vir);
 +    }
 +}
 +
 +static void get_pme_ener_vir_lj(const gmx_pme_t pme, int nthread,
 +                                real *mesh_energy, matrix vir)
 +{
 +    /* This function sums output over threads and should therefore
 +     * only be called after thread synchronization.
 +     */
 +    int thread;
 +
 +    *mesh_energy = pme->work[0].energy_lj;
 +    copy_mat(pme->work[0].vir_lj, vir);
 +
 +    for (thread = 1; thread < nthread; thread++)
 +    {
 +        *mesh_energy += pme->work[thread].energy_lj;
 +        m_add(vir, pme->work[thread].vir_lj, vir);
 +    }
 +}
 +
 +
 +#define DO_FSPLINE(order)                      \
 +    for (ithx = 0; (ithx < order); ithx++)              \
 +    {                                              \
 +        index_x = (i0+ithx)*pny*pnz;               \
 +        tx      = thx[ithx];                       \
 +        dx      = dthx[ithx];                      \
 +                                               \
 +        for (ithy = 0; (ithy < order); ithy++)          \
 +        {                                          \
 +            index_xy = index_x+(j0+ithy)*pnz;      \
 +            ty       = thy[ithy];                  \
 +            dy       = dthy[ithy];                 \
 +            fxy1     = fz1 = 0;                    \
 +                                               \
 +            for (ithz = 0; (ithz < order); ithz++)      \
 +            {                                      \
 +                gval  = grid[index_xy+(k0+ithz)];  \
 +                fxy1 += thz[ithz]*gval;            \
 +                fz1  += dthz[ithz]*gval;           \
 +            }                                      \
 +            fx += dx*ty*fxy1;                      \
 +            fy += tx*dy*fxy1;                      \
 +            fz += tx*ty*fz1;                       \
 +        }                                          \
 +    }
 +
 +
 +static void gather_f_bsplines(gmx_pme_t pme, real *grid,
 +                              gmx_bool bClearF, pme_atomcomm_t *atc,
 +                              splinedata_t *spline,
 +                              real scale)
 +{
 +    /* sum forces for local particles */
 +    int     nn, n, ithx, ithy, ithz, i0, j0, k0;
 +    int     index_x, index_xy;
 +    int     nx, ny, nz, pnx, pny, pnz;
 +    int *   idxptr;
 +    real    tx, ty, dx, dy, coefficient;
 +    real    fx, fy, fz, gval;
 +    real    fxy1, fz1;
 +    real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
 +    int     norder;
 +    real    rxx, ryx, ryy, rzx, rzy, rzz;
 +    int     order;
 +
 +    pme_spline_work_t *work;
 +
 +#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
 +    real           thz_buffer[GMX_SIMD4_WIDTH*3],  *thz_aligned;
 +    real           dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;
 +
 +    thz_aligned  = gmx_simd4_align_r(thz_buffer);
 +    dthz_aligned = gmx_simd4_align_r(dthz_buffer);
 +#endif
 +
 +    work = pme->spline_work;
 +
 +    order = pme->pme_order;
 +    thx   = spline->theta[XX];
 +    thy   = spline->theta[YY];
 +    thz   = spline->theta[ZZ];
 +    dthx  = spline->dtheta[XX];
 +    dthy  = spline->dtheta[YY];
 +    dthz  = spline->dtheta[ZZ];
 +    nx    = pme->nkx;
 +    ny    = pme->nky;
 +    nz    = pme->nkz;
 +    pnx   = pme->pmegrid_nx;
 +    pny   = pme->pmegrid_ny;
 +    pnz   = pme->pmegrid_nz;
 +
 +    rxx   = pme->recipbox[XX][XX];
 +    ryx   = pme->recipbox[YY][XX];
 +    ryy   = pme->recipbox[YY][YY];
 +    rzx   = pme->recipbox[ZZ][XX];
 +    rzy   = pme->recipbox[ZZ][YY];
 +    rzz   = pme->recipbox[ZZ][ZZ];
 +
 +    for (nn = 0; nn < spline->n; nn++)
 +    {
 +        n           = spline->ind[nn];
 +        coefficient = scale*atc->coefficient[n];
 +
 +        if (bClearF)
 +        {
 +            atc->f[n][XX] = 0;
 +            atc->f[n][YY] = 0;
 +            atc->f[n][ZZ] = 0;
 +        }
 +        if (coefficient != 0)
 +        {
 +            fx     = 0;
 +            fy     = 0;
 +            fz     = 0;
 +            idxptr = atc->idx[n];
 +            norder = nn*order;
 +
 +            i0   = idxptr[XX];
 +            j0   = idxptr[YY];
 +            k0   = idxptr[ZZ];
 +
 +            /* Pointer arithmetic alert, next six statements */
 +            thx  = spline->theta[XX] + norder;
 +            thy  = spline->theta[YY] + norder;
 +            thz  = spline->theta[ZZ] + norder;
 +            dthx = spline->dtheta[XX] + norder;
 +            dthy = spline->dtheta[YY] + norder;
 +            dthz = spline->dtheta[ZZ] + norder;
 +
 +            switch (order)
 +            {
 +                case 4:
 +#ifdef PME_SIMD4_SPREAD_GATHER
 +#ifdef PME_SIMD4_UNALIGNED
 +#define PME_GATHER_F_SIMD4_ORDER4
 +#else
 +#define PME_GATHER_F_SIMD4_ALIGNED
 +#define PME_ORDER 4
 +#endif
 +#include "pme_simd4.h"
 +#else
 +                    DO_FSPLINE(4);
 +#endif
 +                    break;
 +                case 5:
 +#ifdef PME_SIMD4_SPREAD_GATHER
 +#define PME_GATHER_F_SIMD4_ALIGNED
 +#define PME_ORDER 5
 +#include "pme_simd4.h"
 +#else
 +                    DO_FSPLINE(5);
 +#endif
 +                    break;
 +                default:
 +                    DO_FSPLINE(order);
 +                    break;
 +            }
 +
 +            atc->f[n][XX] += -coefficient*( fx*nx*rxx );
 +            atc->f[n][YY] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
 +            atc->f[n][ZZ] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
 +        }
 +    }
 +    /* Since the energy and not forces are interpolated
 +     * the net force might not be exactly zero.
 +     * This can be solved by also interpolating F, but
 +     * that comes at a cost.
 +     * A better hack is to remove the net force every
 +     * step, but that must be done at a higher level
 +     * since this routine doesn't see all atoms if running
 +     * in parallel. Don't know how important it is?  EL 990726
 +     */
 +}
 +
 +
 +static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
 +                                   pme_atomcomm_t *atc)
 +{
 +    splinedata_t *spline;
 +    int     n, ithx, ithy, ithz, i0, j0, k0;
 +    int     index_x, index_xy;
 +    int *   idxptr;
 +    real    energy, pot, tx, ty, coefficient, gval;
 +    real    *thx, *thy, *thz;
 +    int     norder;
 +    int     order;
 +
 +    spline = &atc->spline[0];
 +
 +    order = pme->pme_order;
 +
 +    energy = 0;
 +    for (n = 0; (n < atc->n); n++)
 +    {
 +        coefficient      = atc->coefficient[n];
 +
 +        if (coefficient != 0)
 +        {
 +            idxptr = atc->idx[n];
 +            norder = n*order;
 +
 +            i0   = idxptr[XX];
 +            j0   = idxptr[YY];
 +            k0   = idxptr[ZZ];
 +
 +            /* Pointer arithmetic alert, next three statements */
 +            thx  = spline->theta[XX] + norder;
 +            thy  = spline->theta[YY] + norder;
 +            thz  = spline->theta[ZZ] + norder;
 +
 +            pot = 0;
 +            for (ithx = 0; (ithx < order); ithx++)
 +            {
 +                index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
 +                tx      = thx[ithx];
 +
 +                for (ithy = 0; (ithy < order); ithy++)
 +                {
 +                    index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
 +                    ty       = thy[ithy];
 +
 +                    for (ithz = 0; (ithz < order); ithz++)
 +                    {
 +                        gval  = grid[index_xy+(k0+ithz)];
 +                        pot  += tx*ty*thz[ithz]*gval;
 +                    }
 +
 +                }
 +            }
 +
 +            energy += pot*coefficient;
 +        }
 +    }
 +
 +    return energy;
 +}
 +
 +/* Macro to force loop unrolling by fixing order.
 + * This gives a significant performance gain.
 + */
 +#define CALC_SPLINE(order)                     \
 +    {                                              \
 +        int j, k, l;                                 \
 +        real dr, div;                               \
 +        real data[PME_ORDER_MAX];                  \
 +        real ddata[PME_ORDER_MAX];                 \
 +                                               \
 +        for (j = 0; (j < DIM); j++)                     \
 +        {                                          \
 +            dr  = xptr[j];                         \
 +                                               \
 +            /* dr is relative offset from lower cell limit */ \
 +            data[order-1] = 0;                     \
 +            data[1]       = dr;                          \
 +            data[0]       = 1 - dr;                      \
 +                                               \
 +            for (k = 3; (k < order); k++)               \
 +            {                                      \
 +                div       = 1.0/(k - 1.0);               \
 +                data[k-1] = div*dr*data[k-2];      \
 +                for (l = 1; (l < (k-1)); l++)           \
 +                {                                  \
 +                    data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
 +                                       data[k-l-1]);                \
 +                }                                  \
 +                data[0] = div*(1-dr)*data[0];      \
 +            }                                      \
 +            /* differentiate */                    \
 +            ddata[0] = -data[0];                   \
 +            for (k = 1; (k < order); k++)               \
 +            {                                      \
 +                ddata[k] = data[k-1] - data[k];    \
 +            }                                      \
 +                                               \
 +            div           = 1.0/(order - 1);                 \
 +            data[order-1] = div*dr*data[order-2];  \
 +            for (l = 1; (l < (order-1)); l++)           \
 +            {                                      \
 +                data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
 +                                       (order-l-dr)*data[order-l-1]); \
 +            }                                      \
 +            data[0] = div*(1 - dr)*data[0];        \
 +                                               \
 +            for (k = 0; k < order; k++)                 \
 +            {                                      \
 +                theta[j][i*order+k]  = data[k];    \
 +                dtheta[j][i*order+k] = ddata[k];   \
 +            }                                      \
 +        }                                          \
 +    }
 +
 +void make_bsplines(splinevec theta, splinevec dtheta, int order,
 +                   rvec fractx[], int nr, int ind[], real coefficient[],
 +                   gmx_bool bDoSplines)
 +{
 +    /* construct splines for local atoms */
 +    int  i, ii;
 +    real *xptr;
 +
 +    for (i = 0; i < nr; i++)
 +    {
 +        /* With free energy we do not use the coefficient check.
 +         * In most cases this will be more efficient than calling make_bsplines
 +         * twice, since usually more than half the particles have non-zero coefficients.
 +         */
 +        ii = ind[i];
 +        if (bDoSplines || coefficient[ii] != 0.0)
 +        {
 +            xptr = fractx[ii];
 +            switch (order)
 +            {
 +                case 4:  CALC_SPLINE(4);     break;
 +                case 5:  CALC_SPLINE(5);     break;
 +                default: CALC_SPLINE(order); break;
 +            }
 +        }
 +    }
 +}
 +
 +
 +void make_dft_mod(real *mod, real *data, int ndata)
 +{
 +    int i, j;
 +    real sc, ss, arg;
 +
 +    for (i = 0; i < ndata; i++)
 +    {
 +        sc = ss = 0;
 +        for (j = 0; j < ndata; j++)
 +        {
 +            arg = (2.0*M_PI*i*j)/ndata;
 +            sc += data[j]*cos(arg);
 +            ss += data[j]*sin(arg);
 +        }
 +        mod[i] = sc*sc+ss*ss;
 +    }
 +    for (i = 0; i < ndata; i++)
 +    {
 +        if (mod[i] < 1e-7)
 +        {
 +            mod[i] = (mod[i-1]+mod[i+1])*0.5;
 +        }
 +    }
 +}
 +
 +
 +static void make_bspline_moduli(splinevec bsp_mod,
 +                                int nx, int ny, int nz, int order)
 +{
 +    int nmax = max(nx, max(ny, nz));
 +    real *data, *ddata, *bsp_data;
 +    int i, k, l;
 +    real div;
 +
 +    snew(data, order);
 +    snew(ddata, order);
 +    snew(bsp_data, nmax);
 +
 +    data[order-1] = 0;
 +    data[1]       = 0;
 +    data[0]       = 1;
 +
 +    for (k = 3; k < order; k++)
 +    {
 +        div       = 1.0/(k-1.0);
 +        data[k-1] = 0;
 +        for (l = 1; l < (k-1); l++)
 +        {
 +            data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
 +        }
 +        data[0] = div*data[0];
 +    }
 +    /* differentiate */
 +    ddata[0] = -data[0];
 +    for (k = 1; k < order; k++)
 +    {
 +        ddata[k] = data[k-1]-data[k];
 +    }
 +    div           = 1.0/(order-1);
 +    data[order-1] = 0;
 +    for (l = 1; l < (order-1); l++)
 +    {
 +        data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
 +    }
 +    data[0] = div*data[0];
 +
 +    for (i = 0; i < nmax; i++)
 +    {
 +        bsp_data[i] = 0;
 +    }
 +    for (i = 1; i <= order; i++)
 +    {
 +        bsp_data[i] = data[i-1];
 +    }
 +
 +    make_dft_mod(bsp_mod[XX], bsp_data, nx);
 +    make_dft_mod(bsp_mod[YY], bsp_data, ny);
 +    make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
 +
 +    sfree(data);
 +    sfree(ddata);
 +    sfree(bsp_data);
 +}
 +
 +
 +/* Return the P3M optimal influence function */
 +static double do_p3m_influence(double z, int order)
 +{
 +    double z2, z4;
 +
 +    z2 = z*z;
 +    z4 = z2*z2;
 +
 +    /* The formula and most constants can be found in:
 +     * Ballenegger et al., JCTC 8, 936 (2012)
 +     */
 +    switch (order)
 +    {
 +        case 2:
 +            return 1.0 - 2.0*z2/3.0;
 +            break;
 +        case 3:
 +            return 1.0 - z2 + 2.0*z4/15.0;
 +            break;
 +        case 4:
 +            return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
 +            break;
 +        case 5:
 +            return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
 +            break;
 +        case 6:
 +            return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
 +            break;
 +        case 7:
 +            return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
 +        case 8:
 +            return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
 +            break;
 +    }
 +
 +    return 0.0;
 +}
 +
 +/* Calculate the P3M B-spline moduli for one dimension */
 +static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
 +{
 +    double zarg, zai, sinzai, infl;
 +    int    maxk, i;
 +
 +    if (order > 8)
 +    {
 +        gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
 +    }
 +
 +    zarg = M_PI/n;
 +
 +    maxk = (n + 1)/2;
 +
 +    for (i = -maxk; i < 0; i++)
 +    {
 +        zai          = zarg*i;
 +        sinzai       = sin(zai);
 +        infl         = do_p3m_influence(sinzai, order);
 +        bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
 +    }
 +    bsp_mod[0] = 1.0;
 +    for (i = 1; i < maxk; i++)
 +    {
 +        zai        = zarg*i;
 +        sinzai     = sin(zai);
 +        infl       = do_p3m_influence(sinzai, order);
 +        bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
 +    }
 +}
 +
 +/* Calculate the P3M B-spline moduli */
 +static void make_p3m_bspline_moduli(splinevec bsp_mod,
 +                                    int nx, int ny, int nz, int order)
 +{
 +    make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
 +    make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
 +    make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
 +}
 +
 +
 +static void setup_coordinate_communication(pme_atomcomm_t *atc)
 +{
 +    int nslab, n, i;
 +    int fw, bw;
 +
 +    nslab = atc->nslab;
 +
 +    n = 0;
 +    for (i = 1; i <= nslab/2; i++)
 +    {
 +        fw = (atc->nodeid + i) % nslab;
 +        bw = (atc->nodeid - i + nslab) % nslab;
 +        if (n < nslab - 1)
 +        {
 +            atc->node_dest[n] = fw;
 +            atc->node_src[n]  = bw;
 +            n++;
 +        }
 +        if (n < nslab - 1)
 +        {
 +            atc->node_dest[n] = bw;
 +            atc->node_src[n]  = fw;
 +            n++;
 +        }
 +    }
 +}
 +
 +int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
 +{
 +    int thread, i;
 +
 +    if (NULL != log)
 +    {
 +        fprintf(log, "Destroying PME data structures.\n");
 +    }
 +
 +    sfree((*pmedata)->nnx);
 +    sfree((*pmedata)->nny);
 +    sfree((*pmedata)->nnz);
 +
 +    for (i = 0; i < (*pmedata)->ngrids; ++i)
 +    {
 +        pmegrids_destroy(&(*pmedata)->pmegrid[i]);
 +        sfree((*pmedata)->fftgrid[i]);
 +        sfree((*pmedata)->cfftgrid[i]);
 +        gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
 +    }
 +
 +    sfree((*pmedata)->lb_buf1);
 +    sfree((*pmedata)->lb_buf2);
 +
 +    for (thread = 0; thread < (*pmedata)->nthread; thread++)
 +    {
 +        free_work(&(*pmedata)->work[thread]);
 +    }
 +    sfree((*pmedata)->work);
 +
 +    sfree(*pmedata);
 +    *pmedata = NULL;
 +
 +    return 0;
 +}
 +
 +static int mult_up(int n, int f)
 +{
 +    return ((n + f - 1)/f)*f;
 +}
 +
 +
 +static double pme_load_imbalance(gmx_pme_t pme)
 +{
 +    int    nma, nmi;
 +    double n1, n2, n3;
 +
 +    nma = pme->nnodes_major;
 +    nmi = pme->nnodes_minor;
 +
 +    n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
 +    n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
 +    n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
 +
 +    /* pme_solve is roughly double the cost of an fft */
 +
 +    return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
 +}
 +
 +static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc,
 +                          int dimind, gmx_bool bSpread)
 +{
 +    int nk, k, s, thread;
 +
 +    atc->dimind    = dimind;
 +    atc->nslab     = 1;
 +    atc->nodeid    = 0;
 +    atc->pd_nalloc = 0;
 +#ifdef GMX_MPI
 +    if (pme->nnodes > 1)
 +    {
 +        atc->mpi_comm = pme->mpi_comm_d[dimind];
 +        MPI_Comm_size(atc->mpi_comm, &atc->nslab);
 +        MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
 +    }
 +#endif
 +
 +    atc->bSpread   = bSpread;
 +    atc->pme_order = pme->pme_order;
 +
 +    if (atc->nslab > 1)
 +    {
 +        snew(atc->node_dest, atc->nslab);
 +        snew(atc->node_src, atc->nslab);
 +        setup_coordinate_communication(atc);
 +
 +        snew(atc->count_thread, pme->nthread);
 +        for (thread = 0; thread < pme->nthread; thread++)
 +        {
 +            snew(atc->count_thread[thread], atc->nslab);
 +        }
 +        atc->count = atc->count_thread[0];
 +        snew(atc->rcount, atc->nslab);
 +        snew(atc->buf_index, atc->nslab);
 +    }
 +
 +    atc->nthread = pme->nthread;
 +    if (atc->nthread > 1)
 +    {
 +        snew(atc->thread_plist, atc->nthread);
 +    }
 +    snew(atc->spline, atc->nthread);
 +    for (thread = 0; thread < atc->nthread; thread++)
 +    {
 +        if (atc->nthread > 1)
 +        {
 +            snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
 +            atc->thread_plist[thread].n += GMX_CACHE_SEP;
 +        }
 +        snew(atc->spline[thread].thread_one, pme->nthread);
 +        atc->spline[thread].thread_one[thread] = 1;
 +    }
 +}
 +
 +static void
 +init_overlap_comm(pme_overlap_t *  ol,
 +                  int              norder,
 +#ifdef GMX_MPI
 +                  MPI_Comm         comm,
 +#endif
 +                  int              nnodes,
 +                  int              nodeid,
 +                  int              ndata,
 +                  int              commplainsize)
 +{
 +    int lbnd, rbnd, maxlr, b, i;
 +    int exten;
 +    int nn, nk;
 +    pme_grid_comm_t *pgc;
 +    gmx_bool bCont;
 +    int fft_start, fft_end, send_index1, recv_index1;
 +#ifdef GMX_MPI
 +    MPI_Status stat;
 +
 +    ol->mpi_comm = comm;
 +#endif
 +
 +    ol->nnodes = nnodes;
 +    ol->nodeid = nodeid;
 +
 +    /* Linear translation of the PME grid won't affect reciprocal space
 +     * calculations, so to optimize we only interpolate "upwards",
 +     * which also means we only have to consider overlap in one direction.
 +     * I.e., particles on this node might also be spread to grid indices
 +     * that belong to higher nodes (modulo nnodes)
 +     */
 +
 +    snew(ol->s2g0, ol->nnodes+1);
 +    snew(ol->s2g1, ol->nnodes);
 +    if (debug)
 +    {
 +        fprintf(debug, "PME slab boundaries:");
 +    }
 +    for (i = 0; i < nnodes; i++)
 +    {
 +        /* s2g0 the local interpolation grid start.
 +         * s2g1 the local interpolation grid end.
-     ivec ne;
++         * Since in calc_pidx we divide particles, and not grid lines,
++         * spatially uniform along dimension x or y, we need to round
++         * s2g0 down and s2g1 up.
 +         */
 +        ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
 +        ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "  %3d %3d", ol->s2g0[i], ol->s2g1[i]);
 +        }
 +    }
 +    ol->s2g0[nnodes] = ndata;
 +    if (debug)
 +    {
 +        fprintf(debug, "\n");
 +    }
 +
 +    /* Determine with how many nodes we need to communicate the grid overlap */
 +    b = 0;
 +    do
 +    {
 +        b++;
 +        bCont = FALSE;
 +        for (i = 0; i < nnodes; i++)
 +        {
 +            if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
 +                (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
 +            {
 +                bCont = TRUE;
 +            }
 +        }
 +    }
 +    while (bCont && b < nnodes);
 +    ol->noverlap_nodes = b - 1;
 +
 +    snew(ol->send_id, ol->noverlap_nodes);
 +    snew(ol->recv_id, ol->noverlap_nodes);
 +    for (b = 0; b < ol->noverlap_nodes; b++)
 +    {
 +        ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
 +        ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
 +    }
 +    snew(ol->comm_data, ol->noverlap_nodes);
 +
 +    ol->send_size = 0;
 +    for (b = 0; b < ol->noverlap_nodes; b++)
 +    {
 +        pgc = &ol->comm_data[b];
 +        /* Send */
 +        fft_start        = ol->s2g0[ol->send_id[b]];
 +        fft_end          = ol->s2g0[ol->send_id[b]+1];
 +        if (ol->send_id[b] < nodeid)
 +        {
 +            fft_start += ndata;
 +            fft_end   += ndata;
 +        }
 +        send_index1       = ol->s2g1[nodeid];
 +        send_index1       = min(send_index1, fft_end);
 +        pgc->send_index0  = fft_start;
 +        pgc->send_nindex  = max(0, send_index1 - pgc->send_index0);
 +        ol->send_size    += pgc->send_nindex;
 +
 +        /* We always start receiving to the first index of our slab */
 +        fft_start        = ol->s2g0[ol->nodeid];
 +        fft_end          = ol->s2g0[ol->nodeid+1];
 +        recv_index1      = ol->s2g1[ol->recv_id[b]];
 +        if (ol->recv_id[b] > nodeid)
 +        {
 +            recv_index1 -= ndata;
 +        }
 +        recv_index1      = min(recv_index1, fft_end);
 +        pgc->recv_index0 = fft_start;
 +        pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
 +    }
 +
 +#ifdef GMX_MPI
 +    /* Communicate the buffer sizes to receive */
 +    for (b = 0; b < ol->noverlap_nodes; b++)
 +    {
 +        MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
 +                     &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
 +                     ol->mpi_comm, &stat);
 +    }
 +#endif
 +
 +    /* For non-divisible grid we need pme_order iso pme_order-1 */
 +    snew(ol->sendbuf, norder*commplainsize);
 +    snew(ol->recvbuf, norder*commplainsize);
 +}
 +
 +static void
 +make_gridindex5_to_localindex(int n, int local_start, int local_range,
 +                              int **global_to_local,
 +                              real **fraction_shift)
 +{
 +    int i;
 +    int * gtl;
 +    real * fsh;
 +
 +    snew(gtl, 5*n);
 +    snew(fsh, 5*n);
 +    for (i = 0; (i < 5*n); i++)
 +    {
 +        /* Determine the global to local grid index */
 +        gtl[i] = (i - local_start + n) % n;
 +        /* For coordinates that fall within the local grid the fraction
 +         * is correct, we don't need to shift it.
 +         */
 +        fsh[i] = 0;
 +        if (local_range < n)
 +        {
 +            /* Due to rounding issues i could be 1 beyond the lower or
 +             * upper boundary of the local grid. Correct the index for this.
 +             * If we shift the index, we need to shift the fraction by
 +             * the same amount in the other direction to not affect
 +             * the weights.
 +             * Note that due to this shifting the weights at the end of
 +             * the spline might change, but that will only involve values
 +             * between zero and values close to the precision of a real,
 +             * which is anyhow the accuracy of the whole mesh calculation.
 +             */
 +            /* With local_range=0 we should not change i=local_start */
 +            if (i % n != local_start)
 +            {
 +                if (gtl[i] == n-1)
 +                {
 +                    gtl[i] = 0;
 +                    fsh[i] = -1;
 +                }
 +                else if (gtl[i] == local_range)
 +                {
 +                    gtl[i] = local_range - 1;
 +                    fsh[i] = 1;
 +                }
 +            }
 +        }
 +    }
 +
 +    *global_to_local = gtl;
 +    *fraction_shift  = fsh;
 +}
 +
 +static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
 +{
 +    pme_spline_work_t *work;
 +
 +#ifdef PME_SIMD4_SPREAD_GATHER
 +    real             tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
 +    gmx_simd4_real_t zero_S;
 +    gmx_simd4_real_t real_mask_S0, real_mask_S1;
 +    int              of, i;
 +
 +    snew_aligned(work, 1, SIMD4_ALIGNMENT);
 +
 +    tmp_aligned = gmx_simd4_align_r(tmp);
 +
 +    zero_S = gmx_simd4_setzero_r();
 +
 +    /* Generate bit masks to mask out the unused grid entries,
 +     * as we only operate on order of the 8 grid entries that are
 +     * load into 2 SIMD registers.
 +     */
 +    for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
 +    {
 +        for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
 +        {
 +            tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
 +        }
 +        real_mask_S0      = gmx_simd4_load_r(tmp_aligned);
 +        real_mask_S1      = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
 +        work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
 +        work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
 +    }
 +#else
 +    work = NULL;
 +#endif
 +
 +    return work;
 +}
 +
 +void gmx_pme_check_restrictions(int pme_order,
 +                                int nkx, int nky, int nkz,
 +                                int nnodes_major,
 +                                int nnodes_minor,
 +                                gmx_bool bUseThreads,
 +                                gmx_bool bFatal,
 +                                gmx_bool *bValidSettings)
 +{
 +    if (pme_order > PME_ORDER_MAX)
 +    {
 +        if (!bFatal)
 +        {
 +            *bValidSettings = FALSE;
 +            return;
 +        }
 +        gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
 +                  pme_order, PME_ORDER_MAX);
 +    }
 +
 +    if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
 +        nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
 +        nkz <= pme_order)
 +    {
 +        if (!bFatal)
 +        {
 +            *bValidSettings = FALSE;
 +            return;
 +        }
 +        gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
 +                  pme_order);
 +    }
 +
 +    /* Check for a limitation of the (current) sum_fftgrid_dd code.
 +     * We only allow multiple communication pulses in dim 1, not in dim 0.
 +     */
 +    if (bUseThreads && (nkx < nnodes_major*pme_order &&
 +                        nkx != nnodes_major*(pme_order - 1)))
 +    {
 +        if (!bFatal)
 +        {
 +            *bValidSettings = FALSE;
 +            return;
 +        }
 +        gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
 +                  nkx/(double)nnodes_major, pme_order);
 +    }
 +
 +    if (bValidSettings != NULL)
 +    {
 +        *bValidSettings = TRUE;
 +    }
 +
 +    return;
 +}
 +
 +int gmx_pme_init(gmx_pme_t *         pmedata,
 +                 t_commrec *         cr,
 +                 int                 nnodes_major,
 +                 int                 nnodes_minor,
 +                 t_inputrec *        ir,
 +                 int                 homenr,
 +                 gmx_bool            bFreeEnergy_q,
 +                 gmx_bool            bFreeEnergy_lj,
 +                 gmx_bool            bReproducible,
 +                 int                 nthread)
 +{
 +    gmx_pme_t pme = NULL;
 +
 +    int  use_threads, sum_use_threads, i;
 +    ivec ndata;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Creating PME data structures.\n");
 +    }
 +    snew(pme, 1);
 +
 +    pme->sum_qgrid_tmp       = NULL;
 +    pme->sum_qgrid_dd_tmp    = NULL;
 +    pme->buf_nalloc          = 0;
 +
 +    pme->nnodes              = 1;
 +    pme->bPPnode             = TRUE;
 +
 +    pme->nnodes_major        = nnodes_major;
 +    pme->nnodes_minor        = nnodes_minor;
 +
 +#ifdef GMX_MPI
 +    if (nnodes_major*nnodes_minor > 1)
 +    {
 +        pme->mpi_comm = cr->mpi_comm_mygroup;
 +
 +        MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
 +        MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
 +        if (pme->nnodes != nnodes_major*nnodes_minor)
 +        {
 +            gmx_incons("PME rank count mismatch");
 +        }
 +    }
 +    else
 +    {
 +        pme->mpi_comm = MPI_COMM_NULL;
 +    }
 +#endif
 +
 +    if (pme->nnodes == 1)
 +    {
 +#ifdef GMX_MPI
 +        pme->mpi_comm_d[0] = MPI_COMM_NULL;
 +        pme->mpi_comm_d[1] = MPI_COMM_NULL;
 +#endif
 +        pme->ndecompdim   = 0;
 +        pme->nodeid_major = 0;
 +        pme->nodeid_minor = 0;
 +#ifdef GMX_MPI
 +        pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
 +#endif
 +    }
 +    else
 +    {
 +        if (nnodes_minor == 1)
 +        {
 +#ifdef GMX_MPI
 +            pme->mpi_comm_d[0] = pme->mpi_comm;
 +            pme->mpi_comm_d[1] = MPI_COMM_NULL;
 +#endif
 +            pme->ndecompdim   = 1;
 +            pme->nodeid_major = pme->nodeid;
 +            pme->nodeid_minor = 0;
 +
 +        }
 +        else if (nnodes_major == 1)
 +        {
 +#ifdef GMX_MPI
 +            pme->mpi_comm_d[0] = MPI_COMM_NULL;
 +            pme->mpi_comm_d[1] = pme->mpi_comm;
 +#endif
 +            pme->ndecompdim   = 1;
 +            pme->nodeid_major = 0;
 +            pme->nodeid_minor = pme->nodeid;
 +        }
 +        else
 +        {
 +            if (pme->nnodes % nnodes_major != 0)
 +            {
 +                gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
 +            }
 +            pme->ndecompdim = 2;
 +
 +#ifdef GMX_MPI
 +            MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
 +                           pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
 +            MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
 +                           pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
 +
 +            MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
 +            MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
 +            MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
 +            MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
 +#endif
 +        }
 +        pme->bPPnode = (cr->duty & DUTY_PP);
 +    }
 +
 +    pme->nthread = nthread;
 +
 +    /* Check if any of the PME MPI ranks uses threads */
 +    use_threads = (pme->nthread > 1 ? 1 : 0);
 +#ifdef GMX_MPI
 +    if (pme->nnodes > 1)
 +    {
 +        MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
 +                      MPI_SUM, pme->mpi_comm);
 +    }
 +    else
 +#endif
 +    {
 +        sum_use_threads = use_threads;
 +    }
 +    pme->bUseThreads = (sum_use_threads > 0);
 +
 +    if (ir->ePBC == epbcSCREW)
 +    {
 +        gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
 +    }
 +
 +    pme->bFEP_q      = ((ir->efep != efepNO) && bFreeEnergy_q);
 +    pme->bFEP_lj     = ((ir->efep != efepNO) && bFreeEnergy_lj);
 +    pme->bFEP        = (pme->bFEP_q || pme->bFEP_lj);
 +    pme->nkx         = ir->nkx;
 +    pme->nky         = ir->nky;
 +    pme->nkz         = ir->nkz;
 +    pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
 +    pme->pme_order   = ir->pme_order;
 +
 +    /* Always constant electrostatics coefficients */
 +    pme->epsilon_r   = ir->epsilon_r;
 +
 +    /* Always constant LJ coefficients */
 +    pme->ljpme_combination_rule = ir->ljpme_combination_rule;
 +
 +    /* If we violate restrictions, generate a fatal error here */
 +    gmx_pme_check_restrictions(pme->pme_order,
 +                               pme->nkx, pme->nky, pme->nkz,
 +                               pme->nnodes_major,
 +                               pme->nnodes_minor,
 +                               pme->bUseThreads,
 +                               TRUE,
 +                               NULL);
 +
 +    if (pme->nnodes > 1)
 +    {
 +        double imbal;
 +
 +#ifdef GMX_MPI
 +        MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
 +        MPI_Type_commit(&(pme->rvec_mpi));
 +#endif
 +
 +        /* Note that the coefficient spreading and force gathering, which usually
 +         * takes about the same amount of time as FFT+solve_pme,
 +         * is always fully load balanced
 +         * (unless the coefficient distribution is inhomogeneous).
 +         */
 +
 +        imbal = pme_load_imbalance(pme);
 +        if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
 +        {
 +            fprintf(stderr,
 +                    "\n"
 +                    "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
 +                    "      For optimal PME load balancing\n"
 +                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
 +                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
 +                    "\n",
 +                    (int)((imbal-1)*100 + 0.5),
 +                    pme->nkx, pme->nky, pme->nnodes_major,
 +                    pme->nky, pme->nkz, pme->nnodes_minor);
 +        }
 +    }
 +
 +    /* For non-divisible grid we need pme_order iso pme_order-1 */
 +    /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
 +     * y is always copied through a buffer: we don't need padding in z,
 +     * but we do need the overlap in x because of the communication order.
 +     */
 +    init_overlap_comm(&pme->overlap[0], pme->pme_order,
 +#ifdef GMX_MPI
 +                      pme->mpi_comm_d[0],
 +#endif
 +                      pme->nnodes_major, pme->nodeid_major,
 +                      pme->nkx,
 +                      (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
 +
 +    /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
 +     * We do this with an offset buffer of equal size, so we need to allocate
 +     * extra for the offset. That's what the (+1)*pme->nkz is for.
 +     */
 +    init_overlap_comm(&pme->overlap[1], pme->pme_order,
 +#ifdef GMX_MPI
 +                      pme->mpi_comm_d[1],
 +#endif
 +                      pme->nnodes_minor, pme->nodeid_minor,
 +                      pme->nky,
 +                      (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
 +
 +    /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
 +     * Note that gmx_pme_check_restrictions checked for this already.
 +     */
 +    if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
 +    {
 +        gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
 +    }
 +
 +    snew(pme->bsp_mod[XX], pme->nkx);
 +    snew(pme->bsp_mod[YY], pme->nky);
 +    snew(pme->bsp_mod[ZZ], pme->nkz);
 +
 +    /* The required size of the interpolation grid, including overlap.
 +     * The allocated size (pmegrid_n?) might be slightly larger.
 +     */
 +    pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
 +        pme->overlap[0].s2g0[pme->nodeid_major];
 +    pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
 +        pme->overlap[1].s2g0[pme->nodeid_minor];
 +    pme->pmegrid_nz_base = pme->nkz;
 +    pme->pmegrid_nz      = pme->pmegrid_nz_base + pme->pme_order - 1;
 +    set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
 +
 +    pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
 +    pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
 +    pme->pmegrid_start_iz = 0;
 +
 +    make_gridindex5_to_localindex(pme->nkx,
 +                                  pme->pmegrid_start_ix,
 +                                  pme->pmegrid_nx - (pme->pme_order-1),
 +                                  &pme->nnx, &pme->fshx);
 +    make_gridindex5_to_localindex(pme->nky,
 +                                  pme->pmegrid_start_iy,
 +                                  pme->pmegrid_ny - (pme->pme_order-1),
 +                                  &pme->nny, &pme->fshy);
 +    make_gridindex5_to_localindex(pme->nkz,
 +                                  pme->pmegrid_start_iz,
 +                                  pme->pmegrid_nz_base,
 +                                  &pme->nnz, &pme->fshz);
 +
 +    pme->spline_work = make_pme_spline_work(pme->pme_order);
 +
 +    ndata[0]    = pme->nkx;
 +    ndata[1]    = pme->nky;
 +    ndata[2]    = pme->nkz;
 +    /* It doesn't matter if we allocate too many grids here,
 +     * we only allocate and use the ones we need.
 +     */
 +    if (EVDW_PME(ir->vdwtype))
 +    {
 +        pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
 +    }
 +    else
 +    {
 +        pme->ngrids = DO_Q;
 +    }
 +    snew(pme->fftgrid, pme->ngrids);
 +    snew(pme->cfftgrid, pme->ngrids);
 +    snew(pme->pfft_setup, pme->ngrids);
 +
 +    for (i = 0; i < pme->ngrids; ++i)
 +    {
 +        if ((i <  DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
 +                                                       bFreeEnergy_q)) ||
 +            (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
 +                                                    bFreeEnergy_lj ||
 +                                                    ir->ljpme_combination_rule == eljpmeLB)))
 +        {
 +            pmegrids_init(&pme->pmegrid[i],
 +                          pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
 +                          pme->pmegrid_nz_base,
 +                          pme->pme_order,
 +                          pme->bUseThreads,
 +                          pme->nthread,
 +                          pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
 +                          pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
 +            /* This routine will allocate the grid data to fit the FFTs */
 +            gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
 +                                    &pme->fftgrid[i], &pme->cfftgrid[i],
 +                                    pme->mpi_comm_d,
 +                                    bReproducible, pme->nthread);
 +
 +        }
 +    }
 +
 +    if (!pme->bP3M)
 +    {
 +        /* Use plain SPME B-spline interpolation */
 +        make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
 +    }
 +    else
 +    {
 +        /* Use the P3M grid-optimized influence function */
 +        make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
 +    }
 +
 +    /* Use atc[0] for spreading */
 +    init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
 +    if (pme->ndecompdim >= 2)
 +    {
 +        init_atomcomm(pme, &pme->atc[1], 1, FALSE);
 +    }
 +
 +    if (pme->nnodes == 1)
 +    {
 +        pme->atc[0].n = homenr;
 +        pme_realloc_atomcomm_things(&pme->atc[0]);
 +    }
 +
 +    pme->lb_buf1       = NULL;
 +    pme->lb_buf2       = NULL;
 +    pme->lb_buf_nalloc = 0;
 +
 +    {
 +        int thread;
 +
 +        /* Use fft5d, order after FFT is y major, z, x minor */
 +
 +        snew(pme->work, pme->nthread);
 +        for (thread = 0; thread < pme->nthread; thread++)
 +        {
 +            realloc_work(&pme->work[thread], pme->nkx);
 +        }
 +    }
 +
 +    *pmedata = pme;
 +
 +    return 0;
 +}
 +
 +static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
 +{
 +    int d, t;
 +
 +    for (d = 0; d < DIM; d++)
 +    {
 +        if (new->grid.n[d] > old->grid.n[d])
 +        {
 +            return;
 +        }
 +    }
 +
 +    sfree_aligned(new->grid.grid);
 +    new->grid.grid = old->grid.grid;
 +
 +    if (new->grid_th != NULL && new->nthread == old->nthread)
 +    {
 +        sfree_aligned(new->grid_all);
 +        for (t = 0; t < new->nthread; t++)
 +        {
 +            new->grid_th[t].grid = old->grid_th[t].grid;
 +        }
 +    }
 +}
 +
 +int gmx_pme_reinit(gmx_pme_t *         pmedata,
 +                   t_commrec *         cr,
 +                   gmx_pme_t           pme_src,
 +                   const t_inputrec *  ir,
 +                   ivec                grid_size)
 +{
 +    t_inputrec irc;
 +    int homenr;
 +    int ret;
 +
 +    irc     = *ir;
 +    irc.nkx = grid_size[XX];
 +    irc.nky = grid_size[YY];
 +    irc.nkz = grid_size[ZZ];
 +
 +    if (pme_src->nnodes == 1)
 +    {
 +        homenr = pme_src->atc[0].n;
 +    }
 +    else
 +    {
 +        homenr = -1;
 +    }
 +
 +    ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
 +                       &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, pme_src->nthread);
 +
 +    if (ret == 0)
 +    {
 +        /* We can easily reuse the allocated pme grids in pme_src */
 +        reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
 +        /* We would like to reuse the fft grids, but that's harder */
 +    }
 +
 +    return ret;
 +}
 +
 +
 +static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,
 +                            int grid_index, int thread, real *fftgrid)
 +{
 +    ivec local_fft_ndata, local_fft_offset, local_fft_size;
 +    int  fft_my, fft_mz;
 +    int  nsx, nsy, nsz;
 +    ivec nf;
 +    int  offx, offy, offz, x, y, z, i0, i0t;
 +    int  d;
 +    pmegrid_t *pmegrid;
 +    real *grid_th;
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +    fft_my = local_fft_size[YY];
 +    fft_mz = local_fft_size[ZZ];
 +
 +    pmegrid = &pmegrids->grid_th[thread];
 +
 +    nsx = pmegrid->s[XX];
 +    nsy = pmegrid->s[YY];
 +    nsz = pmegrid->s[ZZ];
 +
 +    for (d = 0; d < DIM; d++)
 +    {
 +        nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
 +                    local_fft_ndata[d] - pmegrid->offset[d]);
 +    }
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +    /* Directly copy the non-overlapping parts of the local grids.
 +     * This also initializes the full grid.
 +     */
 +    grid_th = pmegrid->grid;
 +    for (x = 0; x < nf[XX]; x++)
 +    {
 +        for (y = 0; y < nf[YY]; y++)
 +        {
 +            i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
 +            i0t = (x*nsy + y)*nsz;
 +            for (z = 0; z < nf[ZZ]; z++)
 +            {
 +                fftgrid[i0+z] = grid_th[i0t+z];
 +            }
 +        }
 +    }
 +}
 +
 +static void
 +reduce_threadgrid_overlap(gmx_pme_t pme,
 +                          const pmegrids_t *pmegrids, int thread,
 +                          real *fftgrid, real *commbuf_x, real *commbuf_y,
 +                          int grid_index)
 +{
 +    ivec local_fft_ndata, local_fft_offset, local_fft_size;
 +    int  fft_nx, fft_ny, fft_nz;
 +    int  fft_my, fft_mz;
 +    int  buf_my = -1;
 +    int  nsx, nsy, nsz;
-         ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
-                     local_fft_ndata[d]);
++    ivec localcopy_end;
 +    int  offx, offy, offz, x, y, z, i0, i0t;
 +    int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
 +    gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
 +    gmx_bool bCommX, bCommY;
 +    int  d;
 +    int  thread_f;
 +    const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
 +    const real *grid_th;
 +    real *commbuf = NULL;
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +    fft_nx = local_fft_ndata[XX];
 +    fft_ny = local_fft_ndata[YY];
 +    fft_nz = local_fft_ndata[ZZ];
 +
 +    fft_my = local_fft_size[YY];
 +    fft_mz = local_fft_size[ZZ];
 +
 +    /* This routine is called when all thread have finished spreading.
 +     * Here each thread sums grid contributions calculated by other threads
 +     * to the thread local grid volume.
 +     * To minimize the number of grid copying operations,
 +     * this routines sums immediately from the pmegrid to the fftgrid.
 +     */
 +
 +    /* Determine which part of the full node grid we should operate on,
 +     * this is our thread local part of the full grid.
 +     */
 +    pmegrid = &pmegrids->grid_th[thread];
 +
 +    for (d = 0; d < DIM; d++)
 +    {
-         tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
++        /* Determine up to where our thread needs to copy from the
++         * thread-local charge spreading grid to the rank-local FFT grid.
++         * This is up to our spreading grid end minus order-1 and
++         * not beyond the local FFT grid.
++         */
++        localcopy_end[d] =
++            min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
++                local_fft_ndata[d]);
 +    }
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +
 +    bClearBufX  = TRUE;
 +    bClearBufY  = TRUE;
 +    bClearBufXY = TRUE;
 +
 +    /* Now loop over all the thread data blocks that contribute
 +     * to the grid region we (our thread) are operating on.
 +     */
 +    /* Note that fft_nx/y is equal to the number of grid points
 +     * between the first point of our node grid and the one of the next node.
 +     */
 +    for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
 +    {
 +        fx     = pmegrid->ci[XX] + sx;
 +        ox     = 0;
 +        bCommX = FALSE;
 +        if (fx < 0)
 +        {
 +            fx    += pmegrids->nc[XX];
 +            ox    -= fft_nx;
 +            bCommX = (pme->nnodes_major > 1);
 +        }
 +        pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
 +        ox       += pmegrid_g->offset[XX];
 +        /* Determine the end of our part of the source grid */
-             ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
++        if (!bCommX)
++        {
++            /* Use our thread local source grid and target grid part */
++            tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
++        }
++        else
++        {
++            /* Use our thread local source grid and the spreading range */
++            tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
++        }
 +
 +        for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
 +        {
 +            fy     = pmegrid->ci[YY] + sy;
 +            oy     = 0;
 +            bCommY = FALSE;
 +            if (fy < 0)
 +            {
 +                fy    += pmegrids->nc[YY];
 +                oy    -= fft_ny;
 +                bCommY = (pme->nnodes_minor > 1);
 +            }
 +            pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
 +            oy       += pmegrid_g->offset[YY];
 +            /* Determine the end of our part of the source grid */
-                 tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
++            if (!bCommY)
++            {
++                /* Use our thread local source grid and target grid part */
++                ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
++            }
++            else
++            {
++                /* Use our thread local source grid and the spreading range */
++                ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
++            }
 +
 +            for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
 +            {
 +                fz = pmegrid->ci[ZZ] + sz;
 +                oz = 0;
 +                if (fz < 0)
 +                {
 +                    fz += pmegrids->nc[ZZ];
 +                    oz -= fft_nz;
 +                }
 +                pmegrid_g = &pmegrids->grid_th[fz];
 +                oz       += pmegrid_g->offset[ZZ];
-                         /* The y-size of the communication buffer is order-1 */
-                         buf_my  = pmegrid->order - 1;
++                tz1       = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
 +
 +                if (sx == 0 && sy == 0 && sz == 0)
 +                {
 +                    /* We have already added our local contribution
 +                     * before calling this routine, so skip it here.
 +                     */
 +                    continue;
 +                }
 +
 +                thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
 +
 +                pmegrid_f = &pmegrids->grid_th[thread_f];
 +
 +                grid_th = pmegrid_f->grid;
 +
 +                nsx = pmegrid_f->s[XX];
 +                nsy = pmegrid_f->s[YY];
 +                nsz = pmegrid_f->s[ZZ];
 +
 +#ifdef DEBUG_PME_REDUCE
 +                printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
 +                       pme->nodeid, thread, thread_f,
 +                       pme->pmegrid_start_ix,
 +                       pme->pmegrid_start_iy,
 +                       pme->pmegrid_start_iz,
 +                       sx, sy, sz,
 +                       offx-ox, tx1-ox, offx, tx1,
 +                       offy-oy, ty1-oy, offy, ty1,
 +                       offz-oz, tz1-oz, offz, tz1);
 +#endif
 +
 +                if (!(bCommX || bCommY))
 +                {
 +                    /* Copy from the thread local grid to the node grid */
 +                    for (x = offx; x < tx1; x++)
 +                    {
 +                        for (y = offy; y < ty1; y++)
 +                        {
 +                            i0  = (x*fft_my + y)*fft_mz;
 +                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
 +                            for (z = offz; z < tz1; z++)
 +                            {
 +                                fftgrid[i0+z] += grid_th[i0t+z];
 +                            }
 +                        }
 +                    }
 +                }
 +                else
 +                {
 +                    /* The order of this conditional decides
 +                     * where the corner volume gets stored with x+y decomp.
 +                     */
 +                    if (bCommY)
 +                    {
 +                        commbuf = commbuf_y;
-             fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
++                        /* The y-size of the communication buffer is set by
++                         * the overlap of the grid part of our local slab
++                         * with the part starting at the next slab.
++                         */
++                        buf_my  =
++                            pme->overlap[1].s2g1[pme->nodeid_minor] -
++                            pme->overlap[1].s2g0[pme->nodeid_minor+1];
 +                        if (bCommX)
 +                        {
 +                            /* We index commbuf modulo the local grid size */
 +                            commbuf += buf_my*fft_nx*fft_nz;
 +
 +                            bClearBuf   = bClearBufXY;
 +                            bClearBufXY = FALSE;
 +                        }
 +                        else
 +                        {
 +                            bClearBuf  = bClearBufY;
 +                            bClearBufY = FALSE;
 +                        }
 +                    }
 +                    else
 +                    {
 +                        commbuf    = commbuf_x;
 +                        buf_my     = fft_ny;
 +                        bClearBuf  = bClearBufX;
 +                        bClearBufX = FALSE;
 +                    }
 +
 +                    /* Copy to the communication buffer */
 +                    for (x = offx; x < tx1; x++)
 +                    {
 +                        for (y = offy; y < ty1; y++)
 +                        {
 +                            i0  = (x*buf_my + y)*fft_nz;
 +                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
 +
 +                            if (bClearBuf)
 +                            {
 +                                /* First access of commbuf, initialize it */
 +                                for (z = offz; z < tz1; z++)
 +                                {
 +                                    commbuf[i0+z]  = grid_th[i0t+z];
 +                                }
 +                            }
 +                            else
 +                            {
 +                                for (z = offz; z < tz1; z++)
 +                                {
 +                                    commbuf[i0+z] += grid_th[i0t+z];
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
 +{
 +    ivec local_fft_ndata, local_fft_offset, local_fft_size;
 +    pme_overlap_t *overlap;
 +    int  send_index0, send_nindex;
 +    int  recv_nindex;
 +#ifdef GMX_MPI
 +    MPI_Status stat;
 +#endif
 +    int  send_size_y, recv_size_y;
 +    int  ipulse, send_id, recv_id, datasize, gridsize, size_yx;
 +    real *sendptr, *recvptr;
 +    int  x, y, z, indg, indb;
 +
 +    /* Note that this routine is only used for forward communication.
 +     * Since the force gathering, unlike the coefficient spreading,
 +     * can be trivially parallelized over the particles,
 +     * the backwards process is much simpler and can use the "old"
 +     * communication setup.
 +     */
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    if (pme->nnodes_minor > 1)
 +    {
 +        /* Major dimension */
 +        overlap = &pme->overlap[1];
 +
 +        if (pme->nnodes_major > 1)
 +        {
 +            size_yx = pme->overlap[0].comm_data[0].send_nindex;
 +        }
 +        else
 +        {
 +            size_yx = 0;
 +        }
 +        datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
 +
 +        send_size_y = overlap->send_size;
 +
 +        for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
 +        {
 +            send_id       = overlap->send_id[ipulse];
 +            recv_id       = overlap->recv_id[ipulse];
 +            send_index0   =
 +                overlap->comm_data[ipulse].send_index0 -
 +                overlap->comm_data[0].send_index0;
 +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            /* We don't use recv_index0, as we always receive starting at 0 */
 +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recv_size_y   = overlap->comm_data[ipulse].recv_size;
 +
 +            sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
 +            recvptr = overlap->recvbuf;
 +
++            if (debug != NULL)
++            {
++                fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
++                        local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
++            }
++
 +#ifdef GMX_MPI
 +            MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
 +                         send_id, ipulse,
 +                         recvptr, recv_size_y*datasize, GMX_MPI_REAL,
 +                         recv_id, ipulse,
 +                         overlap->mpi_comm, &stat);
 +#endif
 +
 +            for (x = 0; x < local_fft_ndata[XX]; x++)
 +            {
 +                for (y = 0; y < recv_nindex; y++)
 +                {
 +                    indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
 +                    indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
 +                    for (z = 0; z < local_fft_ndata[ZZ]; z++)
 +                    {
 +                        fftgrid[indg+z] += recvptr[indb+z];
 +                    }
 +                }
 +            }
 +
 +            if (pme->nnodes_major > 1)
 +            {
 +                /* Copy from the received buffer to the send buffer for dim 0 */
 +                sendptr = pme->overlap[0].sendbuf;
 +                for (x = 0; x < size_yx; x++)
 +                {
 +                    for (y = 0; y < recv_nindex; y++)
 +                    {
 +                        indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
 +                        indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
 +                        for (z = 0; z < local_fft_ndata[ZZ]; z++)
 +                        {
 +                            sendptr[indg+z] += recvptr[indb+z];
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* We only support a single pulse here.
 +     * This is not a severe limitation, as this code is only used
 +     * with OpenMP and with OpenMP the (PME) domains can be larger.
 +     */
 +    if (pme->nnodes_major > 1)
 +    {
 +        /* Major dimension */
 +        overlap = &pme->overlap[0];
 +
 +        datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
 +        gridsize = local_fft_size[YY] *local_fft_size[ZZ];
 +
 +        ipulse = 0;
 +
 +        send_id       = overlap->send_id[ipulse];
 +        recv_id       = overlap->recv_id[ipulse];
 +        send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        /* We don't use recv_index0, as we always receive starting at 0 */
 +        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +
 +        sendptr = overlap->sendbuf;
 +        recvptr = overlap->recvbuf;
 +
 +        if (debug != NULL)
 +        {
++            fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
 +                    send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
 +        }
 +
 +#ifdef GMX_MPI
 +        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
 +                     send_id, ipulse,
 +                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
 +                     recv_id, ipulse,
 +                     overlap->mpi_comm, &stat);
 +#endif
 +
 +        for (x = 0; x < recv_nindex; x++)
 +        {
 +            for (y = 0; y < local_fft_ndata[YY]; y++)
 +            {
 +                indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
 +                indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
 +                for (z = 0; z < local_fft_ndata[ZZ]; z++)
 +                {
 +                    fftgrid[indg+z] += recvptr[indb+z];
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void spread_on_grid(gmx_pme_t pme,
 +                           pme_atomcomm_t *atc, pmegrids_t *grids,
 +                           gmx_bool bCalcSplines, gmx_bool bSpread,
 +                           real *fftgrid, gmx_bool bDoSplines, int grid_index)
 +{
 +    int nthread, thread;
 +#ifdef PME_TIME_THREADS
 +    gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
 +    static double cs1     = 0, cs2 = 0, cs3 = 0;
 +    static double cs1a[6] = {0, 0, 0, 0, 0, 0};
 +    static int cnt        = 0;
 +#endif
 +
 +    nthread = pme->nthread;
 +    assert(nthread > 0);
 +
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_start();
 +#endif
 +    if (bCalcSplines)
 +    {
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +        for (thread = 0; thread < nthread; thread++)
 +        {
 +            int start, end;
 +
 +            start = atc->n* thread   /nthread;
 +            end   = atc->n*(thread+1)/nthread;
 +
 +            /* Compute fftgrid index for all atoms,
 +             * with help of some extra variables.
 +             */
 +            calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
 +        }
 +    }
 +#ifdef PME_TIME_THREADS
 +    c1   = omp_cyc_end(c1);
 +    cs1 += (double)c1;
 +#endif
 +
 +#ifdef PME_TIME_THREADS
 +    c2 = omp_cyc_start();
 +#endif
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        splinedata_t *spline;
 +        pmegrid_t *grid = NULL;
 +
 +        /* make local bsplines  */
 +        if (grids == NULL || !pme->bUseThreads)
 +        {
 +            spline = &atc->spline[0];
 +
 +            spline->n = atc->n;
 +
 +            if (bSpread)
 +            {
 +                grid = &grids->grid;
 +            }
 +        }
 +        else
 +        {
 +            spline = &atc->spline[thread];
 +
 +            if (grids->nthread == 1)
 +            {
 +                /* One thread, we operate on all coefficients */
 +                spline->n = atc->n;
 +            }
 +            else
 +            {
 +                /* Get the indices our thread should operate on */
 +                make_thread_local_ind(atc, thread, spline);
 +            }
 +
 +            grid = &grids->grid_th[thread];
 +        }
 +
 +        if (bCalcSplines)
 +        {
 +            make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
 +                          atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
 +        }
 +
 +        if (bSpread)
 +        {
 +            /* put local atoms on grid. */
 +#ifdef PME_TIME_SPREAD
 +            ct1a = omp_cyc_start();
 +#endif
 +            spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
 +
 +            if (pme->bUseThreads)
 +            {
 +                copy_local_grid(pme, grids, grid_index, thread, fftgrid);
 +            }
 +#ifdef PME_TIME_SPREAD
 +            ct1a          = omp_cyc_end(ct1a);
 +            cs1a[thread] += (double)ct1a;
 +#endif
 +        }
 +    }
 +#ifdef PME_TIME_THREADS
 +    c2   = omp_cyc_end(c2);
 +    cs2 += (double)c2;
 +#endif
 +
 +    if (bSpread && pme->bUseThreads)
 +    {
 +#ifdef PME_TIME_THREADS
 +        c3 = omp_cyc_start();
 +#endif
 +#pragma omp parallel for num_threads(grids->nthread) schedule(static)
 +        for (thread = 0; thread < grids->nthread; thread++)
 +        {
 +            reduce_threadgrid_overlap(pme, grids, thread,
 +                                      fftgrid,
 +                                      pme->overlap[0].sendbuf,
 +                                      pme->overlap[1].sendbuf,
 +                                      grid_index);
 +        }
 +#ifdef PME_TIME_THREADS
 +        c3   = omp_cyc_end(c3);
 +        cs3 += (double)c3;
 +#endif
 +
 +        if (pme->nnodes > 1)
 +        {
 +            /* Communicate the overlapping part of the fftgrid.
 +             * For this communication call we need to check pme->bUseThreads
 +             * to have all ranks communicate here, regardless of pme->nthread.
 +             */
 +            sum_fftgrid_dd(pme, fftgrid, grid_index);
 +        }
 +    }
 +
 +#ifdef PME_TIME_THREADS
 +    cnt++;
 +    if (cnt % 20 == 0)
 +    {
 +        printf("idx %.2f spread %.2f red %.2f",
 +               cs1*1e-9, cs2*1e-9, cs3*1e-9);
 +#ifdef PME_TIME_SPREAD
 +        for (thread = 0; thread < nthread; thread++)
 +        {
 +            printf(" %.2f", cs1a[thread]*1e-9);
 +        }
 +#endif
 +        printf("\n");
 +    }
 +#endif
 +}
 +
 +
 +static void dump_grid(FILE *fp,
 +                      int sx, int sy, int sz, int nx, int ny, int nz,
 +                      int my, int mz, const real *g)
 +{
 +    int x, y, z;
 +
 +    for (x = 0; x < nx; x++)
 +    {
 +        for (y = 0; y < ny; y++)
 +        {
 +            for (z = 0; z < nz; z++)
 +            {
 +                fprintf(fp, "%2d %2d %2d %6.3f\n",
 +                        sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
 +            }
 +        }
 +    }
 +}
 +
 +static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
 +{
 +    ivec local_fft_ndata, local_fft_offset, local_fft_size;
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    dump_grid(stderr,
 +              pme->pmegrid_start_ix,
 +              pme->pmegrid_start_iy,
 +              pme->pmegrid_start_iz,
 +              pme->pmegrid_nx-pme->pme_order+1,
 +              pme->pmegrid_ny-pme->pme_order+1,
 +              pme->pmegrid_nz-pme->pme_order+1,
 +              local_fft_size[YY],
 +              local_fft_size[ZZ],
 +              fftgrid);
 +}
 +
 +
 +void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
 +{
 +    pme_atomcomm_t *atc;
 +    pmegrids_t *grid;
 +
 +    if (pme->nnodes > 1)
 +    {
 +        gmx_incons("gmx_pme_calc_energy called in parallel");
 +    }
 +    if (pme->bFEP_q > 1)
 +    {
 +        gmx_incons("gmx_pme_calc_energy with free energy");
 +    }
 +
 +    atc            = &pme->atc_energy;
 +    atc->nthread   = 1;
 +    if (atc->spline == NULL)
 +    {
 +        snew(atc->spline, atc->nthread);
 +    }
 +    atc->nslab     = 1;
 +    atc->bSpread   = TRUE;
 +    atc->pme_order = pme->pme_order;
 +    atc->n         = n;
 +    pme_realloc_atomcomm_things(atc);
 +    atc->x           = x;
 +    atc->coefficient = q;
 +
 +    /* We only use the A-charges grid */
 +    grid = &pme->pmegrid[PME_GRID_QA];
 +
 +    /* Only calculate the spline coefficients, don't actually spread */
 +    spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
 +
 +    *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
 +}
 +
 +
 +static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
 +                                   gmx_walltime_accounting_t walltime_accounting,
 +                                   t_nrnb *nrnb, t_inputrec *ir,
 +                                   gmx_int64_t step)
 +{
 +    /* Reset all the counters related to performance over the run */
 +    wallcycle_stop(wcycle, ewcRUN);
 +    wallcycle_reset_all(wcycle);
 +    init_nrnb(nrnb);
 +    if (ir->nsteps >= 0)
 +    {
 +        /* ir->nsteps is not used here, but we update it for consistency */
 +        ir->nsteps -= step - ir->init_step;
 +    }
 +    ir->init_step = step;
 +    wallcycle_start(wcycle, ewcRUN);
 +    walltime_accounting_start(walltime_accounting);
 +}
 +
 +
 +static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
 +                               ivec grid_size,
 +                               t_commrec *cr, t_inputrec *ir,
 +                               gmx_pme_t *pme_ret)
 +{
 +    int ind;
 +    gmx_pme_t pme = NULL;
 +
 +    ind = 0;
 +    while (ind < *npmedata)
 +    {
 +        pme = (*pmedata)[ind];
 +        if (pme->nkx == grid_size[XX] &&
 +            pme->nky == grid_size[YY] &&
 +            pme->nkz == grid_size[ZZ])
 +        {
 +            *pme_ret = pme;
 +
 +            return;
 +        }
 +
 +        ind++;
 +    }
 +
 +    (*npmedata)++;
 +    srenew(*pmedata, *npmedata);
 +
 +    /* Generate a new PME data structure, copying part of the old pointers */
 +    gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
 +
 +    *pme_ret = (*pmedata)[ind];
 +}
 +
 +int gmx_pmeonly(gmx_pme_t pme,
 +                t_commrec *cr,    t_nrnb *mynrnb,
 +                gmx_wallcycle_t wcycle,
 +                gmx_walltime_accounting_t walltime_accounting,
 +                real ewaldcoeff_q, real ewaldcoeff_lj,
 +                t_inputrec *ir)
 +{
 +    int npmedata;
 +    gmx_pme_t *pmedata;
 +    gmx_pme_pp_t pme_pp;
 +    int  ret;
 +    int  natoms;
 +    matrix box;
 +    rvec *x_pp      = NULL, *f_pp = NULL;
 +    real *chargeA   = NULL, *chargeB = NULL;
 +    real *c6A       = NULL, *c6B = NULL;
 +    real *sigmaA    = NULL, *sigmaB = NULL;
 +    real lambda_q   = 0;
 +    real lambda_lj  = 0;
 +    int  maxshift_x = 0, maxshift_y = 0;
 +    real energy_q, energy_lj, dvdlambda_q, dvdlambda_lj;
 +    matrix vir_q, vir_lj;
 +    float cycles;
 +    int  count;
 +    gmx_bool bEnerVir;
 +    int pme_flags;
 +    gmx_int64_t step, step_rel;
 +    ivec grid_switch;
 +
 +    /* This data will only use with PME tuning, i.e. switching PME grids */
 +    npmedata = 1;
 +    snew(pmedata, npmedata);
 +    pmedata[0] = pme;
 +
 +    pme_pp = gmx_pme_pp_init(cr);
 +
 +    init_nrnb(mynrnb);
 +
 +    count = 0;
 +    do /****** this is a quasi-loop over time steps! */
 +    {
 +        /* The reason for having a loop here is PME grid tuning/switching */
 +        do
 +        {
 +            /* Domain decomposition */
 +            ret = gmx_pme_recv_coeffs_coords(pme_pp,
 +                                             &natoms,
 +                                             &chargeA, &chargeB,
 +                                             &c6A, &c6B,
 +                                             &sigmaA, &sigmaB,
 +                                             box, &x_pp, &f_pp,
 +                                             &maxshift_x, &maxshift_y,
 +                                             &pme->bFEP_q, &pme->bFEP_lj,
 +                                             &lambda_q, &lambda_lj,
 +                                             &bEnerVir,
 +                                             &pme_flags,
 +                                             &step,
 +                                             grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj);
 +
 +            if (ret == pmerecvqxSWITCHGRID)
 +            {
 +                /* Switch the PME grid to grid_switch */
 +                gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
 +            }
 +
 +            if (ret == pmerecvqxRESETCOUNTERS)
 +            {
 +                /* Reset the cycle and flop counters */
 +                reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, ir, step);
 +            }
 +        }
 +        while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
 +
 +        if (ret == pmerecvqxFINISH)
 +        {
 +            /* We should stop: break out of the loop */
 +            break;
 +        }
 +
 +        step_rel = step - ir->init_step;
 +
 +        if (count == 0)
 +        {
 +            wallcycle_start(wcycle, ewcRUN);
 +            walltime_accounting_start(walltime_accounting);
 +        }
 +
 +        wallcycle_start(wcycle, ewcPMEMESH);
 +
 +        dvdlambda_q  = 0;
 +        dvdlambda_lj = 0;
 +        clear_mat(vir_q);
 +        clear_mat(vir_lj);
 +
 +        gmx_pme_do(pme, 0, natoms, x_pp, f_pp,
 +                   chargeA, chargeB, c6A, c6B, sigmaA, sigmaB, box,
 +                   cr, maxshift_x, maxshift_y, mynrnb, wcycle,
 +                   vir_q, ewaldcoeff_q, vir_lj, ewaldcoeff_lj,
 +                   &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
 +                   pme_flags | GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
 +
 +        cycles = wallcycle_stop(wcycle, ewcPMEMESH);
 +
 +        gmx_pme_send_force_vir_ener(pme_pp,
 +                                    f_pp, vir_q, energy_q, vir_lj, energy_lj,
 +                                    dvdlambda_q, dvdlambda_lj, cycles);
 +
 +        count++;
 +    } /***** end of quasi-loop, we stop with the break above */
 +    while (TRUE);
 +
 +    walltime_accounting_end(walltime_accounting);
 +
 +    return 0;
 +}
 +
 +static void
 +calc_initial_lb_coeffs(gmx_pme_t pme, real *local_c6, real *local_sigma)
 +{
 +    int  i;
 +
 +    for (i = 0; i < pme->atc[0].n; ++i)
 +    {
 +        real sigma4;
 +
 +        sigma4                     = local_sigma[i];
 +        sigma4                     = sigma4*sigma4;
 +        sigma4                     = sigma4*sigma4;
 +        pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
 +    }
 +}
 +
 +static void
 +calc_next_lb_coeffs(gmx_pme_t pme, real *local_sigma)
 +{
 +    int  i;
 +
 +    for (i = 0; i < pme->atc[0].n; ++i)
 +    {
 +        pme->atc[0].coefficient[i] *= local_sigma[i];
 +    }
 +}
 +
 +static void
 +do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
 +                     gmx_bool bFirst, rvec x[], real *data)
 +{
 +    int      d;
 +    pme_atomcomm_t *atc;
 +    atc = &pme->atc[0];
 +
 +    for (d = pme->ndecompdim - 1; d >= 0; d--)
 +    {
 +        int             n_d;
 +        rvec           *x_d;
 +        real           *param_d;
 +
 +        if (d == pme->ndecompdim - 1)
 +        {
 +            n_d     = homenr;
 +            x_d     = x + start;
 +            param_d = data;
 +        }
 +        else
 +        {
 +            n_d     = pme->atc[d + 1].n;
 +            x_d     = atc->x;
 +            param_d = atc->coefficient;
 +        }
 +        atc      = &pme->atc[d];
 +        atc->npd = n_d;
 +        if (atc->npd > atc->pd_nalloc)
 +        {
 +            atc->pd_nalloc = over_alloc_dd(atc->npd);
 +            srenew(atc->pd, atc->pd_nalloc);
 +        }
 +        pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
 +        where();
 +        /* Redistribute x (only once) and qA/c6A or qB/c6B */
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc);
 +        }
 +    }
 +}
 +
 +int gmx_pme_do(gmx_pme_t pme,
 +               int start,       int homenr,
 +               rvec x[],        rvec f[],
 +               real *chargeA,   real *chargeB,
 +               real *c6A,       real *c6B,
 +               real *sigmaA,    real *sigmaB,
 +               matrix box, t_commrec *cr,
 +               int  maxshift_x, int maxshift_y,
 +               t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
 +               matrix vir_q,      real ewaldcoeff_q,
 +               matrix vir_lj,   real ewaldcoeff_lj,
 +               real *energy_q,  real *energy_lj,
 +               real lambda_q, real lambda_lj,
 +               real *dvdlambda_q, real *dvdlambda_lj,
 +               int flags)
 +{
 +    int     d, i, j, k, ntot, npme, grid_index, max_grid_index;
 +    int     nx, ny, nz;
 +    int     n_d, local_ny;
 +    pme_atomcomm_t *atc = NULL;
 +    pmegrids_t *pmegrid = NULL;
 +    real    *grid       = NULL;
 +    real    *ptr;
 +    rvec    *x_d, *f_d;
 +    real    *coefficient = NULL;
 +    real    energy_AB[4];
 +    matrix  vir_AB[4];
 +    real    scale, lambda;
 +    gmx_bool bClearF;
 +    gmx_parallel_3dfft_t pfft_setup;
 +    real *  fftgrid;
 +    t_complex * cfftgrid;
 +    int     thread;
 +    gmx_bool bFirst, bDoSplines;
 +    int fep_state;
 +    int fep_states_lj           = pme->bFEP_lj ? 2 : 1;
 +    const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
 +    const gmx_bool bCalcF       = flags & GMX_PME_CALC_F;
 +
 +    assert(pme->nnodes > 0);
 +    assert(pme->nnodes == 1 || pme->ndecompdim > 0);
 +
 +    if (pme->nnodes > 1)
 +    {
 +        atc      = &pme->atc[0];
 +        atc->npd = homenr;
 +        if (atc->npd > atc->pd_nalloc)
 +        {
 +            atc->pd_nalloc = over_alloc_dd(atc->npd);
 +            srenew(atc->pd, atc->pd_nalloc);
 +        }
 +        for (d = pme->ndecompdim-1; d >= 0; d--)
 +        {
 +            atc           = &pme->atc[d];
 +            atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
 +        }
 +    }
 +    else
 +    {
 +        atc = &pme->atc[0];
 +        /* This could be necessary for TPI */
 +        pme->atc[0].n = homenr;
 +        if (DOMAINDECOMP(cr))
 +        {
 +            pme_realloc_atomcomm_things(atc);
 +        }
 +        atc->x = x;
 +        atc->f = f;
 +    }
 +
 +    m_inv_ur0(box, pme->recipbox);
 +    bFirst = TRUE;
 +
 +    /* For simplicity, we construct the splines for all particles if
 +     * more than one PME calculations is needed. Some optimization
 +     * could be done by keeping track of which atoms have splines
 +     * constructed, and construct new splines on each pass for atoms
 +     * that don't yet have them.
 +     */
 +
 +    bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
 +
 +    /* We need a maximum of four separate PME calculations:
 +     * grid_index=0: Coulomb PME with charges from state A
 +     * grid_index=1: Coulomb PME with charges from state B
 +     * grid_index=2: LJ PME with C6 from state A
 +     * grid_index=3: LJ PME with C6 from state B
 +     * For Lorentz-Berthelot combination rules, a separate loop is used to
 +     * calculate all the terms
 +     */
 +
 +    /* If we are doing LJ-PME with LB, we only do Q here */
 +    max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
 +
 +    for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
 +    {
 +        /* Check if we should do calculations at this grid_index
 +         * If grid_index is odd we should be doing FEP
 +         * If grid_index < 2 we should be doing electrostatic PME
 +         * If grid_index >= 2 we should be doing LJ-PME
 +         */
 +        if ((grid_index <  DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
 +                                    (grid_index == 1 && !pme->bFEP_q))) ||
 +            (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
 +                                    (grid_index == 3 && !pme->bFEP_lj))))
 +        {
 +            continue;
 +        }
 +        /* Unpack structure */
 +        pmegrid    = &pme->pmegrid[grid_index];
 +        fftgrid    = pme->fftgrid[grid_index];
 +        cfftgrid   = pme->cfftgrid[grid_index];
 +        pfft_setup = pme->pfft_setup[grid_index];
 +        switch (grid_index)
 +        {
 +            case 0: coefficient = chargeA + start; break;
 +            case 1: coefficient = chargeB + start; break;
 +            case 2: coefficient = c6A + start; break;
 +            case 3: coefficient = c6B + start; break;
 +        }
 +
 +        grid = pmegrid->grid.grid;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
 +                    cr->nnodes, cr->nodeid);
 +            fprintf(debug, "Grid = %p\n", (void*)grid);
 +            if (grid == NULL)
 +            {
 +                gmx_fatal(FARGS, "No grid!");
 +            }
 +        }
 +        where();
 +
 +        if (pme->nnodes == 1)
 +        {
 +            atc->coefficient = coefficient;
 +        }
 +        else
 +        {
 +            wallcycle_start(wcycle, ewcPME_REDISTXF);
 +            do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
 +            where();
 +
 +            wallcycle_stop(wcycle, ewcPME_REDISTXF);
 +        }
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
 +                    cr->nodeid, atc->n);
 +        }
 +
 +        if (flags & GMX_PME_SPREAD)
 +        {
 +            wallcycle_start(wcycle, ewcPME_SPREADGATHER);
 +
 +            /* Spread the coefficients on a grid */
 +            spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
 +
 +            if (bFirst)
 +            {
 +                inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
 +            }
 +            inc_nrnb(nrnb, eNR_SPREADBSP,
 +                     pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
 +
 +            if (!pme->bUseThreads)
 +            {
 +                wrap_periodic_pmegrid(pme, grid);
 +
 +                /* sum contributions to local grid from other nodes */
 +#ifdef GMX_MPI
 +                if (pme->nnodes > 1)
 +                {
 +                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
 +                    where();
 +                }
 +#endif
 +
 +                copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
 +            }
 +
 +            wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
 +
 +            /*
 +               dump_local_fftgrid(pme,fftgrid);
 +               exit(0);
 +             */
 +        }
 +
 +        /* Here we start a large thread parallel region */
 +#pragma omp parallel num_threads(pme->nthread) private(thread)
 +        {
 +            thread = gmx_omp_get_thread_num();
 +            if (flags & GMX_PME_SOLVE)
 +            {
 +                int loop_count;
 +
 +                /* do 3d-fft */
 +                if (thread == 0)
 +                {
 +                    wallcycle_start(wcycle, ewcPME_FFT);
 +                }
 +                gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
 +                                           thread, wcycle);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle, ewcPME_FFT);
 +                }
 +                where();
 +
 +                /* solve in k-space for our local cells */
 +                if (thread == 0)
 +                {
 +                    wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
 +                }
 +                if (grid_index < DO_Q)
 +                {
 +                    loop_count =
 +                        solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
 +                                      box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
 +                                      bCalcEnerVir,
 +                                      pme->nthread, thread);
 +                }
 +                else
 +                {
 +                    loop_count =
 +                        solve_pme_lj_yzx(pme, &cfftgrid, FALSE, ewaldcoeff_lj,
 +                                         box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
 +                                         bCalcEnerVir,
 +                                         pme->nthread, thread);
 +                }
 +
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
 +                    where();
 +                    inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
 +                }
 +            }
 +
 +            if (bCalcF)
 +            {
 +                /* do 3d-invfft */
 +                if (thread == 0)
 +                {
 +                    where();
 +                    wallcycle_start(wcycle, ewcPME_FFT);
 +                }
 +                gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
 +                                           thread, wcycle);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle, ewcPME_FFT);
 +
 +                    where();
 +
 +                    if (pme->nodeid == 0)
 +                    {
 +                        ntot  = pme->nkx*pme->nky*pme->nkz;
 +                        npme  = ntot*log((real)ntot)/log(2.0);
 +                        inc_nrnb(nrnb, eNR_FFT, 2*npme);
 +                    }
 +
 +                    /* Note: this wallcycle region is closed below
 +                       outside an OpenMP region, so take care if
 +                       refactoring code here. */
 +                    wallcycle_start(wcycle, ewcPME_SPREADGATHER);
 +                }
 +
 +                copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
 +            }
 +        }
 +        /* End of thread parallel section.
 +         * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
 +         */
 +
 +        if (bCalcF)
 +        {
 +            /* distribute local grid to all nodes */
 +#ifdef GMX_MPI
 +            if (pme->nnodes > 1)
 +            {
 +                gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
 +            }
 +#endif
 +            where();
 +
 +            unwrap_periodic_pmegrid(pme, grid);
 +
 +            /* interpolate forces for our local atoms */
 +
 +            where();
 +
 +            /* If we are running without parallelization,
 +             * atc->f is the actual force array, not a buffer,
 +             * therefore we should not clear it.
 +             */
 +            lambda  = grid_index < DO_Q ? lambda_q : lambda_lj;
 +            bClearF = (bFirst && PAR(cr));
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +            for (thread = 0; thread < pme->nthread; thread++)
 +            {
 +                gather_f_bsplines(pme, grid, bClearF, atc,
 +                                  &atc->spline[thread],
 +                                  pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
 +            }
 +
 +            where();
 +
 +            inc_nrnb(nrnb, eNR_GATHERFBSP,
 +                     pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
 +            /* Note: this wallcycle region is opened above inside an OpenMP
 +               region, so take care if refactoring code here. */
 +            wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
 +        }
 +
 +        if (bCalcEnerVir)
 +        {
 +            /* This should only be called on the master thread
 +             * and after the threads have synchronized.
 +             */
 +            if (grid_index < 2)
 +            {
 +                get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
 +            }
 +            else
 +            {
 +                get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
 +            }
 +        }
 +        bFirst = FALSE;
 +    } /* of grid_index-loop */
 +
 +    /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
 +     * seven terms. */
 +
 +    if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
 +    {
 +        /* Loop over A- and B-state if we are doing FEP */
 +        for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
 +        {
 +            real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
 +            if (pme->nnodes == 1)
 +            {
 +                if (pme->lb_buf1 == NULL)
 +                {
 +                    pme->lb_buf_nalloc = pme->atc[0].n;
 +                    snew(pme->lb_buf1, pme->lb_buf_nalloc);
 +                }
 +                pme->atc[0].coefficient = pme->lb_buf1;
 +                switch (fep_state)
 +                {
 +                    case 0:
 +                        local_c6      = c6A;
 +                        local_sigma   = sigmaA;
 +                        break;
 +                    case 1:
 +                        local_c6      = c6B;
 +                        local_sigma   = sigmaB;
 +                        break;
 +                    default:
 +                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
 +                }
 +            }
 +            else
 +            {
 +                atc = &pme->atc[0];
 +                switch (fep_state)
 +                {
 +                    case 0:
 +                        RedistC6      = c6A;
 +                        RedistSigma   = sigmaA;
 +                        break;
 +                    case 1:
 +                        RedistC6      = c6B;
 +                        RedistSigma   = sigmaB;
 +                        break;
 +                    default:
 +                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
 +                }
 +                wallcycle_start(wcycle, ewcPME_REDISTXF);
 +
 +                do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
 +                if (pme->lb_buf_nalloc < atc->n)
 +                {
 +                    pme->lb_buf_nalloc = atc->nalloc;
 +                    srenew(pme->lb_buf1, pme->lb_buf_nalloc);
 +                    srenew(pme->lb_buf2, pme->lb_buf_nalloc);
 +                }
 +                local_c6 = pme->lb_buf1;
 +                for (i = 0; i < atc->n; ++i)
 +                {
 +                    local_c6[i] = atc->coefficient[i];
 +                }
 +                where();
 +
 +                do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
 +                local_sigma = pme->lb_buf2;
 +                for (i = 0; i < atc->n; ++i)
 +                {
 +                    local_sigma[i] = atc->coefficient[i];
 +                }
 +                where();
 +
 +                wallcycle_stop(wcycle, ewcPME_REDISTXF);
 +            }
 +            calc_initial_lb_coeffs(pme, local_c6, local_sigma);
 +
 +            /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
 +            for (grid_index = 2; grid_index < 9; ++grid_index)
 +            {
 +                /* Unpack structure */
 +                pmegrid    = &pme->pmegrid[grid_index];
 +                fftgrid    = pme->fftgrid[grid_index];
 +                cfftgrid   = pme->cfftgrid[grid_index];
 +                pfft_setup = pme->pfft_setup[grid_index];
 +                calc_next_lb_coeffs(pme, local_sigma);
 +                grid = pmegrid->grid.grid;
 +                where();
 +
 +                if (flags & GMX_PME_SPREAD)
 +                {
 +                    wallcycle_start(wcycle, ewcPME_SPREADGATHER);
 +                    /* Spread the c6 on a grid */
 +                    spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
 +
 +                    if (bFirst)
 +                    {
 +                        inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
 +                    }
 +
 +                    inc_nrnb(nrnb, eNR_SPREADBSP,
 +                             pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
 +                    if (pme->nthread == 1)
 +                    {
 +                        wrap_periodic_pmegrid(pme, grid);
 +                        /* sum contributions to local grid from other nodes */
 +#ifdef GMX_MPI
 +                        if (pme->nnodes > 1)
 +                        {
 +                            gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
 +                            where();
 +                        }
 +#endif
 +                        copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
 +                    }
 +                    wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
 +                }
 +                /*Here we start a large thread parallel region*/
 +#pragma omp parallel num_threads(pme->nthread) private(thread)
 +                {
 +                    thread = gmx_omp_get_thread_num();
 +                    if (flags & GMX_PME_SOLVE)
 +                    {
 +                        /* do 3d-fft */
 +                        if (thread == 0)
 +                        {
 +                            wallcycle_start(wcycle, ewcPME_FFT);
 +                        }
 +
 +                        gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
 +                                                   thread, wcycle);
 +                        if (thread == 0)
 +                        {
 +                            wallcycle_stop(wcycle, ewcPME_FFT);
 +                        }
 +                        where();
 +                    }
 +                }
 +                bFirst = FALSE;
 +            }
 +            if (flags & GMX_PME_SOLVE)
 +            {
 +                /* solve in k-space for our local cells */
 +#pragma omp parallel num_threads(pme->nthread) private(thread)
 +                {
 +                    int loop_count;
 +                    thread = gmx_omp_get_thread_num();
 +                    if (thread == 0)
 +                    {
 +                        wallcycle_start(wcycle, ewcLJPME);
 +                    }
 +
 +                    loop_count =
 +                        solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
 +                                         box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
 +                                         bCalcEnerVir,
 +                                         pme->nthread, thread);
 +                    if (thread == 0)
 +                    {
 +                        wallcycle_stop(wcycle, ewcLJPME);
 +                        where();
 +                        inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
 +                    }
 +                }
 +            }
 +
 +            if (bCalcEnerVir)
 +            {
 +                /* This should only be called on the master thread and
 +                 * after the threads have synchronized.
 +                 */
 +                get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
 +            }
 +
 +            if (bCalcF)
 +            {
 +                bFirst = !(flags & GMX_PME_DO_COULOMB);
 +                calc_initial_lb_coeffs(pme, local_c6, local_sigma);
 +                for (grid_index = 8; grid_index >= 2; --grid_index)
 +                {
 +                    /* Unpack structure */
 +                    pmegrid    = &pme->pmegrid[grid_index];
 +                    fftgrid    = pme->fftgrid[grid_index];
 +                    cfftgrid   = pme->cfftgrid[grid_index];
 +                    pfft_setup = pme->pfft_setup[grid_index];
 +                    grid       = pmegrid->grid.grid;
 +                    calc_next_lb_coeffs(pme, local_sigma);
 +                    where();
 +#pragma omp parallel num_threads(pme->nthread) private(thread)
 +                    {
 +                        thread = gmx_omp_get_thread_num();
 +                        /* do 3d-invfft */
 +                        if (thread == 0)
 +                        {
 +                            where();
 +                            wallcycle_start(wcycle, ewcPME_FFT);
 +                        }
 +
 +                        gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
 +                                                   thread, wcycle);
 +                        if (thread == 0)
 +                        {
 +                            wallcycle_stop(wcycle, ewcPME_FFT);
 +
 +                            where();
 +
 +                            if (pme->nodeid == 0)
 +                            {
 +                                ntot  = pme->nkx*pme->nky*pme->nkz;
 +                                npme  = ntot*log((real)ntot)/log(2.0);
 +                                inc_nrnb(nrnb, eNR_FFT, 2*npme);
 +                            }
 +                            wallcycle_start(wcycle, ewcPME_SPREADGATHER);
 +                        }
 +
 +                        copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
 +
 +                    } /*#pragma omp parallel*/
 +
 +                    /* distribute local grid to all nodes */
 +#ifdef GMX_MPI
 +                    if (pme->nnodes > 1)
 +                    {
 +                        gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
 +                    }
 +#endif
 +                    where();
 +
 +                    unwrap_periodic_pmegrid(pme, grid);
 +
 +                    /* interpolate forces for our local atoms */
 +                    where();
 +                    bClearF = (bFirst && PAR(cr));
 +                    scale   = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
 +                    scale  *= lb_scale_factor[grid_index-2];
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +                    for (thread = 0; thread < pme->nthread; thread++)
 +                    {
 +                        gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
 +                                          &pme->atc[0].spline[thread],
 +                                          scale);
 +                    }
 +                    where();
 +
 +                    inc_nrnb(nrnb, eNR_GATHERFBSP,
 +                             pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
 +                    wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
 +
 +                    bFirst = FALSE;
 +                } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
 +            }     /* if (bCalcF) */
 +        }         /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
 +    }             /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
 +
 +    if (bCalcF && pme->nnodes > 1)
 +    {
 +        wallcycle_start(wcycle, ewcPME_REDISTXF);
 +        for (d = 0; d < pme->ndecompdim; d++)
 +        {
 +            atc = &pme->atc[d];
 +            if (d == pme->ndecompdim - 1)
 +            {
 +                n_d = homenr;
 +                f_d = f + start;
 +            }
 +            else
 +            {
 +                n_d = pme->atc[d+1].n;
 +                f_d = pme->atc[d+1].f;
 +            }
 +            if (DOMAINDECOMP(cr))
 +            {
 +                dd_pmeredist_f(pme, atc, n_d, f_d,
 +                               d == pme->ndecompdim-1 && pme->bPPnode);
 +            }
 +        }
 +
 +        wallcycle_stop(wcycle, ewcPME_REDISTXF);
 +    }
 +    where();
 +
 +    if (bCalcEnerVir)
 +    {
 +        if (flags & GMX_PME_DO_COULOMB)
 +        {
 +            if (!pme->bFEP_q)
 +            {
 +                *energy_q = energy_AB[0];
 +                m_add(vir_q, vir_AB[0], vir_q);
 +            }
 +            else
 +            {
 +                *energy_q       = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
 +                *dvdlambda_q   += energy_AB[1] - energy_AB[0];
 +                for (i = 0; i < DIM; i++)
 +                {
 +                    for (j = 0; j < DIM; j++)
 +                    {
 +                        vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
 +                            lambda_q*vir_AB[1][i][j];
 +                    }
 +                }
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
 +            }
 +        }
 +        else
 +        {
 +            *energy_q = 0;
 +        }
 +
 +        if (flags & GMX_PME_DO_LJ)
 +        {
 +            if (!pme->bFEP_lj)
 +            {
 +                *energy_lj = energy_AB[2];
 +                m_add(vir_lj, vir_AB[2], vir_lj);
 +            }
 +            else
 +            {
 +                *energy_lj     = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
 +                *dvdlambda_lj += energy_AB[3] - energy_AB[2];
 +                for (i = 0; i < DIM; i++)
 +                {
 +                    for (j = 0; j < DIM; j++)
 +                    {
 +                        vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
 +                    }
 +                }
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
 +            }
 +        }
 +        else
 +        {
 +            *energy_lj = 0;
 +        }
 +    }
 +    return 0;
 +}
index 7ad217f4a5cae44f14ecd28358ec664185f51b3e,0000000000000000000000000000000000000000..c63d8aa756367a5ec7b87c15b879b5a5ca570bfc
mode 100644,000000..100644
--- /dev/null
@@@ -1,2036 -1,0 +1,2060 @@@
-             if ((interaction_function[ftype].flags & IF_VSITE) &&
-                 ftype != F_VSITEN)
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team.
 + * Copyright (c) 2013,2014, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include "typedefs.h"
 +#include "vsite.h"
 +#include "macros.h"
 +#include "gromacs/utility/smalloc.h"
 +#include "nrnb.h"
 +#include "vec.h"
 +#include "network.h"
 +#include "mshift.h"
 +#include "pbc.h"
 +#include "domdec.h"
 +#include "mtop_util.h"
 +#include "gmx_omp_nthreads.h"
 +
 +#include "gromacs/utility/gmxomp.h"
 +
 +/* Routines to send/recieve coordinates and force
 + * of constructing atoms.
 + */
 +
 +static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
 +{
 +    if (pbc)
 +    {
 +        return pbc_dx_aiuc(pbc, xi, xj, dx);
 +    }
 +    else
 +    {
 +        rvec_sub(xi, xj, dx);
 +        return CENTRAL;
 +    }
 +}
 +
 +/* Vsite construction routines */
 +
 +static void constr_vsite2(rvec xi, rvec xj, rvec x, real a, t_pbc *pbc)
 +{
 +    real b;
 +    rvec dx;
 +
 +    b = 1.0-a;
 +    /* 1 flop */
 +
 +    if (pbc)
 +    {
 +        pbc_dx_aiuc(pbc, xj, xi, dx);
 +        x[XX] = xi[XX] + a*dx[XX];
 +        x[YY] = xi[YY] + a*dx[YY];
 +        x[ZZ] = xi[ZZ] + a*dx[ZZ];
 +    }
 +    else
 +    {
 +        x[XX] = b*xi[XX] + a*xj[XX];
 +        x[YY] = b*xi[YY] + a*xj[YY];
 +        x[ZZ] = b*xi[ZZ] + a*xj[ZZ];
 +        /* 9 Flops */
 +    }
 +
 +    /* TOTAL: 10 flops */
 +}
 +
 +static void constr_vsite3(rvec xi, rvec xj, rvec xk, rvec x, real a, real b,
 +                          t_pbc *pbc)
 +{
 +    real c;
 +    rvec dxj, dxk;
 +
 +    c = 1.0-a-b;
 +    /* 2 flops */
 +
 +    if (pbc)
 +    {
 +        pbc_dx_aiuc(pbc, xj, xi, dxj);
 +        pbc_dx_aiuc(pbc, xk, xi, dxk);
 +        x[XX] = xi[XX] + a*dxj[XX] + b*dxk[XX];
 +        x[YY] = xi[YY] + a*dxj[YY] + b*dxk[YY];
 +        x[ZZ] = xi[ZZ] + a*dxj[ZZ] + b*dxk[ZZ];
 +    }
 +    else
 +    {
 +        x[XX] = c*xi[XX] + a*xj[XX] + b*xk[XX];
 +        x[YY] = c*xi[YY] + a*xj[YY] + b*xk[YY];
 +        x[ZZ] = c*xi[ZZ] + a*xj[ZZ] + b*xk[ZZ];
 +        /* 15 Flops */
 +    }
 +
 +    /* TOTAL: 17 flops */
 +}
 +
 +static void constr_vsite3FD(rvec xi, rvec xj, rvec xk, rvec x, real a, real b,
 +                            t_pbc *pbc)
 +{
 +    rvec xij, xjk, temp;
 +    real c;
 +
 +    pbc_rvec_sub(pbc, xj, xi, xij);
 +    pbc_rvec_sub(pbc, xk, xj, xjk);
 +    /* 6 flops */
 +
 +    /* temp goes from i to a point on the line jk */
 +    temp[XX] = xij[XX] + a*xjk[XX];
 +    temp[YY] = xij[YY] + a*xjk[YY];
 +    temp[ZZ] = xij[ZZ] + a*xjk[ZZ];
 +    /* 6 flops */
 +
 +    c = b*gmx_invsqrt(iprod(temp, temp));
 +    /* 6 + 10 flops */
 +
 +    x[XX] = xi[XX] + c*temp[XX];
 +    x[YY] = xi[YY] + c*temp[YY];
 +    x[ZZ] = xi[ZZ] + c*temp[ZZ];
 +    /* 6 Flops */
 +
 +    /* TOTAL: 34 flops */
 +}
 +
 +static void constr_vsite3FAD(rvec xi, rvec xj, rvec xk, rvec x, real a, real b, t_pbc *pbc)
 +{
 +    rvec xij, xjk, xp;
 +    real a1, b1, c1, invdij;
 +
 +    pbc_rvec_sub(pbc, xj, xi, xij);
 +    pbc_rvec_sub(pbc, xk, xj, xjk);
 +    /* 6 flops */
 +
 +    invdij = gmx_invsqrt(iprod(xij, xij));
 +    c1     = invdij * invdij * iprod(xij, xjk);
 +    xp[XX] = xjk[XX] - c1*xij[XX];
 +    xp[YY] = xjk[YY] - c1*xij[YY];
 +    xp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 +    a1     = a*invdij;
 +    b1     = b*gmx_invsqrt(iprod(xp, xp));
 +    /* 45 */
 +
 +    x[XX] = xi[XX] + a1*xij[XX] + b1*xp[XX];
 +    x[YY] = xi[YY] + a1*xij[YY] + b1*xp[YY];
 +    x[ZZ] = xi[ZZ] + a1*xij[ZZ] + b1*xp[ZZ];
 +    /* 12 Flops */
 +
 +    /* TOTAL: 63 flops */
 +}
 +
 +static void constr_vsite3OUT(rvec xi, rvec xj, rvec xk, rvec x,
 +                             real a, real b, real c, t_pbc *pbc)
 +{
 +    rvec xij, xik, temp;
 +
 +    pbc_rvec_sub(pbc, xj, xi, xij);
 +    pbc_rvec_sub(pbc, xk, xi, xik);
 +    cprod(xij, xik, temp);
 +    /* 15 Flops */
 +
 +    x[XX] = xi[XX] + a*xij[XX] + b*xik[XX] + c*temp[XX];
 +    x[YY] = xi[YY] + a*xij[YY] + b*xik[YY] + c*temp[YY];
 +    x[ZZ] = xi[ZZ] + a*xij[ZZ] + b*xik[ZZ] + c*temp[ZZ];
 +    /* 18 Flops */
 +
 +    /* TOTAL: 33 flops */
 +}
 +
 +static void constr_vsite4FD(rvec xi, rvec xj, rvec xk, rvec xl, rvec x,
 +                            real a, real b, real c, t_pbc *pbc)
 +{
 +    rvec xij, xjk, xjl, temp;
 +    real d;
 +
 +    pbc_rvec_sub(pbc, xj, xi, xij);
 +    pbc_rvec_sub(pbc, xk, xj, xjk);
 +    pbc_rvec_sub(pbc, xl, xj, xjl);
 +    /* 9 flops */
 +
 +    /* temp goes from i to a point on the plane jkl */
 +    temp[XX] = xij[XX] + a*xjk[XX] + b*xjl[XX];
 +    temp[YY] = xij[YY] + a*xjk[YY] + b*xjl[YY];
 +    temp[ZZ] = xij[ZZ] + a*xjk[ZZ] + b*xjl[ZZ];
 +    /* 12 flops */
 +
 +    d = c*gmx_invsqrt(iprod(temp, temp));
 +    /* 6 + 10 flops */
 +
 +    x[XX] = xi[XX] + d*temp[XX];
 +    x[YY] = xi[YY] + d*temp[YY];
 +    x[ZZ] = xi[ZZ] + d*temp[ZZ];
 +    /* 6 Flops */
 +
 +    /* TOTAL: 43 flops */
 +}
 +
 +
 +static void constr_vsite4FDN(rvec xi, rvec xj, rvec xk, rvec xl, rvec x,
 +                             real a, real b, real c, t_pbc *pbc)
 +{
 +    rvec xij, xik, xil, ra, rb, rja, rjb, rm;
 +    real d;
 +
 +    pbc_rvec_sub(pbc, xj, xi, xij);
 +    pbc_rvec_sub(pbc, xk, xi, xik);
 +    pbc_rvec_sub(pbc, xl, xi, xil);
 +    /* 9 flops */
 +
 +    ra[XX] = a*xik[XX];
 +    ra[YY] = a*xik[YY];
 +    ra[ZZ] = a*xik[ZZ];
 +
 +    rb[XX] = b*xil[XX];
 +    rb[YY] = b*xil[YY];
 +    rb[ZZ] = b*xil[ZZ];
 +
 +    /* 6 flops */
 +
 +    rvec_sub(ra, xij, rja);
 +    rvec_sub(rb, xij, rjb);
 +    /* 6 flops */
 +
 +    cprod(rja, rjb, rm);
 +    /* 9 flops */
 +
 +    d = c*gmx_invsqrt(norm2(rm));
 +    /* 5+5+1 flops */
 +
 +    x[XX] = xi[XX] + d*rm[XX];
 +    x[YY] = xi[YY] + d*rm[YY];
 +    x[ZZ] = xi[ZZ] + d*rm[ZZ];
 +    /* 6 Flops */
 +
 +    /* TOTAL: 47 flops */
 +}
 +
 +
 +static int constr_vsiten(t_iatom *ia, t_iparams ip[],
 +                         rvec *x, t_pbc *pbc)
 +{
 +    rvec xs, x1, dx;
 +    dvec dsum;
 +    int  n3, av, ai, i;
 +    real a;
 +
 +    n3 = 3*ip[ia[0]].vsiten.n;
 +    av = ia[1];
 +    ai = ia[2];
 +    copy_rvec(x[ai], x1);
 +    clear_dvec(dsum);
 +    for (i = 3; i < n3; i += 3)
 +    {
 +        ai = ia[i+2];
 +        a  = ip[ia[i]].vsiten.a;
 +        if (pbc)
 +        {
 +            pbc_dx_aiuc(pbc, x[ai], x1, dx);
 +        }
 +        else
 +        {
 +            rvec_sub(x[ai], x1, dx);
 +        }
 +        dsum[XX] += a*dx[XX];
 +        dsum[YY] += a*dx[YY];
 +        dsum[ZZ] += a*dx[ZZ];
 +        /* 9 Flops */
 +    }
 +
 +    x[av][XX] = x1[XX] + dsum[XX];
 +    x[av][YY] = x1[YY] + dsum[YY];
 +    x[av][ZZ] = x1[ZZ] + dsum[ZZ];
 +
 +    return n3;
 +}
 +
 +
 +void construct_vsites_thread(gmx_vsite_t *vsite,
 +                             rvec x[],
 +                             real dt, rvec *v,
 +                             t_iparams ip[], t_ilist ilist[],
 +                             t_pbc *pbc_null)
 +{
 +    gmx_bool   bPBCAll;
 +    rvec       xpbc, xv, vv, dx;
 +    real       a1, b1, c1, inv_dt;
 +    int        i, inc, ii, nra, nr, tp, ftype;
 +    t_iatom    avsite, ai, aj, ak, al, pbc_atom;
 +    t_iatom   *ia;
 +    t_pbc     *pbc_null2;
 +    int       *vsite_pbc, ishift;
 +    rvec       reftmp, vtmp, rtmp;
 +
 +    if (v != NULL)
 +    {
 +        inv_dt = 1.0/dt;
 +    }
 +    else
 +    {
 +        inv_dt = 1.0;
 +    }
 +
 +    bPBCAll = (pbc_null != NULL && !vsite->bHaveChargeGroups);
 +
 +    pbc_null2 = NULL;
 +    vsite_pbc = NULL;
 +    for (ftype = 0; (ftype < F_NRE); ftype++)
 +    {
 +        if ((interaction_function[ftype].flags & IF_VSITE) &&
 +            ilist[ftype].nr > 0)
 +        {
 +            nra    = interaction_function[ftype].nratoms;
 +            inc    = 1 + nra;
 +            nr     = ilist[ftype].nr;
 +            ia     = ilist[ftype].iatoms;
 +
 +            if (bPBCAll)
 +            {
 +                pbc_null2 = pbc_null;
 +            }
 +            else if (pbc_null != NULL)
 +            {
 +                vsite_pbc = vsite->vsite_pbc_loc[ftype-F_VSITE2];
 +            }
 +
 +            for (i = 0; i < nr; )
 +            {
 +                tp   = ia[0];
 +
 +                /* The vsite and constructing atoms */
 +                avsite = ia[1];
 +                ai     = ia[2];
 +
 +                /* Constants for constructing vsites */
 +                a1   = ip[tp].vsite.a;
 +                /* Check what kind of pbc we need to use */
 +                if (bPBCAll)
 +                {
 +                    /* No charge groups, vsite follows its own pbc */
 +                    pbc_atom = avsite;
 +                    copy_rvec(x[avsite], xpbc);
 +                }
 +                else if (vsite_pbc != NULL)
 +                {
 +                    pbc_atom = vsite_pbc[i/(1+nra)];
 +                    if (pbc_atom > -2)
 +                    {
 +                        if (pbc_atom >= 0)
 +                        {
 +                            /* We need to copy the coordinates here,
 +                             * single for single atom cg's pbc_atom
 +                             * is the vsite itself.
 +                             */
 +                            copy_rvec(x[pbc_atom], xpbc);
 +                        }
 +                        pbc_null2 = pbc_null;
 +                    }
 +                    else
 +                    {
 +                        pbc_null2 = NULL;
 +                    }
 +                }
 +                else
 +                {
 +                    pbc_atom = -2;
 +                }
 +                /* Copy the old position */
 +                copy_rvec(x[avsite], xv);
 +
 +                /* Construct the vsite depending on type */
 +                switch (ftype)
 +                {
 +                    case F_VSITE2:
 +                        aj = ia[3];
 +                        constr_vsite2(x[ai], x[aj], x[avsite], a1, pbc_null2);
 +                        break;
 +                    case F_VSITE3:
 +                        aj = ia[3];
 +                        ak = ia[4];
 +                        b1 = ip[tp].vsite.b;
 +                        constr_vsite3(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 +                        break;
 +                    case F_VSITE3FD:
 +                        aj = ia[3];
 +                        ak = ia[4];
 +                        b1 = ip[tp].vsite.b;
 +                        constr_vsite3FD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 +                        break;
 +                    case F_VSITE3FAD:
 +                        aj = ia[3];
 +                        ak = ia[4];
 +                        b1 = ip[tp].vsite.b;
 +                        constr_vsite3FAD(x[ai], x[aj], x[ak], x[avsite], a1, b1, pbc_null2);
 +                        break;
 +                    case F_VSITE3OUT:
 +                        aj = ia[3];
 +                        ak = ia[4];
 +                        b1 = ip[tp].vsite.b;
 +                        c1 = ip[tp].vsite.c;
 +                        constr_vsite3OUT(x[ai], x[aj], x[ak], x[avsite], a1, b1, c1, pbc_null2);
 +                        break;
 +                    case F_VSITE4FD:
 +                        aj = ia[3];
 +                        ak = ia[4];
 +                        al = ia[5];
 +                        b1 = ip[tp].vsite.b;
 +                        c1 = ip[tp].vsite.c;
 +                        constr_vsite4FD(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 +                                        pbc_null2);
 +                        break;
 +                    case F_VSITE4FDN:
 +                        aj = ia[3];
 +                        ak = ia[4];
 +                        al = ia[5];
 +                        b1 = ip[tp].vsite.b;
 +                        c1 = ip[tp].vsite.c;
 +                        constr_vsite4FDN(x[ai], x[aj], x[ak], x[al], x[avsite], a1, b1, c1,
 +                                         pbc_null2);
 +                        break;
 +                    case F_VSITEN:
 +                        inc = constr_vsiten(ia, ip, x, pbc_null2);
 +                        break;
 +                    default:
 +                        gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
 +                                  ftype, __FILE__, __LINE__);
 +                }
 +
 +                if (pbc_atom >= 0)
 +                {
 +                    /* Match the pbc of this vsite to the rest of its charge group */
 +                    ishift = pbc_dx_aiuc(pbc_null, x[avsite], xpbc, dx);
 +                    if (ishift != CENTRAL)
 +                    {
 +                        rvec_add(xpbc, dx, x[avsite]);
 +                    }
 +                }
 +                if (v != NULL)
 +                {
 +                    /* Calculate velocity of vsite... */
 +                    rvec_sub(x[avsite], xv, vv);
 +                    svmul(inv_dt, vv, v[avsite]);
 +                }
 +
 +                /* Increment loop variables */
 +                i  += inc;
 +                ia += inc;
 +            }
 +        }
 +    }
 +}
 +
 +void construct_vsites(gmx_vsite_t *vsite,
 +                      rvec x[],
 +                      real dt, rvec *v,
 +                      t_iparams ip[], t_ilist ilist[],
 +                      int ePBC, gmx_bool bMolPBC,
 +                      t_commrec *cr, matrix box)
 +{
 +    t_pbc     pbc, *pbc_null;
 +    gmx_bool  bDomDec;
 +    int       nthreads;
 +
 +    bDomDec = cr && DOMAINDECOMP(cr);
 +
 +    /* We only need to do pbc when we have inter-cg vsites */
 +    if (ePBC != epbcNONE && (bDomDec || bMolPBC) && vsite->n_intercg_vsite)
 +    {
 +        /* This is wasting some CPU time as we now do this multiple times
 +         * per MD step. But how often do we have vsites with full pbc?
 +         */
 +        pbc_null = set_pbc_dd(&pbc, ePBC, cr != NULL ? cr->dd : NULL, FALSE, box);
 +    }
 +    else
 +    {
 +        pbc_null = NULL;
 +    }
 +
 +    if (bDomDec)
 +    {
 +        dd_move_x_vsites(cr->dd, box, x);
 +    }
 +
 +    if (vsite->nthreads == 1)
 +    {
 +        construct_vsites_thread(vsite,
 +                                x, dt, v,
 +                                ip, ilist,
 +                                pbc_null);
 +    }
 +    else
 +    {
 +#pragma omp parallel num_threads(vsite->nthreads)
 +        {
 +            construct_vsites_thread(vsite,
 +                                    x, dt, v,
 +                                    ip, vsite->tdata[gmx_omp_get_thread_num()].ilist,
 +                                    pbc_null);
 +        }
 +        /* Now we can construct the vsites that might depend on other vsites */
 +        construct_vsites_thread(vsite,
 +                                x, dt, v,
 +                                ip, vsite->tdata[vsite->nthreads].ilist,
 +                                pbc_null);
 +    }
 +}
 +
 +static void spread_vsite2(t_iatom ia[], real a,
 +                          rvec x[], rvec f[], rvec fshift[],
 +                          t_pbc *pbc, t_graph *g)
 +{
 +    rvec    fi, fj, dx;
 +    t_iatom av, ai, aj;
 +    ivec    di;
 +    real    b;
 +    int     siv, sij;
 +
 +    av = ia[1];
 +    ai = ia[2];
 +    aj = ia[3];
 +
 +    svmul(1-a, f[av], fi);
 +    svmul(  a, f[av], fj);
 +    /* 7 flop */
 +
 +    rvec_inc(f[ai], fi);
 +    rvec_inc(f[aj], fj);
 +    /* 6 Flops */
 +
 +    if (g)
 +    {
 +        ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
 +        siv = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 +        sij = IVEC2IS(di);
 +    }
 +    else if (pbc)
 +    {
 +        siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 +        sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 +    }
 +    else
 +    {
 +        siv = CENTRAL;
 +        sij = CENTRAL;
 +    }
 +
 +    if (fshift && (siv != CENTRAL || sij != CENTRAL))
 +    {
 +        rvec_inc(fshift[siv], f[av]);
 +        rvec_dec(fshift[CENTRAL], fi);
 +        rvec_dec(fshift[sij], fj);
 +    }
 +
 +    /* TOTAL: 13 flops */
 +}
 +
 +void construct_vsites_mtop(gmx_vsite_t *vsite,
 +                           gmx_mtop_t *mtop, rvec x[])
 +{
 +    int             as, mb, mol;
 +    gmx_molblock_t *molb;
 +    gmx_moltype_t  *molt;
 +
 +    as = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +        for (mol = 0; mol < molb->nmol; mol++)
 +        {
 +            construct_vsites(vsite, x+as, 0.0, NULL,
 +                             mtop->ffparams.iparams, molt->ilist,
 +                             epbcNONE, TRUE, NULL, NULL);
 +            as += molt->atoms.nr;
 +        }
 +    }
 +}
 +
 +static void spread_vsite3(t_iatom ia[], real a, real b,
 +                          rvec x[], rvec f[], rvec fshift[],
 +                          t_pbc *pbc, t_graph *g)
 +{
 +    rvec    fi, fj, fk, dx;
 +    atom_id av, ai, aj, ak;
 +    ivec    di;
 +    int     siv, sij, sik;
 +
 +    av = ia[1];
 +    ai = ia[2];
 +    aj = ia[3];
 +    ak = ia[4];
 +
 +    svmul(1-a-b, f[av], fi);
 +    svmul(    a, f[av], fj);
 +    svmul(    b, f[av], fk);
 +    /* 11 flops */
 +
 +    rvec_inc(f[ai], fi);
 +    rvec_inc(f[aj], fj);
 +    rvec_inc(f[ak], fk);
 +    /* 9 Flops */
 +
 +    if (g)
 +    {
 +        ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ia[1]), di);
 +        siv = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), di);
 +        sij = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), di);
 +        sik = IVEC2IS(di);
 +    }
 +    else if (pbc)
 +    {
 +        siv = pbc_dx_aiuc(pbc, x[ai], x[av], dx);
 +        sij = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 +        sik = pbc_dx_aiuc(pbc, x[ai], x[ak], dx);
 +    }
 +    else
 +    {
 +        siv = CENTRAL;
 +        sij = CENTRAL;
 +        sik = CENTRAL;
 +    }
 +
 +    if (fshift && (siv != CENTRAL || sij != CENTRAL || sik != CENTRAL))
 +    {
 +        rvec_inc(fshift[siv], f[av]);
 +        rvec_dec(fshift[CENTRAL], fi);
 +        rvec_dec(fshift[sij], fj);
 +        rvec_dec(fshift[sik], fk);
 +    }
 +
 +    /* TOTAL: 20 flops */
 +}
 +
 +static void spread_vsite3FD(t_iatom ia[], real a, real b,
 +                            rvec x[], rvec f[], rvec fshift[],
 +                            gmx_bool VirCorr, matrix dxdf,
 +                            t_pbc *pbc, t_graph *g)
 +{
 +    real    fx, fy, fz, c, invl, fproj, a1;
 +    rvec    xvi, xij, xjk, xix, fv, temp;
 +    t_iatom av, ai, aj, ak;
 +    int     svi, sji, skj, d;
 +    ivec    di;
 +
 +    av = ia[1];
 +    ai = ia[2];
 +    aj = ia[3];
 +    ak = ia[4];
 +    copy_rvec(f[av], fv);
 +
 +    sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 +    skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 +    /* 6 flops */
 +
 +    /* xix goes from i to point x on the line jk */
 +    xix[XX] = xij[XX]+a*xjk[XX];
 +    xix[YY] = xij[YY]+a*xjk[YY];
 +    xix[ZZ] = xij[ZZ]+a*xjk[ZZ];
 +    /* 6 flops */
 +
 +    invl = gmx_invsqrt(iprod(xix, xix));
 +    c    = b*invl;
 +    /* 4 + ?10? flops */
 +
 +    fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
 +
 +    temp[XX] = c*(fv[XX]-fproj*xix[XX]);
 +    temp[YY] = c*(fv[YY]-fproj*xix[YY]);
 +    temp[ZZ] = c*(fv[ZZ]-fproj*xix[ZZ]);
 +    /* 16 */
 +
 +    /* c is already calculated in constr_vsite3FD
 +       storing c somewhere will save 26 flops!     */
 +
 +    a1         = 1-a;
 +    f[ai][XX] += fv[XX] - temp[XX];
 +    f[ai][YY] += fv[YY] - temp[YY];
 +    f[ai][ZZ] += fv[ZZ] - temp[ZZ];
 +    f[aj][XX] += a1*temp[XX];
 +    f[aj][YY] += a1*temp[YY];
 +    f[aj][ZZ] += a1*temp[ZZ];
 +    f[ak][XX] += a*temp[XX];
 +    f[ak][YY] += a*temp[YY];
 +    f[ak][ZZ] += a*temp[ZZ];
 +    /* 19 Flops */
 +
 +    if (g)
 +    {
 +        ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 +        svi = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 +        sji = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 +        skj = IVEC2IS(di);
 +    }
 +    else if (pbc)
 +    {
 +        svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 +    }
 +    else
 +    {
 +        svi = CENTRAL;
 +    }
 +
 +    if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
 +    {
 +        rvec_dec(fshift[svi], fv);
 +        fshift[CENTRAL][XX] += fv[XX] - (1 + a)*temp[XX];
 +        fshift[CENTRAL][YY] += fv[YY] - (1 + a)*temp[YY];
 +        fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a)*temp[ZZ];
 +        fshift[    sji][XX] += temp[XX];
 +        fshift[    sji][YY] += temp[YY];
 +        fshift[    sji][ZZ] += temp[ZZ];
 +        fshift[    skj][XX] += a*temp[XX];
 +        fshift[    skj][YY] += a*temp[YY];
 +        fshift[    skj][ZZ] += a*temp[ZZ];
 +    }
 +
 +    if (VirCorr)
 +    {
 +        /* When VirCorr=TRUE, the virial for the current forces is not
 +         * calculated from the redistributed forces. This means that
 +         * the effect of non-linear virtual site constructions on the virial
 +         * needs to be added separately. This contribution can be calculated
 +         * in many ways, but the simplest and cheapest way is to use
 +         * the first constructing atom ai as a reference position in space:
 +         * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
 +         */
 +        rvec xiv;
 +        int  i, j;
 +
 +        pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            for (j = 0; j < DIM; j++)
 +            {
 +                /* As xix is a linear combination of j and k, use that here */
 +                dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
 +            }
 +        }
 +    }
 +
 +    /* TOTAL: 61 flops */
 +}
 +
 +static void spread_vsite3FAD(t_iatom ia[], real a, real b,
 +                             rvec x[], rvec f[], rvec fshift[],
 +                             gmx_bool VirCorr, matrix dxdf,
 +                             t_pbc *pbc, t_graph *g)
 +{
 +    rvec    xvi, xij, xjk, xperp, Fpij, Fppp, fv, f1, f2, f3;
 +    real    a1, b1, c1, c2, invdij, invdij2, invdp, fproj;
 +    t_iatom av, ai, aj, ak;
 +    int     svi, sji, skj, d;
 +    ivec    di;
 +
 +    av = ia[1];
 +    ai = ia[2];
 +    aj = ia[3];
 +    ak = ia[4];
 +    copy_rvec(f[ia[1]], fv);
 +
 +    sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 +    skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 +    /* 6 flops */
 +
 +    invdij    = gmx_invsqrt(iprod(xij, xij));
 +    invdij2   = invdij * invdij;
 +    c1        = iprod(xij, xjk) * invdij2;
 +    xperp[XX] = xjk[XX] - c1*xij[XX];
 +    xperp[YY] = xjk[YY] - c1*xij[YY];
 +    xperp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 +    /* xperp in plane ijk, perp. to ij */
 +    invdp = gmx_invsqrt(iprod(xperp, xperp));
 +    a1    = a*invdij;
 +    b1    = b*invdp;
 +    /* 45 flops */
 +
 +    /* a1, b1 and c1 are already calculated in constr_vsite3FAD
 +       storing them somewhere will save 45 flops!     */
 +
 +    fproj = iprod(xij, fv)*invdij2;
 +    svmul(fproj,                      xij,  Fpij);    /* proj. f on xij */
 +    svmul(iprod(xperp, fv)*invdp*invdp, xperp, Fppp); /* proj. f on xperp */
 +    svmul(b1*fproj,                   xperp, f3);
 +    /* 23 flops */
 +
 +    rvec_sub(fv, Fpij, f1); /* f1 = f - Fpij */
 +    rvec_sub(f1, Fppp, f2); /* f2 = f - Fpij - Fppp */
 +    for (d = 0; (d < DIM); d++)
 +    {
 +        f1[d] *= a1;
 +        f2[d] *= b1;
 +    }
 +    /* 12 flops */
 +
 +    c2         = 1+c1;
 +    f[ai][XX] += fv[XX] - f1[XX] + c1*f2[XX] + f3[XX];
 +    f[ai][YY] += fv[YY] - f1[YY] + c1*f2[YY] + f3[YY];
 +    f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1*f2[ZZ] + f3[ZZ];
 +    f[aj][XX] +=          f1[XX] - c2*f2[XX] - f3[XX];
 +    f[aj][YY] +=          f1[YY] - c2*f2[YY] - f3[YY];
 +    f[aj][ZZ] +=          f1[ZZ] - c2*f2[ZZ] - f3[ZZ];
 +    f[ak][XX] +=                      f2[XX];
 +    f[ak][YY] +=                      f2[YY];
 +    f[ak][ZZ] +=                      f2[ZZ];
 +    /* 30 Flops */
 +
 +    if (g)
 +    {
 +        ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 +        svi = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 +        sji = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 +        skj = IVEC2IS(di);
 +    }
 +    else if (pbc)
 +    {
 +        svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 +    }
 +    else
 +    {
 +        svi = CENTRAL;
 +    }
 +
 +    if (fshift && (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL))
 +    {
 +        rvec_dec(fshift[svi], fv);
 +        fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1-c1)*f2[XX] + f3[XX];
 +        fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1-c1)*f2[YY] + f3[YY];
 +        fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1-c1)*f2[ZZ] + f3[ZZ];
 +        fshift[    sji][XX] +=          f1[XX] -    c1 *f2[XX] - f3[XX];
 +        fshift[    sji][YY] +=          f1[YY] -    c1 *f2[YY] - f3[YY];
 +        fshift[    sji][ZZ] +=          f1[ZZ] -    c1 *f2[ZZ] - f3[ZZ];
 +        fshift[    skj][XX] +=                          f2[XX];
 +        fshift[    skj][YY] +=                          f2[YY];
 +        fshift[    skj][ZZ] +=                          f2[ZZ];
 +    }
 +
 +    if (VirCorr)
 +    {
 +        rvec xiv;
 +        int  i, j;
 +
 +        pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            for (j = 0; j < DIM; j++)
 +            {
 +                /* Note that xik=xij+xjk, so we have to add xij*f2 */
 +                dxdf[i][j] +=
 +                    -xiv[i]*fv[j]
 +                    + xij[i]*(f1[j] + (1 - c2)*f2[j] - f3[j])
 +                    + xjk[i]*f2[j];
 +            }
 +        }
 +    }
 +
 +    /* TOTAL: 113 flops */
 +}
 +
 +static void spread_vsite3OUT(t_iatom ia[], real a, real b, real c,
 +                             rvec x[], rvec f[], rvec fshift[],
 +                             gmx_bool VirCorr, matrix dxdf,
 +                             t_pbc *pbc, t_graph *g)
 +{
 +    rvec    xvi, xij, xik, fv, fj, fk;
 +    real    cfx, cfy, cfz;
 +    atom_id av, ai, aj, ak;
 +    ivec    di;
 +    int     svi, sji, ski;
 +
 +    av = ia[1];
 +    ai = ia[2];
 +    aj = ia[3];
 +    ak = ia[4];
 +
 +    sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 +    ski = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
 +    /* 6 Flops */
 +
 +    copy_rvec(f[av], fv);
 +
 +    cfx = c*fv[XX];
 +    cfy = c*fv[YY];
 +    cfz = c*fv[ZZ];
 +    /* 3 Flops */
 +
 +    fj[XX] = a*fv[XX]     -  xik[ZZ]*cfy +  xik[YY]*cfz;
 +    fj[YY] =  xik[ZZ]*cfx + a*fv[YY]     -  xik[XX]*cfz;
 +    fj[ZZ] = -xik[YY]*cfx +  xik[XX]*cfy + a*fv[ZZ];
 +
 +    fk[XX] = b*fv[XX]     +  xij[ZZ]*cfy -  xij[YY]*cfz;
 +    fk[YY] = -xij[ZZ]*cfx + b*fv[YY]     +  xij[XX]*cfz;
 +    fk[ZZ] =  xij[YY]*cfx -  xij[XX]*cfy + b*fv[ZZ];
 +    /* 30 Flops */
 +
 +    f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
 +    f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
 +    f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
 +    rvec_inc(f[aj], fj);
 +    rvec_inc(f[ak], fk);
 +    /* 15 Flops */
 +
 +    if (g)
 +    {
 +        ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 +        svi = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 +        sji = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
 +        ski = IVEC2IS(di);
 +    }
 +    else if (pbc)
 +    {
 +        svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 +    }
 +    else
 +    {
 +        svi = CENTRAL;
 +    }
 +
 +    if (fshift && (svi != CENTRAL || sji != CENTRAL || ski != CENTRAL))
 +    {
 +        rvec_dec(fshift[svi], fv);
 +        fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
 +        fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
 +        fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
 +        rvec_inc(fshift[sji], fj);
 +        rvec_inc(fshift[ski], fk);
 +    }
 +
 +    if (VirCorr)
 +    {
 +        rvec xiv;
 +        int  i, j;
 +
 +        pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            for (j = 0; j < DIM; j++)
 +            {
 +                dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j];
 +            }
 +        }
 +    }
 +
 +    /* TOTAL: 54 flops */
 +}
 +
 +static void spread_vsite4FD(t_iatom ia[], real a, real b, real c,
 +                            rvec x[], rvec f[], rvec fshift[],
 +                            gmx_bool VirCorr, matrix dxdf,
 +                            t_pbc *pbc, t_graph *g)
 +{
 +    real    d, invl, fproj, a1;
 +    rvec    xvi, xij, xjk, xjl, xix, fv, temp;
 +    atom_id av, ai, aj, ak, al;
 +    ivec    di;
 +    int     svi, sji, skj, slj, m;
 +
 +    av = ia[1];
 +    ai = ia[2];
 +    aj = ia[3];
 +    ak = ia[4];
 +    al = ia[5];
 +
 +    sji = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 +    skj = pbc_rvec_sub(pbc, x[ak], x[aj], xjk);
 +    slj = pbc_rvec_sub(pbc, x[al], x[aj], xjl);
 +    /* 9 flops */
 +
 +    /* xix goes from i to point x on the plane jkl */
 +    for (m = 0; m < DIM; m++)
 +    {
 +        xix[m] = xij[m] + a*xjk[m] + b*xjl[m];
 +    }
 +    /* 12 flops */
 +
 +    invl = gmx_invsqrt(iprod(xix, xix));
 +    d    = c*invl;
 +    /* 4 + ?10? flops */
 +
 +    copy_rvec(f[av], fv);
 +
 +    fproj = iprod(xix, fv)*invl*invl; /* = (xix . f)/(xix . xix) */
 +
 +    for (m = 0; m < DIM; m++)
 +    {
 +        temp[m] = d*(fv[m] - fproj*xix[m]);
 +    }
 +    /* 16 */
 +
 +    /* c is already calculated in constr_vsite3FD
 +       storing c somewhere will save 35 flops!     */
 +
 +    a1 = 1 - a - b;
 +    for (m = 0; m < DIM; m++)
 +    {
 +        f[ai][m] += fv[m] - temp[m];
 +        f[aj][m] += a1*temp[m];
 +        f[ak][m] += a*temp[m];
 +        f[al][m] += b*temp[m];
 +    }
 +    /* 26 Flops */
 +
 +    if (g)
 +    {
 +        ivec_sub(SHIFT_IVEC(g, ia[1]), SHIFT_IVEC(g, ai), di);
 +        svi = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 +        sji = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, aj), di);
 +        skj = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, aj), di);
 +        slj = IVEC2IS(di);
 +    }
 +    else if (pbc)
 +    {
 +        svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 +    }
 +    else
 +    {
 +        svi = CENTRAL;
 +    }
 +
 +    if (fshift &&
 +        (svi != CENTRAL || sji != CENTRAL || skj != CENTRAL || slj != CENTRAL))
 +    {
 +        rvec_dec(fshift[svi], fv);
 +        for (m = 0; m < DIM; m++)
 +        {
 +            fshift[CENTRAL][m] += fv[m] - (1 + a + b)*temp[m];
 +            fshift[    sji][m] += temp[m];
 +            fshift[    skj][m] += a*temp[m];
 +            fshift[    slj][m] += b*temp[m];
 +        }
 +    }
 +
 +    if (VirCorr)
 +    {
 +        rvec xiv;
 +        int  i, j;
 +
 +        pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            for (j = 0; j < DIM; j++)
 +            {
 +                dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
 +            }
 +        }
 +    }
 +
 +    /* TOTAL: 77 flops */
 +}
 +
 +
 +static void spread_vsite4FDN(t_iatom ia[], real a, real b, real c,
 +                             rvec x[], rvec f[], rvec fshift[],
 +                             gmx_bool VirCorr, matrix dxdf,
 +                             t_pbc *pbc, t_graph *g)
 +{
 +    rvec xvi, xij, xik, xil, ra, rb, rja, rjb, rab, rm, rt;
 +    rvec fv, fj, fk, fl;
 +    real invrm, denom;
 +    real cfx, cfy, cfz;
 +    ivec di;
 +    int  av, ai, aj, ak, al;
 +    int  svi, sij, sik, sil;
 +
 +    /* DEBUG: check atom indices */
 +    av = ia[1];
 +    ai = ia[2];
 +    aj = ia[3];
 +    ak = ia[4];
 +    al = ia[5];
 +
 +    copy_rvec(f[av], fv);
 +
 +    sij = pbc_rvec_sub(pbc, x[aj], x[ai], xij);
 +    sik = pbc_rvec_sub(pbc, x[ak], x[ai], xik);
 +    sil = pbc_rvec_sub(pbc, x[al], x[ai], xil);
 +    /* 9 flops */
 +
 +    ra[XX] = a*xik[XX];
 +    ra[YY] = a*xik[YY];
 +    ra[ZZ] = a*xik[ZZ];
 +
 +    rb[XX] = b*xil[XX];
 +    rb[YY] = b*xil[YY];
 +    rb[ZZ] = b*xil[ZZ];
 +
 +    /* 6 flops */
 +
 +    rvec_sub(ra, xij, rja);
 +    rvec_sub(rb, xij, rjb);
 +    rvec_sub(rb, ra, rab);
 +    /* 9 flops */
 +
 +    cprod(rja, rjb, rm);
 +    /* 9 flops */
 +
 +    invrm = gmx_invsqrt(norm2(rm));
 +    denom = invrm*invrm;
 +    /* 5+5+2 flops */
 +
 +    cfx = c*invrm*fv[XX];
 +    cfy = c*invrm*fv[YY];
 +    cfz = c*invrm*fv[ZZ];
 +    /* 6 Flops */
 +
 +    cprod(rm, rab, rt);
 +    /* 9 flops */
 +
 +    rt[XX] *= denom;
 +    rt[YY] *= denom;
 +    rt[ZZ] *= denom;
 +    /* 3flops */
 +
 +    fj[XX] = (        -rm[XX]*rt[XX]) * cfx + ( rab[ZZ]-rm[YY]*rt[XX]) * cfy + (-rab[YY]-rm[ZZ]*rt[XX]) * cfz;
 +    fj[YY] = (-rab[ZZ]-rm[XX]*rt[YY]) * cfx + (        -rm[YY]*rt[YY]) * cfy + ( rab[XX]-rm[ZZ]*rt[YY]) * cfz;
 +    fj[ZZ] = ( rab[YY]-rm[XX]*rt[ZZ]) * cfx + (-rab[XX]-rm[YY]*rt[ZZ]) * cfy + (        -rm[ZZ]*rt[ZZ]) * cfz;
 +    /* 30 flops */
 +
 +    cprod(rjb, rm, rt);
 +    /* 9 flops */
 +
 +    rt[XX] *= denom*a;
 +    rt[YY] *= denom*a;
 +    rt[ZZ] *= denom*a;
 +    /* 3flops */
 +
 +    fk[XX] = (          -rm[XX]*rt[XX]) * cfx + (-a*rjb[ZZ]-rm[YY]*rt[XX]) * cfy + ( a*rjb[YY]-rm[ZZ]*rt[XX]) * cfz;
 +    fk[YY] = ( a*rjb[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + (-a*rjb[XX]-rm[ZZ]*rt[YY]) * cfz;
 +    fk[ZZ] = (-a*rjb[YY]-rm[XX]*rt[ZZ]) * cfx + ( a*rjb[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
 +    /* 36 flops */
 +
 +    cprod(rm, rja, rt);
 +    /* 9 flops */
 +
 +    rt[XX] *= denom*b;
 +    rt[YY] *= denom*b;
 +    rt[ZZ] *= denom*b;
 +    /* 3flops */
 +
 +    fl[XX] = (          -rm[XX]*rt[XX]) * cfx + ( b*rja[ZZ]-rm[YY]*rt[XX]) * cfy + (-b*rja[YY]-rm[ZZ]*rt[XX]) * cfz;
 +    fl[YY] = (-b*rja[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + ( b*rja[XX]-rm[ZZ]*rt[YY]) * cfz;
 +    fl[ZZ] = ( b*rja[YY]-rm[XX]*rt[ZZ]) * cfx + (-b*rja[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
 +    /* 36 flops */
 +
 +    f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
 +    f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
 +    f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
 +    rvec_inc(f[aj], fj);
 +    rvec_inc(f[ak], fk);
 +    rvec_inc(f[al], fl);
 +    /* 21 flops */
 +
 +    if (g)
 +    {
 +        ivec_sub(SHIFT_IVEC(g, av), SHIFT_IVEC(g, ai), di);
 +        svi = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, aj), SHIFT_IVEC(g, ai), di);
 +        sij = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, ai), di);
 +        sik = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g, al), SHIFT_IVEC(g, ai), di);
 +        sil = IVEC2IS(di);
 +    }
 +    else if (pbc)
 +    {
 +        svi = pbc_rvec_sub(pbc, x[av], x[ai], xvi);
 +    }
 +    else
 +    {
 +        svi = CENTRAL;
 +    }
 +
 +    if (fshift && (svi != CENTRAL || sij != CENTRAL || sik != CENTRAL || sil != CENTRAL))
 +    {
 +        rvec_dec(fshift[svi], fv);
 +        fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
 +        fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
 +        fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
 +        rvec_inc(fshift[sij], fj);
 +        rvec_inc(fshift[sik], fk);
 +        rvec_inc(fshift[sil], fl);
 +    }
 +
 +    if (VirCorr)
 +    {
 +        rvec xiv;
 +        int  i, j;
 +
 +        pbc_rvec_sub(pbc, x[av], x[ai], xiv);
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            for (j = 0; j < DIM; j++)
 +            {
 +                dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j] + xil[i]*fl[j];
 +            }
 +        }
 +    }
 +
 +    /* Total: 207 flops (Yuck!) */
 +}
 +
 +
 +static int spread_vsiten(t_iatom ia[], t_iparams ip[],
 +                         rvec x[], rvec f[], rvec fshift[],
 +                         t_pbc *pbc, t_graph *g)
 +{
 +    rvec xv, dx, fi;
 +    int  n3, av, i, ai;
 +    real a;
 +    ivec di;
 +    int  siv;
 +
 +    n3 = 3*ip[ia[0]].vsiten.n;
 +    av = ia[1];
 +    copy_rvec(x[av], xv);
 +
 +    for (i = 0; i < n3; i += 3)
 +    {
 +        ai = ia[i+2];
 +        if (g)
 +        {
 +            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, av), di);
 +            siv = IVEC2IS(di);
 +        }
 +        else if (pbc)
 +        {
 +            siv = pbc_dx_aiuc(pbc, x[ai], xv, dx);
 +        }
 +        else
 +        {
 +            siv = CENTRAL;
 +        }
 +        a = ip[ia[i]].vsiten.a;
 +        svmul(a, f[av], fi);
 +        rvec_inc(f[ai], fi);
 +        if (fshift && siv != CENTRAL)
 +        {
 +            rvec_inc(fshift[siv], fi);
 +            rvec_dec(fshift[CENTRAL], fi);
 +        }
 +        /* 6 Flops */
 +    }
 +
 +    return n3;
 +}
 +
 +
 +static int vsite_count(const t_ilist *ilist, int ftype)
 +{
 +    if (ftype == F_VSITEN)
 +    {
 +        return ilist[ftype].nr/3;
 +    }
 +    else
 +    {
 +        return ilist[ftype].nr/(1 + interaction_function[ftype].nratoms);
 +    }
 +}
 +
 +static void spread_vsite_f_thread(gmx_vsite_t *vsite,
 +                                  rvec x[], rvec f[], rvec *fshift,
 +                                  gmx_bool VirCorr, matrix dxdf,
 +                                  t_iparams ip[], t_ilist ilist[],
 +                                  t_graph *g, t_pbc *pbc_null)
 +{
 +    gmx_bool   bPBCAll;
 +    real       a1, b1, c1;
 +    int        i, inc, m, nra, nr, tp, ftype;
 +    t_iatom   *ia;
 +    t_pbc     *pbc_null2;
 +    int       *vsite_pbc;
 +
 +    if (VirCorr)
 +    {
 +        clear_mat(dxdf);
 +    }
 +
 +    bPBCAll = (pbc_null != NULL && !vsite->bHaveChargeGroups);
 +
 +    /* this loop goes backwards to be able to build *
 +     * higher type vsites from lower types         */
 +    pbc_null2 = NULL;
 +    vsite_pbc = NULL;
 +    for (ftype = F_NRE-1; (ftype >= 0); ftype--)
 +    {
 +        if ((interaction_function[ftype].flags & IF_VSITE) &&
 +            ilist[ftype].nr > 0)
 +        {
 +            nra    = interaction_function[ftype].nratoms;
 +            inc    = 1 + nra;
 +            nr     = ilist[ftype].nr;
 +            ia     = ilist[ftype].iatoms;
 +
 +            if (bPBCAll)
 +            {
 +                pbc_null2 = pbc_null;
 +            }
 +            else if (pbc_null != NULL)
 +            {
 +                vsite_pbc = vsite->vsite_pbc_loc[ftype-F_VSITE2];
 +            }
 +
 +            for (i = 0; i < nr; )
 +            {
 +                if (vsite_pbc != NULL)
 +                {
 +                    if (vsite_pbc[i/(1+nra)] > -2)
 +                    {
 +                        pbc_null2 = pbc_null;
 +                    }
 +                    else
 +                    {
 +                        pbc_null2 = NULL;
 +                    }
 +                }
 +
 +                tp   = ia[0];
 +
 +                /* Constants for constructing */
 +                a1   = ip[tp].vsite.a;
 +                /* Construct the vsite depending on type */
 +                switch (ftype)
 +                {
 +                    case F_VSITE2:
 +                        spread_vsite2(ia, a1, x, f, fshift, pbc_null2, g);
 +                        break;
 +                    case F_VSITE3:
 +                        b1 = ip[tp].vsite.b;
 +                        spread_vsite3(ia, a1, b1, x, f, fshift, pbc_null2, g);
 +                        break;
 +                    case F_VSITE3FD:
 +                        b1 = ip[tp].vsite.b;
 +                        spread_vsite3FD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
 +                        break;
 +                    case F_VSITE3FAD:
 +                        b1 = ip[tp].vsite.b;
 +                        spread_vsite3FAD(ia, a1, b1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
 +                        break;
 +                    case F_VSITE3OUT:
 +                        b1 = ip[tp].vsite.b;
 +                        c1 = ip[tp].vsite.c;
 +                        spread_vsite3OUT(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
 +                        break;
 +                    case F_VSITE4FD:
 +                        b1 = ip[tp].vsite.b;
 +                        c1 = ip[tp].vsite.c;
 +                        spread_vsite4FD(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
 +                        break;
 +                    case F_VSITE4FDN:
 +                        b1 = ip[tp].vsite.b;
 +                        c1 = ip[tp].vsite.c;
 +                        spread_vsite4FDN(ia, a1, b1, c1, x, f, fshift, VirCorr, dxdf, pbc_null2, g);
 +                        break;
 +                    case F_VSITEN:
 +                        inc = spread_vsiten(ia, ip, x, f, fshift, pbc_null2, g);
 +                        break;
 +                    default:
 +                        gmx_fatal(FARGS, "No such vsite type %d in %s, line %d",
 +                                  ftype, __FILE__, __LINE__);
 +                }
 +                clear_rvec(f[ia[1]]);
 +
 +                /* Increment loop variables */
 +                i  += inc;
 +                ia += inc;
 +            }
 +        }
 +    }
 +}
 +
 +void spread_vsite_f(gmx_vsite_t *vsite,
 +                    rvec x[], rvec f[], rvec *fshift,
 +                    gmx_bool VirCorr, matrix vir,
 +                    t_nrnb *nrnb, t_idef *idef,
 +                    int ePBC, gmx_bool bMolPBC, t_graph *g, matrix box,
 +                    t_commrec *cr)
 +{
 +    t_pbc pbc, *pbc_null;
 +    int   th;
 +
 +    /* We only need to do pbc when we have inter-cg vsites */
 +    if ((DOMAINDECOMP(cr) || bMolPBC) && vsite->n_intercg_vsite)
 +    {
 +        /* This is wasting some CPU time as we now do this multiple times
 +         * per MD step. But how often do we have vsites with full pbc?
 +         */
 +        pbc_null = set_pbc_dd(&pbc, ePBC, cr->dd, FALSE, box);
 +    }
 +    else
 +    {
 +        pbc_null = NULL;
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        dd_clear_f_vsites(cr->dd, f);
 +    }
 +
 +    if (vsite->nthreads == 1)
 +    {
 +        spread_vsite_f_thread(vsite,
 +                              x, f, fshift,
 +                              VirCorr, vsite->tdata[0].dxdf,
 +                              idef->iparams, idef->il,
 +                              g, pbc_null);
 +    }
 +    else
 +    {
 +        /* First spread the vsites that might depend on other vsites */
 +        spread_vsite_f_thread(vsite,
 +                              x, f, fshift,
 +                              VirCorr, vsite->tdata[vsite->nthreads].dxdf,
 +                              idef->iparams,
 +                              vsite->tdata[vsite->nthreads].ilist,
 +                              g, pbc_null);
 +
 +#pragma omp parallel num_threads(vsite->nthreads)
 +        {
 +            int   thread;
 +            rvec *fshift_t;
 +
 +            thread = gmx_omp_get_thread_num();
 +
 +            if (thread == 0 || fshift == NULL)
 +            {
 +                fshift_t = fshift;
 +            }
 +            else
 +            {
 +                int i;
 +
 +                fshift_t = vsite->tdata[thread].fshift;
 +
 +                for (i = 0; i < SHIFTS; i++)
 +                {
 +                    clear_rvec(fshift_t[i]);
 +                }
 +            }
 +
 +            spread_vsite_f_thread(vsite,
 +                                  x, f, fshift_t,
 +                                  VirCorr, vsite->tdata[thread].dxdf,
 +                                  idef->iparams,
 +                                  vsite->tdata[thread].ilist,
 +                                  g, pbc_null);
 +        }
 +
 +        if (fshift != NULL)
 +        {
 +            int i;
 +
 +            for (th = 1; th < vsite->nthreads; th++)
 +            {
 +                for (i = 0; i < SHIFTS; i++)
 +                {
 +                    rvec_inc(fshift[i], vsite->tdata[th].fshift[i]);
 +                }
 +            }
 +        }
 +    }
 +
 +    if (VirCorr)
 +    {
 +        int i, j;
 +
 +        for (th = 0; th < (vsite->nthreads == 1 ? 1 : vsite->nthreads+1); th++)
 +        {
 +            for (i = 0; i < DIM; i++)
 +            {
 +                for (j = 0; j < DIM; j++)
 +                {
 +                    vir[i][j] += -0.5*vsite->tdata[th].dxdf[i][j];
 +                }
 +            }
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        dd_move_f_vsites(cr->dd, f, fshift);
 +    }
 +
 +    inc_nrnb(nrnb, eNR_VSITE2,   vsite_count(idef->il, F_VSITE2));
 +    inc_nrnb(nrnb, eNR_VSITE3,   vsite_count(idef->il, F_VSITE3));
 +    inc_nrnb(nrnb, eNR_VSITE3FD, vsite_count(idef->il, F_VSITE3FD));
 +    inc_nrnb(nrnb, eNR_VSITE3FAD, vsite_count(idef->il, F_VSITE3FAD));
 +    inc_nrnb(nrnb, eNR_VSITE3OUT, vsite_count(idef->il, F_VSITE3OUT));
 +    inc_nrnb(nrnb, eNR_VSITE4FD, vsite_count(idef->il, F_VSITE4FD));
 +    inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(idef->il, F_VSITE4FDN));
 +    inc_nrnb(nrnb, eNR_VSITEN,   vsite_count(idef->il, F_VSITEN));
 +}
 +
 +static int *atom2cg(t_block *cgs)
 +{
 +    int *a2cg, cg, i;
 +
 +    snew(a2cg, cgs->index[cgs->nr]);
 +    for (cg = 0; cg < cgs->nr; cg++)
 +    {
 +        for (i = cgs->index[cg]; i < cgs->index[cg+1]; i++)
 +        {
 +            a2cg[i] = cg;
 +        }
 +    }
 +
 +    return a2cg;
 +}
 +
 +static int count_intercg_vsite(gmx_mtop_t *mtop,
 +                               gmx_bool   *bHaveChargeGroups)
 +{
 +    int             mb, mt, ftype, nral, i, cg, a;
 +    gmx_molblock_t *molb;
 +    gmx_moltype_t  *molt;
 +    int            *a2cg;
 +    t_ilist        *il;
 +    t_iatom        *ia;
 +    int             n_intercg_vsite;
 +
 +    *bHaveChargeGroups = FALSE;
 +
 +    n_intercg_vsite = 0;
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +
 +        if (molt->cgs.nr < molt->atoms.nr)
 +        {
 +            *bHaveChargeGroups = TRUE;
 +        }
 +
 +        a2cg = atom2cg(&molt->cgs);
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            if (interaction_function[ftype].flags & IF_VSITE)
 +            {
 +                nral = NRAL(ftype);
 +                il   = &molt->ilist[ftype];
 +                ia   = il->iatoms;
 +                for (i = 0; i < il->nr; i += 1+nral)
 +                {
 +                    cg = a2cg[ia[1+i]];
 +                    for (a = 1; a < nral; a++)
 +                    {
 +                        if (a2cg[ia[1+a]] != cg)
 +                        {
 +                            n_intercg_vsite += molb->nmol;
 +                            break;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +        sfree(a2cg);
 +    }
 +
 +    return n_intercg_vsite;
 +}
 +
 +static int **get_vsite_pbc(t_iparams *iparams, t_ilist *ilist,
 +                           t_atom *atom, t_mdatoms *md,
 +                           t_block *cgs, int *a2cg)
 +{
 +    int      ftype, nral, i, j, vsi, vsite, cg_v, cg_c, a, nc3 = 0;
 +    t_ilist *il;
 +    t_iatom *ia;
 +    int    **vsite_pbc, *vsite_pbc_f;
 +    char    *pbc_set;
 +    gmx_bool bViteOnlyCG_and_FirstAtom;
 +
 +    /* Make an array that tells if the pbc of an atom is set */
 +    snew(pbc_set, cgs->index[cgs->nr]);
 +    /* PBC is set for all non vsites */
 +    for (a = 0; a < cgs->index[cgs->nr]; a++)
 +    {
 +        if ((atom && atom[a].ptype != eptVSite) ||
 +            (md   && md->ptype[a]  != eptVSite))
 +        {
 +            pbc_set[a] = 1;
 +        }
 +    }
 +
 +    snew(vsite_pbc, F_VSITEN-F_VSITE2+1);
 +
 +    for (ftype = 0; ftype < F_NRE; ftype++)
 +    {
 +        if (interaction_function[ftype].flags & IF_VSITE)
 +        {
 +            nral = NRAL(ftype);
 +            il   = &ilist[ftype];
 +            ia   = il->iatoms;
 +
 +            snew(vsite_pbc[ftype-F_VSITE2], il->nr/(1+nral));
 +            vsite_pbc_f = vsite_pbc[ftype-F_VSITE2];
 +
 +            i = 0;
 +            while (i < il->nr)
 +            {
 +                vsi   = i/(1+nral);
 +                vsite = ia[i+1];
 +                cg_v  = a2cg[vsite];
 +                /* A value of -2 signals that this vsite and its contructing
 +                 * atoms are all within the same cg, so no pbc is required.
 +                 */
 +                vsite_pbc_f[vsi] = -2;
 +                /* Check if constructing atoms are outside the vsite's cg */
 +                nc3 = 0;
 +                if (ftype == F_VSITEN)
 +                {
 +                    nc3 = 3*iparams[ia[i]].vsiten.n;
 +                    for (j = 0; j < nc3; j += 3)
 +                    {
 +                        if (a2cg[ia[i+j+2]] != cg_v)
 +                        {
 +                            vsite_pbc_f[vsi] = -1;
 +                        }
 +                    }
 +                }
 +                else
 +                {
 +                    for (a = 1; a < nral; a++)
 +                    {
 +                        if (a2cg[ia[i+1+a]] != cg_v)
 +                        {
 +                            vsite_pbc_f[vsi] = -1;
 +                        }
 +                    }
 +                }
 +                if (vsite_pbc_f[vsi] == -1)
 +                {
 +                    /* Check if this is the first processed atom of a vsite only cg */
 +                    bViteOnlyCG_and_FirstAtom = TRUE;
 +                    for (a = cgs->index[cg_v]; a < cgs->index[cg_v+1]; a++)
 +                    {
 +                        /* Non-vsites already have pbc set, so simply check for pbc_set */
 +                        if (pbc_set[a])
 +                        {
 +                            bViteOnlyCG_and_FirstAtom = FALSE;
 +                            break;
 +                        }
 +                    }
 +                    if (bViteOnlyCG_and_FirstAtom)
 +                    {
 +                        /* First processed atom of a vsite only charge group.
 +                         * The pbc of the input coordinates to construct_vsites
 +                         * should be preserved.
 +                         */
 +                        vsite_pbc_f[vsi] = vsite;
 +                    }
 +                    else if (cg_v != a2cg[ia[1+i+1]])
 +                    {
 +                        /* This vsite has a different charge group index
 +                         * than it's first constructing atom
 +                         * and the charge group has more than one atom,
 +                         * search for the first normal particle
 +                         * or vsite that already had its pbc defined.
 +                         * If nothing is found, use full pbc for this vsite.
 +                         */
 +                        for (a = cgs->index[cg_v]; a < cgs->index[cg_v+1]; a++)
 +                        {
 +                            if (a != vsite && pbc_set[a])
 +                            {
 +                                vsite_pbc_f[vsi] = a;
 +                                if (gmx_debug_at)
 +                                {
 +                                    fprintf(debug, "vsite %d match pbc with atom %d\n",
 +                                            vsite+1, a+1);
 +                                }
 +                                break;
 +                            }
 +                        }
 +                        if (gmx_debug_at)
 +                        {
 +                            fprintf(debug, "vsite atom %d  cg %d - %d pbc atom %d\n",
 +                                    vsite+1, cgs->index[cg_v]+1, cgs->index[cg_v+1],
 +                                    vsite_pbc_f[vsi]+1);
 +                        }
 +                    }
 +                }
 +                if (ftype == F_VSITEN)
 +                {
 +                    /* The other entries in vsite_pbc_f are not used for center vsites */
 +                    i += nc3;
 +                }
 +                else
 +                {
 +                    i += 1+nral;
 +                }
 +
 +                /* This vsite now has its pbc defined */
 +                pbc_set[vsite] = 1;
 +            }
 +        }
 +    }
 +
 +    sfree(pbc_set);
 +
 +    return vsite_pbc;
 +}
 +
 +
 +gmx_vsite_t *init_vsite(gmx_mtop_t *mtop, t_commrec *cr,
 +                        gmx_bool bSerial_NoPBC)
 +{
 +    int            nvsite, i;
 +    int           *a2cg, cg;
 +    gmx_vsite_t   *vsite;
 +    int            mt;
 +    gmx_moltype_t *molt;
 +    int            nthreads;
 +
 +    /* check if there are vsites */
 +    nvsite = 0;
 +    for (i = 0; i < F_NRE; i++)
 +    {
 +        if (interaction_function[i].flags & IF_VSITE)
 +        {
 +            nvsite += gmx_mtop_ftype_count(mtop, i);
 +        }
 +    }
 +
 +    if (nvsite == 0)
 +    {
 +        return NULL;
 +    }
 +
 +    snew(vsite, 1);
 +
 +    vsite->n_intercg_vsite = count_intercg_vsite(mtop,
 +                                                 &vsite->bHaveChargeGroups);
 +
 +    /* If we don't have charge groups, the vsite follows its own pbc */
 +    if (!bSerial_NoPBC &&
 +        vsite->bHaveChargeGroups &&
 +        vsite->n_intercg_vsite > 0 && DOMAINDECOMP(cr))
 +    {
 +        vsite->nvsite_pbc_molt = mtop->nmoltype;
 +        snew(vsite->vsite_pbc_molt, vsite->nvsite_pbc_molt);
 +        for (mt = 0; mt < mtop->nmoltype; mt++)
 +        {
 +            molt = &mtop->moltype[mt];
 +            /* Make an atom to charge group index */
 +            a2cg = atom2cg(&molt->cgs);
 +            vsite->vsite_pbc_molt[mt] = get_vsite_pbc(mtop->ffparams.iparams,
 +                                                      molt->ilist,
 +                                                      molt->atoms.atom, NULL,
 +                                                      &molt->cgs, a2cg);
 +            sfree(a2cg);
 +        }
 +
 +        snew(vsite->vsite_pbc_loc_nalloc, F_VSITEN-F_VSITE2+1);
 +        snew(vsite->vsite_pbc_loc, F_VSITEN-F_VSITE2+1);
 +    }
 +
 +    if (bSerial_NoPBC)
 +    {
 +        vsite->nthreads = 1;
 +    }
 +    else
 +    {
 +        vsite->nthreads = gmx_omp_nthreads_get(emntVSITE);
 +    }
 +    if (!bSerial_NoPBC)
 +    {
 +        /* We need one extra thread data structure for the overlap vsites */
 +        snew(vsite->tdata, vsite->nthreads+1);
 +    }
 +
 +    vsite->th_ind        = NULL;
 +    vsite->th_ind_nalloc = 0;
 +
 +    return vsite;
 +}
 +
 +static void prepare_vsite_thread(const t_ilist      *ilist,
 +                                 gmx_vsite_thread_t *vsite_th)
 +{
 +    int ftype;
 +
 +    for (ftype = 0; ftype < F_NRE; ftype++)
 +    {
 +        if (interaction_function[ftype].flags & IF_VSITE)
 +        {
 +            if (ilist[ftype].nr > vsite_th->ilist[ftype].nalloc)
 +            {
 +                vsite_th->ilist[ftype].nalloc = over_alloc_large(ilist[ftype].nr);
 +                srenew(vsite_th->ilist[ftype].iatoms, vsite_th->ilist[ftype].nalloc);
 +            }
 +
 +            vsite_th->ilist[ftype].nr = 0;
 +        }
 +    }
 +}
 +
 +void split_vsites_over_threads(const t_ilist   *ilist,
++                               const t_iparams *ip,
 +                               const t_mdatoms *mdatoms,
 +                               gmx_bool         bLimitRange,
 +                               gmx_vsite_t     *vsite)
 +{
 +    int      th;
 +    int      vsite_atom_range, natperthread;
 +    int     *th_ind;
 +    int      ftype;
 +    t_iatom *iat;
 +    t_ilist *il_th;
 +    int      nral1, inc, i, j;
 +
 +    if (vsite->nthreads == 1)
 +    {
 +        /* Nothing to do */
 +        return;
 +    }
 +
 +#pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
 +    for (th = 0; th < vsite->nthreads; th++)
 +    {
 +        prepare_vsite_thread(ilist, &vsite->tdata[th]);
 +    }
 +    /* Master threads does the (potential) overlap vsites */
 +    prepare_vsite_thread(ilist, &vsite->tdata[vsite->nthreads]);
 +
 +    /* The current way of distributing the vsites over threads in primitive.
 +     * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
 +     * without taking into account how the vsites are distributed.
 +     * Without domain decomposition we bLimitRange=TRUE and we at least
 +     * tighten the upper bound of the range (useful for common systems
 +     * such as a vsite-protein in 3-site water).
 +     */
 +    if (bLimitRange)
 +    {
 +        vsite_atom_range = -1;
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
-                 nral1 = 1 + NRAL(ftype);
-                 iat   = ilist[ftype].iatoms;
-                 for (i = 0; i < ilist[ftype].nr; i += nral1)
++            if (interaction_function[ftype].flags & IF_VSITE)
 +            {
-                     for (j = i+1; j < i+nral1; j++)
++                if (ftype != F_VSITEN)
 +                {
-                         vsite_atom_range = max(vsite_atom_range, iat[j]);
++                    nral1 = 1 + NRAL(ftype);
++                    iat   = ilist[ftype].iatoms;
++                    for (i = 0; i < ilist[ftype].nr; i += nral1)
 +                    {
-         if ((interaction_function[ftype].flags & IF_VSITE) &&
-             ftype != F_VSITEN)
++                        for (j = i + 1; j < i + nral1; j++)
++                        {
++                            vsite_atom_range = max(vsite_atom_range, iat[j]);
++                        }
++                    }
++                }
++                else
++                {
++                    int vs_ind_end;
++
++                    iat = ilist[ftype].iatoms;
++
++                    i = 0;
++                    while (i < ilist[ftype].nr)
++                    {
++                        /* The 3 below is from 1+NRAL(ftype)=3 */
++                        vs_ind_end = i + ip[iat[i]].vsiten.n*3;
++
++                        vsite_atom_range = max(vsite_atom_range, iat[i+1]);
++                        while (i < vs_ind_end)
++                        {
++                            vsite_atom_range = max(vsite_atom_range, iat[i+2]);
++                            i               += 3;
++                        }
 +                    }
 +                }
 +            }
 +        }
 +        vsite_atom_range++;
 +    }
 +    else
 +    {
 +        vsite_atom_range = mdatoms->homenr;
 +    }
 +    natperthread = (vsite_atom_range + vsite->nthreads - 1)/vsite->nthreads;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "virtual site thread dist: natoms %d, range %d, natperthread %d\n", mdatoms->nr, vsite_atom_range, natperthread);
 +    }
 +
 +    /* To simplify the vsite assignment, we make an index which tells us
 +     * to which thread particles, both non-vsites and vsites, are assigned.
 +     */
 +    if (mdatoms->nr > vsite->th_ind_nalloc)
 +    {
 +        vsite->th_ind_nalloc = over_alloc_large(mdatoms->nr);
 +        srenew(vsite->th_ind, vsite->th_ind_nalloc);
 +    }
 +    th_ind = vsite->th_ind;
 +    th     = 0;
 +    for (i = 0; i < mdatoms->nr; i++)
 +    {
 +        if (mdatoms->ptype[i] == eptVSite)
 +        {
 +            /* vsites are not assigned to a thread yet */
 +            th_ind[i] = -1;
 +        }
 +        else
 +        {
 +            /* assign non-vsite particles to thread th */
 +            th_ind[i] = th;
 +        }
 +        if (i == (th + 1)*natperthread && th < vsite->nthreads)
 +        {
 +            th++;
 +        }
 +    }
 +
 +    for (ftype = 0; ftype < F_NRE; ftype++)
 +    {
-                     for (j = i+2; j < i+nral1; j++)
++        if (interaction_function[ftype].flags & IF_VSITE)
 +        {
 +            nral1 = 1 + NRAL(ftype);
 +            inc   = nral1;
 +            iat   = ilist[ftype].iatoms;
 +            for (i = 0; i < ilist[ftype].nr; )
 +            {
 +                th = iat[1+i]/natperthread;
 +                /* We would like to assign this vsite the thread th,
 +                 * but it might depend on atoms outside the atom range of th
 +                 * or on another vsite not assigned to thread th.
 +                 */
 +                if (ftype != F_VSITEN)
 +                {
-                     inc = iat[i];
-                     for (j = i+2; j < i+inc; j += 3)
++                    for (j = i + 2; j < i + nral1; j++)
 +                    {
 +                        if (th_ind[iat[j]] != th)
 +                        {
 +                            /* Some constructing atoms are not assigned to
 +                             * thread th, move this vsite to a separate batch.
 +                             */
 +                            th = vsite->nthreads;
 +                        }
 +                    }
 +                }
 +                else
 +                {
-                 for (j = i; j < i+inc; j++)
++                    /* The 3 below is from 1+NRAL(ftype)=3 */
++                    inc = ip[iat[i]].vsiten.n*3;
++                    for (j = i + 2; j < i + inc; j += 3)
 +                    {
 +                        if (th_ind[iat[j]] != th)
 +                        {
 +                            th = vsite->nthreads;
 +                        }
 +                    }
 +                }
 +                /* Copy this vsite to the thread data struct of thread th */
 +                il_th = &vsite->tdata[th].ilist[ftype];
-         split_vsites_over_threads(top->idef.il, md, !DOMAINDECOMP(cr), vsite);
++                for (j = i; j < i + inc; j++)
 +                {
 +                    il_th->iatoms[il_th->nr++] = iat[j];
 +                }
 +                /* Update this vsite's thread index entry */
 +                th_ind[iat[1+i]] = th;
 +
 +                i += inc;
 +            }
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            if ((interaction_function[ftype].flags & IF_VSITE) &&
 +                ilist[ftype].nr > 0)
 +            {
 +                fprintf(debug, "%-20s thread dist:",
 +                        interaction_function[ftype].longname);
 +                for (th = 0; th < vsite->nthreads+1; th++)
 +                {
 +                    fprintf(debug, " %4d", vsite->tdata[th].ilist[ftype].nr);
 +                }
 +                fprintf(debug, "\n");
 +            }
 +        }
 +    }
 +}
 +
 +void set_vsite_top(gmx_vsite_t *vsite, gmx_localtop_t *top, t_mdatoms *md,
 +                   t_commrec *cr)
 +{
 +    int *a2cg;
 +
 +    if (vsite->n_intercg_vsite > 0)
 +    {
 +        if (vsite->bHaveChargeGroups)
 +        {
 +            /* Make an atom to charge group index */
 +            a2cg                 = atom2cg(&top->cgs);
 +            vsite->vsite_pbc_loc = get_vsite_pbc(top->idef.iparams,
 +                                                 top->idef.il, NULL, md,
 +                                                 &top->cgs, a2cg);
 +            sfree(a2cg);
 +        }
 +
 +    }
 +
 +    if (vsite->nthreads > 1)
 +    {
 +        if (vsite->bHaveChargeGroups)
 +        {
 +            gmx_fatal(FARGS, "The combination of threading, virtual sites and charge groups is not implemented");
 +        }
 +
++        split_vsites_over_threads(top->idef.il, top->idef.iparams,
++                                  md, !DOMAINDECOMP(cr), vsite);
 +    }
 +}