Merge branch 'release-4-6' into release-5-0
authorMark Abraham <mark.j.abraham@gmail.com>
Thu, 10 Apr 2014 16:40:29 +0000 (18:40 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Thu, 10 Apr 2014 16:43:20 +0000 (18:43 +0200)
Change-Id: Icd1eafab98512328523a3ea788626b3fb415b0a8

1  2 
src/gromacs/gmxana/gmx_tune_pme.c
src/gromacs/gmxlib/nonbonded/nonbonded.c
src/gromacs/mdlib/domdec.c
src/gromacs/mdlib/domdec_setup.c
src/gromacs/mdlib/ns.c

index 3dbda71d479041b6d544af01fcf82be779beb277,0000000000000000000000000000000000000000..e76d32518e287bfa7ab09d4ef908358f4429aa4b
mode 100644,000000..100644
--- /dev/null
@@@ -1,2513 -1,0 +1,2513 @@@
-                       *cmd_mdrun);
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <time.h>
 +#ifdef HAVE_SYS_TIME_H
 +#include <sys/time.h>
 +#endif
 +
 +#include "gromacs/commandline/pargs.h"
 +#include "typedefs.h"
 +#include "types/commrec.h"
 +#include "gromacs/utility/smalloc.h"
 +#include "vec.h"
 +#include "copyrite.h"
 +#include "gromacs/fileio/tpxio.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "readinp.h"
 +#include "calcgrid.h"
 +#include "checkpoint.h"
 +#include "macros.h"
 +#include "gmx_ana.h"
 +#include "names.h"
 +#include "perf_est.h"
 +#include "inputrec.h"
 +#include "gromacs/timing/walltime_accounting.h"
 +#include "gromacs/math/utilities.h"
 +
 +#include "gmx_fatal.h"
 +
 +/* Enum for situations that can occur during log file parsing, the
 + * corresponding string entries can be found in do_the_tests() in
 + * const char* ParseLog[] */
 +enum {
 +    eParselogOK,
 +    eParselogNotFound,
 +    eParselogNoPerfData,
 +    eParselogTerm,
 +    eParselogResetProblem,
 +    eParselogNoDDGrid,
 +    eParselogTPXVersion,
 +    eParselogNotParallel,
 +    eParselogLargePrimeFactor,
 +    eParselogFatal,
 +    eParselogNr
 +};
 +
 +
 +typedef struct
 +{
 +    int     nPMEnodes;    /* number of PME-only nodes used in this test */
 +    int     nx, ny, nz;   /* DD grid */
 +    int     guessPME;     /* if nPMEnodes == -1, this is the guessed number of PME nodes */
 +    double *Gcycles;      /* This can contain more than one value if doing multiple tests */
 +    double  Gcycles_Av;
 +    float  *ns_per_day;
 +    float   ns_per_day_Av;
 +    float  *PME_f_load;     /* PME mesh/force load average*/
 +    float   PME_f_load_Av;  /* Average average ;) ... */
 +    char   *mdrun_cmd_line; /* Mdrun command line used for this test */
 +} t_perf;
 +
 +
 +typedef struct
 +{
 +    int             nr_inputfiles;  /* The number of tpr and mdp input files */
 +    gmx_int64_t     orig_sim_steps; /* Number of steps to be done in the real simulation */
 +    gmx_int64_t     orig_init_step; /* Init step for the real simulation */
 +    real           *rcoulomb;       /* The coulomb radii [0...nr_inputfiles] */
 +    real           *rvdw;           /* The vdW radii */
 +    real           *rlist;          /* Neighbourlist cutoff radius */
 +    real           *rlistlong;
 +    int            *nkx, *nky, *nkz;
 +    real           *fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */
 +} t_inputinfo;
 +
 +
 +static void sep_line(FILE *fp)
 +{
 +    fprintf(fp, "\n------------------------------------------------------------\n");
 +}
 +
 +
 +/* Wrapper for system calls */
 +static int gmx_system_call(char *command)
 +{
 +#ifdef GMX_NO_SYSTEM
 +    gmx_fatal(FARGS, "No calls to system(3) supported on this platform. Attempted to call:\n'%s'\n", command);
 +#else
 +    return ( system(command) );
 +#endif
 +}
 +
 +
 +/* Check if string starts with substring */
 +static gmx_bool str_starts(const char *string, const char *substring)
 +{
 +    return ( strncmp(string, substring, strlen(substring)) == 0);
 +}
 +
 +
 +static void cleandata(t_perf *perfdata, int test_nr)
 +{
 +    perfdata->Gcycles[test_nr]    = 0.0;
 +    perfdata->ns_per_day[test_nr] = 0.0;
 +    perfdata->PME_f_load[test_nr] = 0.0;
 +
 +    return;
 +}
 +
 +
 +static gmx_bool is_equal(real a, real b)
 +{
 +    real diff, eps = 1.0e-7;
 +
 +
 +    diff = a - b;
 +
 +    if (diff < 0.0)
 +    {
 +        diff = -diff;
 +    }
 +
 +    if (diff < eps)
 +    {
 +        return TRUE;
 +    }
 +    else
 +    {
 +        return FALSE;
 +    }
 +}
 +
 +
 +static void remove_if_exists(const char *fn)
 +{
 +    if (gmx_fexist(fn))
 +    {
 +        fprintf(stdout, "Deleting %s\n", fn);
 +        remove(fn);
 +    }
 +}
 +
 +
 +static void finalize(const char *fn_out)
 +{
 +    char  buf[STRLEN];
 +    FILE *fp;
 +
 +
 +    fp = fopen(fn_out, "r");
 +    fprintf(stdout, "\n\n");
 +
 +    while (fgets(buf, STRLEN-1, fp) != NULL)
 +    {
 +        fprintf(stdout, "%s", buf);
 +    }
 +    fclose(fp);
 +    fprintf(stdout, "\n\n");
 +}
 +
 +
 +enum {
 +    eFoundNothing, eFoundDDStr, eFoundAccountingStr, eFoundCycleStr
 +};
 +
 +static int parse_logfile(const char *logfile, const char *errfile,
 +                         t_perf *perfdata, int test_nr, int presteps, gmx_int64_t cpt_steps,
 +                         int nnodes)
 +{
 +    FILE           *fp;
 +    char            line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
 +    const char      matchstrdd[]  = "Domain decomposition grid";
 +    const char      matchstrcr[]  = "resetting all time and cycle counters";
 +    const char      matchstrbal[] = "Average PME mesh/force load:";
 +    const char      matchstring[] = "R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G";
 +    const char      errSIG[]      = "signal, stopping at the next";
 +    int             iFound;
 +    float           dum1, dum2, dum3, dum4;
 +    int             ndum;
 +    int             npme;
 +    gmx_int64_t     resetsteps     = -1;
 +    gmx_bool        bFoundResetStr = FALSE;
 +    gmx_bool        bResetChecked  = FALSE;
 +
 +
 +    if (!gmx_fexist(logfile))
 +    {
 +        fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
 +        cleandata(perfdata, test_nr);
 +        return eParselogNotFound;
 +    }
 +
 +    fp = fopen(logfile, "r");
 +    perfdata->PME_f_load[test_nr] = -1.0;
 +    perfdata->guessPME            = -1;
 +
 +    iFound = eFoundNothing;
 +    if (1 == nnodes)
 +    {
 +        iFound = eFoundDDStr; /* Skip some case statements */
 +    }
 +
 +    while (fgets(line, STRLEN, fp) != NULL)
 +    {
 +        /* Remove leading spaces */
 +        ltrim(line);
 +
 +        /* Check for TERM and INT signals from user: */
 +        if (strstr(line, errSIG) != NULL)
 +        {
 +            fclose(fp);
 +            cleandata(perfdata, test_nr);
 +            return eParselogTerm;
 +        }
 +
 +        /* Check whether cycle resetting  worked */
 +        if (presteps > 0 && !bFoundResetStr)
 +        {
 +            if (strstr(line, matchstrcr) != NULL)
 +            {
 +                sprintf(dumstring, "step %s", "%"GMX_SCNd64);
 +                sscanf(line, dumstring, &resetsteps);
 +                bFoundResetStr = TRUE;
 +                if (resetsteps == presteps+cpt_steps)
 +                {
 +                    bResetChecked = TRUE;
 +                }
 +                else
 +                {
 +                    sprintf(dumstring, "%"GMX_PRId64, resetsteps);
 +                    sprintf(dumstring2, "%"GMX_PRId64, presteps+cpt_steps);
 +                    fprintf(stderr, "WARNING: Time step counters were reset at step %s,\n"
 +                            "         though they were supposed to be reset at step %s!\n",
 +                            dumstring, dumstring2);
 +                }
 +            }
 +        }
 +
 +        /* Look for strings that appear in a certain order in the log file: */
 +        switch (iFound)
 +        {
 +            case eFoundNothing:
 +                /* Look for domain decomp grid and separate PME nodes: */
 +                if (str_starts(line, matchstrdd))
 +                {
 +                    sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME nodes %d",
 +                           &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme);
 +                    if (perfdata->nPMEnodes == -1)
 +                    {
 +                        perfdata->guessPME = npme;
 +                    }
 +                    else if (perfdata->nPMEnodes != npme)
 +                    {
 +                        gmx_fatal(FARGS, "PME nodes from command line and output file are not identical");
 +                    }
 +                    iFound = eFoundDDStr;
 +                }
 +                /* Catch a few errors that might have occured: */
 +                else if (str_starts(line, "There is no domain decomposition for"))
 +                {
 +                    fclose(fp);
 +                    return eParselogNoDDGrid;
 +                }
 +                else if (str_starts(line, "The number of nodes you selected"))
 +                {
 +                    fclose(fp);
 +                    return eParselogLargePrimeFactor;
 +                }
 +                else if (str_starts(line, "reading tpx file"))
 +                {
 +                    fclose(fp);
 +                    return eParselogTPXVersion;
 +                }
 +                else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
 +                {
 +                    fclose(fp);
 +                    return eParselogNotParallel;
 +                }
 +                break;
 +            case eFoundDDStr:
 +                /* Look for PME mesh/force balance (not necessarily present, though) */
 +                if (str_starts(line, matchstrbal))
 +                {
 +                    sscanf(&line[strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
 +                }
 +                /* Look for matchstring */
 +                if (str_starts(line, matchstring))
 +                {
 +                    iFound = eFoundAccountingStr;
 +                }
 +                break;
 +            case eFoundAccountingStr:
 +                /* Already found matchstring - look for cycle data */
 +                if (str_starts(line, "Total  "))
 +                {
 +                    sscanf(line, "Total %*f %lf", &(perfdata->Gcycles[test_nr]));
 +                    iFound = eFoundCycleStr;
 +                }
 +                break;
 +            case eFoundCycleStr:
 +                /* Already found cycle data - look for remaining performance info and return */
 +                if (str_starts(line, "Performance:"))
 +                {
 +                    ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4);
 +                    /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
 +                    perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1;
 +                    fclose(fp);
 +                    if (bResetChecked || presteps == 0)
 +                    {
 +                        return eParselogOK;
 +                    }
 +                    else
 +                    {
 +                        return eParselogResetProblem;
 +                    }
 +                }
 +                break;
 +        }
 +    } /* while */
 +
 +    /* Close the log file */
 +    fclose(fp);
 +
 +    /* Check why there is no performance data in the log file.
 +     * Did a fatal errors occur? */
 +    if (gmx_fexist(errfile))
 +    {
 +        fp = fopen(errfile, "r");
 +        while (fgets(line, STRLEN, fp) != NULL)
 +        {
 +            if (str_starts(line, "Fatal error:") )
 +            {
 +                if (fgets(line, STRLEN, fp) != NULL)
 +                {
 +                    fprintf(stderr, "\nWARNING: An error occured during this benchmark:\n"
 +                            "%s\n", line);
 +                }
 +                fclose(fp);
 +                cleandata(perfdata, test_nr);
 +                return eParselogFatal;
 +            }
 +        }
 +        fclose(fp);
 +    }
 +    else
 +    {
 +        fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
 +    }
 +
 +    /* Giving up ... we could not find out why there is no performance data in
 +     * the log file. */
 +    fprintf(stdout, "No performance data in log file.\n");
 +    cleandata(perfdata, test_nr);
 +
 +    return eParselogNoPerfData;
 +}
 +
 +
 +static gmx_bool analyze_data(
 +        FILE         *fp,
 +        const char   *fn,
 +        t_perf      **perfdata,
 +        int           nnodes,
 +        int           ntprs,
 +        int           ntests,
 +        int           nrepeats,
 +        t_inputinfo  *info,
 +        int          *index_tpr,    /* OUT: Nr of mdp file with best settings */
 +        int          *npme_optimal) /* OUT: Optimal number of PME nodes */
 +{
 +    int      i, j, k;
 +    int      line  = 0, line_win = -1;
 +    int      k_win = -1, i_win = -1, winPME;
 +    double   s     = 0.0; /* standard deviation */
 +    t_perf  *pd;
 +    char     strbuf[STRLEN];
 +    char     str_PME_f_load[13];
 +    gmx_bool bCanUseOrigTPR;
 +    gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid;
 +
 +
 +    if (nrepeats > 1)
 +    {
 +        sep_line(fp);
 +        fprintf(fp, "Summary of successful runs:\n");
 +        fprintf(fp, "Line tpr PME nodes  Gcycles Av.     Std.dev.       ns/day        PME/f");
 +        if (nnodes > 1)
 +        {
 +            fprintf(fp, "    DD grid");
 +        }
 +        fprintf(fp, "\n");
 +    }
 +
 +
 +    for (k = 0; k < ntprs; k++)
 +    {
 +        for (i = 0; i < ntests; i++)
 +        {
 +            /* Select the right dataset: */
 +            pd = &(perfdata[k][i]);
 +
 +            pd->Gcycles_Av    = 0.0;
 +            pd->PME_f_load_Av = 0.0;
 +            pd->ns_per_day_Av = 0.0;
 +
 +            if (pd->nPMEnodes == -1)
 +            {
 +                sprintf(strbuf, "(%3d)", pd->guessPME);
 +            }
 +            else
 +            {
 +                sprintf(strbuf, "     ");
 +            }
 +
 +            /* Get the average run time of a setting */
 +            for (j = 0; j < nrepeats; j++)
 +            {
 +                pd->Gcycles_Av    += pd->Gcycles[j];
 +                pd->PME_f_load_Av += pd->PME_f_load[j];
 +            }
 +            pd->Gcycles_Av    /= nrepeats;
 +            pd->PME_f_load_Av /= nrepeats;
 +
 +            for (j = 0; j < nrepeats; j++)
 +            {
 +                if (pd->ns_per_day[j] > 0.0)
 +                {
 +                    pd->ns_per_day_Av += pd->ns_per_day[j];
 +                }
 +                else
 +                {
 +                    /* Somehow the performance number was not aquired for this run,
 +                     * therefor set the average to some negative value: */
 +                    pd->ns_per_day_Av = -1.0f*nrepeats;
 +                    break;
 +                }
 +            }
 +            pd->ns_per_day_Av /= nrepeats;
 +
 +            /* Nicer output: */
 +            if (pd->PME_f_load_Av > 0.0)
 +            {
 +                sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
 +            }
 +            else
 +            {
 +                sprintf(str_PME_f_load, "%s", "         -  ");
 +            }
 +
 +
 +            /* We assume we had a successful run if both averages are positive */
 +            if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
 +            {
 +                /* Output statistics if repeats were done */
 +                if (nrepeats > 1)
 +                {
 +                    /* Calculate the standard deviation */
 +                    s = 0.0;
 +                    for (j = 0; j < nrepeats; j++)
 +                    {
 +                        s += pow( pd->Gcycles[j] - pd->Gcycles_Av, 2 );
 +                    }
 +                    s /= (nrepeats - 1);
 +                    s  = sqrt(s);
 +
 +                    fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s",
 +                            line, k, pd->nPMEnodes, strbuf, pd->Gcycles_Av, s,
 +                            pd->ns_per_day_Av, str_PME_f_load);
 +                    if (nnodes > 1)
 +                    {
 +                        fprintf(fp, "  %3d %3d %3d", pd->nx, pd->ny, pd->nz);
 +                    }
 +                    fprintf(fp, "\n");
 +                }
 +                /* Store the index of the best run found so far in 'winner': */
 +                if ( (k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av) )
 +                {
 +                    k_win    = k;
 +                    i_win    = i;
 +                    line_win = line;
 +                }
 +                line++;
 +            }
 +        }
 +    }
 +
 +    if (k_win == -1)
 +    {
 +        gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
 +    }
 +
 +    sep_line(fp);
 +
 +    winPME = perfdata[k_win][i_win].nPMEnodes;
 +
 +    if (1 == ntests)
 +    {
 +        /* We stuck to a fixed number of PME-only nodes */
 +        sprintf(strbuf, "settings No. %d", k_win);
 +    }
 +    else
 +    {
 +        /* We have optimized the number of PME-only nodes */
 +        if (winPME == -1)
 +        {
 +            sprintf(strbuf, "%s", "the automatic number of PME nodes");
 +        }
 +        else
 +        {
 +            sprintf(strbuf, "%d PME nodes", winPME);
 +        }
 +    }
 +    fprintf(fp, "Best performance was achieved with %s", strbuf);
 +    if ((nrepeats > 1) && (ntests > 1))
 +    {
 +        fprintf(fp, " (see line %d)", line_win);
 +    }
 +    fprintf(fp, "\n");
 +
 +    /* Only mention settings if they were modified: */
 +    bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]);
 +    bRefinedVdW  = !is_equal(info->rvdw[k_win], info->rvdw[0]    );
 +    bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] &&
 +                     info->nky[k_win] == info->nky[0] &&
 +                     info->nkz[k_win] == info->nkz[0]);
 +
 +    if (bRefinedCoul || bRefinedVdW || bRefinedGrid)
 +    {
 +        fprintf(fp, "Optimized PME settings:\n");
 +        bCanUseOrigTPR = FALSE;
 +    }
 +    else
 +    {
 +        bCanUseOrigTPR = TRUE;
 +    }
 +
 +    if (bRefinedCoul)
 +    {
 +        fprintf(fp, "   New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win], info->rcoulomb[0]);
 +    }
 +
 +    if (bRefinedVdW)
 +    {
 +        fprintf(fp, "   New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]);
 +    }
 +
 +    if (bRefinedGrid)
 +    {
 +        fprintf(fp, "   New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info->nkx[k_win], info->nky[k_win], info->nkz[k_win],
 +                info->nkx[0], info->nky[0], info->nkz[0]);
 +    }
 +
 +    if (bCanUseOrigTPR && ntprs > 1)
 +    {
 +        fprintf(fp, "and original PME settings.\n");
 +    }
 +
 +    fflush(fp);
 +
 +    /* Return the index of the mdp file that showed the highest performance
 +     * and the optimal number of PME nodes */
 +    *index_tpr    = k_win;
 +    *npme_optimal = winPME;
 +
 +    return bCanUseOrigTPR;
 +}
 +
 +
 +/* Get the commands we need to set up the runs from environment variables */
 +static void get_program_paths(gmx_bool bThreads, char *cmd_mpirun[], char *cmd_mdrun[])
 +{
 +    char      *cp;
 +    FILE      *fp;
 +    const char def_mpirun[]   = "mpirun";
 +    const char def_mdrun[]    = "mdrun";
 +
 +    const char empty_mpirun[] = "";
 +
 +    /* Get the commands we need to set up the runs from environment variables */
 +    if (!bThreads)
 +    {
 +        if ( (cp = getenv("MPIRUN")) != NULL)
 +        {
 +            *cmd_mpirun = strdup(cp);
 +        }
 +        else
 +        {
 +            *cmd_mpirun = strdup(def_mpirun);
 +        }
 +    }
 +    else
 +    {
 +        *cmd_mpirun = strdup(empty_mpirun);
 +    }
 +
 +    if ( (cp = getenv("MDRUN" )) != NULL)
 +    {
 +        *cmd_mdrun  = strdup(cp);
 +    }
 +    else
 +    {
 +        *cmd_mdrun  = strdup(def_mdrun);
 +    }
 +}
 +
 +/* Check that the commands will run mdrun (perhaps via mpirun) by
 + * running a very quick test simulation. Requires MPI environment to
 + * be available if applicable. */
 +static void check_mdrun_works(gmx_bool    bThreads,
 +                              const char *cmd_mpirun,
 +                              const char *cmd_np,
 +                              const char *cmd_mdrun)
 +{
 +    char      *command = NULL;
 +    char      *cp;
 +    char       line[STRLEN];
 +    FILE      *fp;
 +    const char filename[]     = "benchtest.log";
 +
 +    /* This string should always be identical to the one in copyrite.c,
 +     * gmx_print_version_info() in the defined(GMX_MPI) section */
 +    const char match_mpi[]    = "MPI library:        MPI";
 +    const char match_mdrun[]  = "Executable: ";
 +    gmx_bool   bMdrun         = FALSE;
 +    gmx_bool   bMPI           = FALSE;
 +
 +    /* Run a small test to see whether mpirun + mdrun work  */
 +    fprintf(stdout, "Making sure that mdrun can be executed. ");
 +    if (bThreads)
 +    {
 +        snew(command, strlen(cmd_mdrun) + strlen(cmd_np) + strlen(filename) + 50);
 +        sprintf(command, "%s%s-version -maxh 0.001 1> %s 2>&1", cmd_mdrun, cmd_np, filename);
 +    }
 +    else
 +    {
 +        snew(command, strlen(cmd_mpirun) + strlen(cmd_np) + strlen(cmd_mdrun) + strlen(filename) + 50);
 +        sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mpirun, cmd_np, cmd_mdrun, filename);
 +    }
 +    fprintf(stdout, "Trying '%s' ... ", command);
 +    make_backup(filename);
 +    gmx_system_call(command);
 +
 +    /* Check if we find the characteristic string in the output: */
 +    if (!gmx_fexist(filename))
 +    {
 +        gmx_fatal(FARGS, "Output from test run could not be found.");
 +    }
 +
 +    fp = fopen(filename, "r");
 +    /* We need to scan the whole output file, since sometimes the queuing system
 +     * also writes stuff to stdout/err */
 +    while (!feof(fp) )
 +    {
 +        cp = fgets(line, STRLEN, fp);
 +        if (cp != NULL)
 +        {
 +            if (str_starts(line, match_mdrun) )
 +            {
 +                bMdrun = TRUE;
 +            }
 +            if (str_starts(line, match_mpi) )
 +            {
 +                bMPI = TRUE;
 +            }
 +        }
 +    }
 +    fclose(fp);
 +
 +    if (bThreads)
 +    {
 +        if (bMPI)
 +        {
 +            gmx_fatal(FARGS, "Need a threaded version of mdrun. This one\n"
 +                      "(%s)\n"
 +                      "seems to have been compiled with MPI instead.",
-                       *cmd_mdrun);
++                      cmd_mdrun);
 +        }
 +    }
 +    else
 +    {
 +        if (bMdrun && !bMPI)
 +        {
 +            gmx_fatal(FARGS, "Need an MPI-enabled version of mdrun. This one\n"
 +                      "(%s)\n"
 +                      "seems to have been compiled without MPI support.",
++                      cmd_mdrun);
 +        }
 +    }
 +
 +    if (!bMdrun)
 +    {
 +        gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!",
 +                  filename);
 +    }
 +
 +    fprintf(stdout, "passed.\n");
 +
 +    /* Clean up ... */
 +    remove(filename);
 +    sfree(command);
 +}
 +
 +
 +static void launch_simulation(
 +        gmx_bool    bLaunch,        /* Should the simulation be launched? */
 +        FILE       *fp,             /* General log file */
 +        gmx_bool    bThreads,       /* whether to use threads */
 +        char       *cmd_mpirun,     /* Command for mpirun */
 +        char       *cmd_np,         /* Switch for -np or -ntmpi or empty */
 +        char       *cmd_mdrun,      /* Command for mdrun */
 +        char       *args_for_mdrun, /* Arguments for mdrun */
 +        const char *simulation_tpr, /* This tpr will be simulated */
 +        int         nPMEnodes)      /* Number of PME nodes to use */
 +{
 +    char  *command;
 +
 +
 +    /* Make enough space for the system call command,
 +     * (100 extra chars for -npme ... etc. options should suffice): */
 +    snew(command, strlen(cmd_mpirun)+strlen(cmd_mdrun)+strlen(cmd_np)+strlen(args_for_mdrun)+strlen(simulation_tpr)+100);
 +
 +    /* Note that the -passall options requires args_for_mdrun to be at the end
 +     * of the command line string */
 +    if (bThreads)
 +    {
 +        sprintf(command, "%s%s-npme %d -s %s %s",
 +                cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr, args_for_mdrun);
 +    }
 +    else
 +    {
 +        sprintf(command, "%s%s%s -npme %d -s %s %s",
 +                cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes, simulation_tpr, args_for_mdrun);
 +    }
 +
 +    fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch ? "Using" : "Please use", command);
 +    sep_line(fp);
 +    fflush(fp);
 +
 +    /* Now the real thing! */
 +    if (bLaunch)
 +    {
 +        fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
 +        sep_line(stdout);
 +        fflush(stdout);
 +        gmx_system_call(command);
 +    }
 +}
 +
 +
 +static void modify_PMEsettings(
 +        gmx_int64_t     simsteps,    /* Set this value as number of time steps */
 +        gmx_int64_t     init_step,   /* Set this value as init_step */
 +        const char     *fn_best_tpr, /* tpr file with the best performance */
 +        const char     *fn_sim_tpr)  /* name of tpr file to be launched */
 +{
 +    t_inputrec   *ir;
 +    t_state       state;
 +    gmx_mtop_t    mtop;
 +    char          buf[200];
 +
 +    snew(ir, 1);
 +    read_tpx_state(fn_best_tpr, ir, &state, NULL, &mtop);
 +
 +    /* Reset nsteps and init_step to the value of the input .tpr file */
 +    ir->nsteps    = simsteps;
 +    ir->init_step = init_step;
 +
 +    /* Write the tpr file which will be launched */
 +    sprintf(buf, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr, "%"GMX_PRId64);
 +    fprintf(stdout, buf, ir->nsteps);
 +    fflush(stdout);
 +    write_tpx_state(fn_sim_tpr, ir, &state, &mtop);
 +
 +    sfree(ir);
 +}
 +
 +
 +#define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
 +
 +/* Make additional TPR files with more computational load for the
 + * direct space processors: */
 +static void make_benchmark_tprs(
 +        const char     *fn_sim_tpr,      /* READ : User-provided tpr file                 */
 +        char           *fn_bench_tprs[], /* WRITE: Names of benchmark tpr files           */
 +        gmx_int64_t     benchsteps,      /* Number of time steps for benchmark runs       */
 +        gmx_int64_t     statesteps,      /* Step counter in checkpoint file               */
 +        real            rmin,            /* Minimal Coulomb radius                        */
 +        real            rmax,            /* Maximal Coulomb radius                        */
 +        real            bScaleRvdw,      /* Scale rvdw along with rcoulomb                */
 +        int            *ntprs,           /* No. of TPRs to write, each with a different
 +                                            rcoulomb and fourierspacing                   */
 +        t_inputinfo    *info,            /* Contains information about mdp file options   */
 +        FILE           *fp)              /* Write the output here                         */
 +{
 +    int           i, j, d;
 +    t_inputrec   *ir;
 +    t_state       state;
 +    gmx_mtop_t    mtop;
 +    real          nlist_buffer;     /* Thickness of the buffer regions for PME-switch potentials */
 +    char          buf[200];
 +    rvec          box_size;
 +    gmx_bool      bNote = FALSE;
 +    real          add;              /* Add this to rcoul for the next test    */
 +    real          fac = 1.0;        /* Scaling factor for Coulomb radius      */
 +    real          fourierspacing;   /* Basic fourierspacing from tpr          */
 +
 +
 +    sprintf(buf, "Making benchmark tpr file%s with %s time step%s",
 +            *ntprs > 1 ? "s" : "", "%"GMX_PRId64, benchsteps > 1 ? "s" : "");
 +    fprintf(stdout, buf, benchsteps);
 +    if (statesteps > 0)
 +    {
 +        sprintf(buf, " (adding %s steps from checkpoint file)", "%"GMX_PRId64);
 +        fprintf(stdout, buf, statesteps);
 +        benchsteps += statesteps;
 +    }
 +    fprintf(stdout, ".\n");
 +
 +
 +    snew(ir, 1);
 +    read_tpx_state(fn_sim_tpr, ir, &state, NULL, &mtop);
 +
 +    /* Check if some kind of PME was chosen */
 +    if (EEL_PME(ir->coulombtype) == FALSE)
 +    {
 +        gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.",
 +                  EELTYPE(eelPME));
 +    }
 +
 +    /* Check if rcoulomb == rlist, which is necessary for plain PME. */
 +    if (  (ir->cutoff_scheme != ecutsVERLET) &&
 +          (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist))
 +    {
 +        gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).",
 +                  EELTYPE(eelPME), ir->rcoulomb, ir->rlist);
 +    }
 +    /* For other PME types, rcoulomb is allowed to be smaller than rlist */
 +    else if (ir->rcoulomb > ir->rlist)
 +    {
 +        gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
 +                  EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist);
 +    }
 +
 +    if (bScaleRvdw && ir->rvdw != ir->rcoulomb)
 +    {
 +        fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
 +        bScaleRvdw = FALSE;
 +    }
 +
 +    /* Reduce the number of steps for the benchmarks */
 +    info->orig_sim_steps = ir->nsteps;
 +    ir->nsteps           = benchsteps;
 +    /* We must not use init_step from the input tpr file for the benchmarks */
 +    info->orig_init_step = ir->init_step;
 +    ir->init_step        = 0;
 +
 +    /* For PME-switch potentials, keep the radial distance of the buffer region */
 +    nlist_buffer   = ir->rlist - ir->rcoulomb;
 +
 +    /* Determine length of triclinic box vectors */
 +    for (d = 0; d < DIM; d++)
 +    {
 +        box_size[d] = 0;
 +        for (i = 0; i < DIM; i++)
 +        {
 +            box_size[d] += state.box[d][i]*state.box[d][i];
 +        }
 +        box_size[d] = sqrt(box_size[d]);
 +    }
 +
 +    if (ir->fourier_spacing > 0)
 +    {
 +        info->fsx[0] = ir->fourier_spacing;
 +        info->fsy[0] = ir->fourier_spacing;
 +        info->fsz[0] = ir->fourier_spacing;
 +    }
 +    else
 +    {
 +        /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
 +        info->fsx[0] = box_size[XX]/ir->nkx;
 +        info->fsy[0] = box_size[YY]/ir->nky;
 +        info->fsz[0] = box_size[ZZ]/ir->nkz;
 +    }
 +
 +    /* If no value for the fourierspacing was provided on the command line, we
 +     * use the reconstruction from the tpr file */
 +    if (ir->fourier_spacing > 0)
 +    {
 +        /* Use the spacing from the tpr */
 +        fourierspacing = ir->fourier_spacing;
 +    }
 +    else
 +    {
 +        /* Use the maximum observed spacing */
 +        fourierspacing = max(max(info->fsx[0], info->fsy[0]), info->fsz[0]);
 +    }
 +
 +    fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing);
 +
 +    /* For performance comparisons the number of particles is useful to have */
 +    fprintf(fp, "   Number of particles  : %d\n", mtop.natoms);
 +
 +    /* Print information about settings of which some are potentially modified: */
 +    fprintf(fp, "   Coulomb type         : %s\n", EELTYPE(ir->coulombtype));
 +    fprintf(fp, "   Grid spacing x y z   : %f %f %f\n",
 +            box_size[XX]/ir->nkx, box_size[YY]/ir->nky, box_size[ZZ]/ir->nkz);
 +    fprintf(fp, "   Van der Waals type   : %s\n", EVDWTYPE(ir->vdwtype));
 +    if (ir_vdw_switched(ir))
 +    {
 +        fprintf(fp, "   rvdw_switch          : %f nm\n", ir->rvdw_switch);
 +    }
 +    if (EPME_SWITCHED(ir->coulombtype))
 +    {
 +        fprintf(fp, "   rlist                : %f nm\n", ir->rlist);
 +    }
 +    if (ir->rlistlong != max_cutoff(ir->rvdw, ir->rcoulomb))
 +    {
 +        fprintf(fp, "   rlistlong            : %f nm\n", ir->rlistlong);
 +    }
 +
 +    /* Print a descriptive line about the tpr settings tested */
 +    fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
 +    fprintf(fp, " No.   scaling  rcoulomb");
 +    fprintf(fp, "  nkx  nky  nkz");
 +    fprintf(fp, "   spacing");
 +    if (evdwCUT == ir->vdwtype)
 +    {
 +        fprintf(fp, "      rvdw");
 +    }
 +    if (EPME_SWITCHED(ir->coulombtype))
 +    {
 +        fprintf(fp, "     rlist");
 +    }
 +    if (ir->rlistlong != max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb)) )
 +    {
 +        fprintf(fp, " rlistlong");
 +    }
 +    fprintf(fp, "  tpr file\n");
 +
 +    /* Loop to create the requested number of tpr input files */
 +    for (j = 0; j < *ntprs; j++)
 +    {
 +        /* The first .tpr is the provided one, just need to modify nsteps,
 +         * so skip the following block */
 +        if (j != 0)
 +        {
 +            /* Determine which Coulomb radii rc to use in the benchmarks */
 +            add = (rmax-rmin)/(*ntprs-1);
 +            if (is_equal(rmin, info->rcoulomb[0]))
 +            {
 +                ir->rcoulomb = rmin + j*add;
 +            }
 +            else if (is_equal(rmax, info->rcoulomb[0]))
 +            {
 +                ir->rcoulomb = rmin + (j-1)*add;
 +            }
 +            else
 +            {
 +                /* rmin != rcoul != rmax, ergo test between rmin and rmax */
 +                add          = (rmax-rmin)/(*ntprs-2);
 +                ir->rcoulomb = rmin + (j-1)*add;
 +            }
 +
 +            /* Determine the scaling factor fac */
 +            fac = ir->rcoulomb/info->rcoulomb[0];
 +
 +            /* Scale the Fourier grid spacing */
 +            ir->nkx = ir->nky = ir->nkz = 0;
 +            calc_grid(NULL, state.box, fourierspacing*fac, &ir->nkx, &ir->nky, &ir->nkz);
 +
 +            /* Adjust other radii since various conditions need to be fulfilled */
 +            if (eelPME == ir->coulombtype)
 +            {
 +                /* plain PME, rcoulomb must be equal to rlist */
 +                ir->rlist = ir->rcoulomb;
 +            }
 +            else
 +            {
 +                /* rlist must be >= rcoulomb, we keep the size of the buffer region */
 +                ir->rlist = ir->rcoulomb + nlist_buffer;
 +            }
 +
 +            if (bScaleRvdw && evdwCUT == ir->vdwtype)
 +            {
 +                if (ecutsVERLET == ir->cutoff_scheme)
 +                {
 +                    /* With Verlet, the van der Waals radius must always equal the Coulomb radius */
 +                    ir->rvdw = ir->rcoulomb;
 +                }
 +                else
 +                {
 +                    /* For vdw cutoff, rvdw >= rlist */
 +                    ir->rvdw = max(info->rvdw[0], ir->rlist);
 +                }
 +            }
 +
 +            ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
 +
 +        } /* end of "if (j != 0)" */
 +
 +        /* for j==0: Save the original settings
 +         * for j >0: Save modified radii and Fourier grids */
 +        info->rcoulomb[j]  = ir->rcoulomb;
 +        info->rvdw[j]      = ir->rvdw;
 +        info->nkx[j]       = ir->nkx;
 +        info->nky[j]       = ir->nky;
 +        info->nkz[j]       = ir->nkz;
 +        info->rlist[j]     = ir->rlist;
 +        info->rlistlong[j] = ir->rlistlong;
 +        info->fsx[j]       = fac*fourierspacing;
 +        info->fsy[j]       = fac*fourierspacing;
 +        info->fsz[j]       = fac*fourierspacing;
 +
 +        /* Write the benchmark tpr file */
 +        strncpy(fn_bench_tprs[j], fn_sim_tpr, strlen(fn_sim_tpr)-strlen(".tpr"));
 +        sprintf(buf, "_bench%.2d.tpr", j);
 +        strcat(fn_bench_tprs[j], buf);
 +        fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
 +        fprintf(stdout, "%"GMX_PRId64, ir->nsteps);
 +        if (j > 0)
 +        {
 +            fprintf(stdout, ", scaling factor %f\n", fac);
 +        }
 +        else
 +        {
 +            fprintf(stdout, ", unmodified settings\n");
 +        }
 +
 +        write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop);
 +
 +        /* Write information about modified tpr settings to log file */
 +        fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
 +        fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
 +        fprintf(fp, " %9f ", info->fsx[j]);
 +        if (evdwCUT == ir->vdwtype)
 +        {
 +            fprintf(fp, "%10f", ir->rvdw);
 +        }
 +        if (EPME_SWITCHED(ir->coulombtype))
 +        {
 +            fprintf(fp, "%10f", ir->rlist);
 +        }
 +        if (info->rlistlong[0] != max_cutoff(info->rlist[0], max_cutoff(info->rvdw[0], info->rcoulomb[0])) )
 +        {
 +            fprintf(fp, "%10f", ir->rlistlong);
 +        }
 +        fprintf(fp, "  %-14s\n", fn_bench_tprs[j]);
 +
 +        /* Make it clear to the user that some additional settings were modified */
 +        if (!is_equal(ir->rvdw, info->rvdw[0])
 +            || !is_equal(ir->rlistlong, info->rlistlong[0]) )
 +        {
 +            bNote = TRUE;
 +        }
 +    }
 +    if (bNote)
 +    {
 +        fprintf(fp, "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
 +                "other input settings were also changed (see table above).\n"
 +                "Please check if the modified settings are appropriate.\n");
 +    }
 +    fflush(stdout);
 +    fflush(fp);
 +    sfree(ir);
 +}
 +
 +
 +/* Rename the files we want to keep to some meaningful filename and
 + * delete the rest */
 +static void cleanup(const t_filenm *fnm, int nfile, int k, int nnodes,
 +                    int nPMEnodes, int nr, gmx_bool bKeepStderr)
 +{
 +    char        numstring[STRLEN];
 +    char        newfilename[STRLEN];
 +    const char *fn = NULL;
 +    int         i;
 +    const char *opt;
 +
 +
 +    fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
 +
 +    for (i = 0; i < nfile; i++)
 +    {
 +        opt = (char *)fnm[i].opt;
 +        if (strcmp(opt, "-p") == 0)
 +        {
 +            /* do nothing; keep this file */
 +            ;
 +        }
 +        else if (strcmp(opt, "-bg") == 0)
 +        {
 +            /* Give the log file a nice name so one can later see which parameters were used */
 +            numstring[0] = '\0';
 +            if (nr > 0)
 +            {
 +                sprintf(numstring, "_%d", nr);
 +            }
 +            sprintf(newfilename, "%s_no%d_np%d_npme%d%s", opt2fn("-bg", nfile, fnm), k, nnodes, nPMEnodes, numstring);
 +            if (gmx_fexist(opt2fn("-bg", nfile, fnm)))
 +            {
 +                fprintf(stdout, "renaming log file to %s\n", newfilename);
 +                make_backup(newfilename);
 +                rename(opt2fn("-bg", nfile, fnm), newfilename);
 +            }
 +        }
 +        else if (strcmp(opt, "-err") == 0)
 +        {
 +            /* This file contains the output of stderr. We want to keep it in
 +             * cases where there have been problems. */
 +            fn           = opt2fn(opt, nfile, fnm);
 +            numstring[0] = '\0';
 +            if (nr > 0)
 +            {
 +                sprintf(numstring, "_%d", nr);
 +            }
 +            sprintf(newfilename, "%s_no%d_np%d_npme%d%s", fn, k, nnodes, nPMEnodes, numstring);
 +            if (gmx_fexist(fn))
 +            {
 +                if (bKeepStderr)
 +                {
 +                    fprintf(stdout, "Saving stderr output in %s\n", newfilename);
 +                    make_backup(newfilename);
 +                    rename(fn, newfilename);
 +                }
 +                else
 +                {
 +                    fprintf(stdout, "Deleting %s\n", fn);
 +                    remove(fn);
 +                }
 +            }
 +        }
 +        /* Delete the files which are created for each benchmark run: (options -b*) */
 +        else if ( (0 == strncmp(opt, "-b", 2)) && (opt2bSet(opt, nfile, fnm) || !is_optional(&fnm[i])) )
 +        {
 +            remove_if_exists(opt2fn(opt, nfile, fnm));
 +        }
 +    }
 +}
 +
 +
 +enum {
 +    eNpmeAuto, eNpmeAll, eNpmeReduced, eNpmeSubset, eNpmeNr
 +};
 +
 +/* Create a list of numbers of PME nodes to test */
 +static void make_npme_list(
 +        const char *npmevalues_opt, /* Make a complete list with all
 +                                     * possibilities or a short list that keeps only
 +                                     * reasonable numbers of PME nodes                  */
 +        int        *nentries,       /* Number of entries we put in the nPMEnodes list   */
 +        int        *nPMEnodes[],    /* Each entry contains the value for -npme          */
 +        int         nnodes,         /* Total number of nodes to do the tests on         */
 +        int         minPMEnodes,    /* Minimum number of PME nodes                      */
 +        int         maxPMEnodes)    /* Maximum number of PME nodes                      */
 +{
 +    int i, npme, npp;
 +    int min_factor = 1;   /* We request that npp and npme have this minimal
 +                           * largest common factor (depends on npp)           */
 +    int nlistmax;         /* Max. list size                                   */
 +    int nlist;            /* Actual number of entries in list                 */
 +    int eNPME = 0;
 +
 +
 +    /* Do we need to check all possible values for -npme or is a reduced list enough? */
 +    if (0 == strcmp(npmevalues_opt, "all") )
 +    {
 +        eNPME = eNpmeAll;
 +    }
 +    else if (0 == strcmp(npmevalues_opt, "subset") )
 +    {
 +        eNPME = eNpmeSubset;
 +    }
 +    else /* "auto" or "range" */
 +    {
 +        if (nnodes <= 64)
 +        {
 +            eNPME = eNpmeAll;
 +        }
 +        else if (nnodes < 128)
 +        {
 +            eNPME = eNpmeReduced;
 +        }
 +        else
 +        {
 +            eNPME = eNpmeSubset;
 +        }
 +    }
 +
 +    /* Calculate how many entries we could possibly have (in case of -npme all) */
 +    if (nnodes > 2)
 +    {
 +        nlistmax = maxPMEnodes - minPMEnodes + 3;
 +        if (0 == minPMEnodes)
 +        {
 +            nlistmax--;
 +        }
 +    }
 +    else
 +    {
 +        nlistmax = 1;
 +    }
 +
 +    /* Now make the actual list which is at most of size nlist */
 +    snew(*nPMEnodes, nlistmax);
 +    nlist = 0; /* start counting again, now the real entries in the list */
 +    for (i = 0; i < nlistmax - 2; i++)
 +    {
 +        npme = maxPMEnodes - i;
 +        npp  = nnodes-npme;
 +        switch (eNPME)
 +        {
 +            case eNpmeAll:
 +                min_factor = 1;
 +                break;
 +            case eNpmeReduced:
 +                min_factor = 2;
 +                break;
 +            case eNpmeSubset:
 +                /* For 2d PME we want a common largest factor of at least the cube
 +                 * root of the number of PP nodes */
 +                min_factor = (int) pow(npp, 1.0/3.0);
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
 +                break;
 +        }
 +        if (gmx_greatest_common_divisor(npp, npme) >= min_factor)
 +        {
 +            (*nPMEnodes)[nlist] = npme;
 +            nlist++;
 +        }
 +    }
 +    /* We always test 0 PME nodes and the automatic number */
 +    *nentries             = nlist + 2;
 +    (*nPMEnodes)[nlist  ] =  0;
 +    (*nPMEnodes)[nlist+1] = -1;
 +
 +    fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
 +    for (i = 0; i < *nentries-1; i++)
 +    {
 +        fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
 +    }
 +    fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries-1]);
 +}
 +
 +
 +/* Allocate memory to store the performance data */
 +static void init_perfdata(t_perf *perfdata[], int ntprs, int datasets, int repeats)
 +{
 +    int i, j, k;
 +
 +
 +    for (k = 0; k < ntprs; k++)
 +    {
 +        snew(perfdata[k], datasets);
 +        for (i = 0; i < datasets; i++)
 +        {
 +            for (j = 0; j < repeats; j++)
 +            {
 +                snew(perfdata[k][i].Gcycles, repeats);
 +                snew(perfdata[k][i].ns_per_day, repeats);
 +                snew(perfdata[k][i].PME_f_load, repeats);
 +            }
 +        }
 +    }
 +}
 +
 +
 +/* Check for errors on mdrun -h */
 +static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp,
 +                              const t_filenm *fnm, int nfile)
 +{
 +    const char *fn = NULL;
 +    char       *command, *msg;
 +    int         ret;
 +
 +
 +    snew(command, length +  15);
 +    snew(msg, length + 500);
 +
 +    fprintf(stdout, "Making sure the benchmarks can be executed by running just 1 step...\n");
 +    sprintf(command, "%s -nsteps 1 -quiet", mdrun_cmd_line);
 +    fprintf(stdout, "Executing '%s' ...\n", command);
 +    ret = gmx_system_call(command);
 +
 +    if (0 != ret)
 +    {
 +        /* To prevent confusion, do not again issue a gmx_fatal here since we already
 +         * get the error message from mdrun itself */
 +        sprintf(msg,    "Cannot run the benchmark simulations! Please check the error message of\n"
 +                "mdrun for the source of the problem. Did you provide a command line\n"
 +                "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
 +                "\n%s\n\n", command);
 +
 +        fprintf(stderr, "%s", msg);
 +        sep_line(fp);
 +        fprintf(fp, "%s", msg);
 +
 +        exit(ret);
 +    }
 +    fprintf(stdout, "Benchmarks can be executed!\n");
 +
 +    /* Clean up the benchmark output files we just created */
 +    fprintf(stdout, "Cleaning up ...\n");
 +    remove_if_exists(opt2fn("-bc", nfile, fnm));
 +    remove_if_exists(opt2fn("-be", nfile, fnm));
 +    remove_if_exists(opt2fn("-bcpo", nfile, fnm));
 +    remove_if_exists(opt2fn("-bg", nfile, fnm));
 +
 +    sfree(command);
 +    sfree(msg    );
 +}
 +
 +
 +static void do_the_tests(
 +        FILE           *fp,             /* General g_tune_pme output file         */
 +        char          **tpr_names,      /* Filenames of the input files to test   */
 +        int             maxPMEnodes,    /* Max fraction of nodes to use for PME   */
 +        int             minPMEnodes,    /* Min fraction of nodes to use for PME   */
 +        int             npme_fixed,     /* If >= -1, test fixed number of PME
 +                                         * nodes only                             */
 +        const char     *npmevalues_opt, /* Which -npme values should be tested    */
 +        t_perf        **perfdata,       /* Here the performace data is stored     */
 +        int            *pmeentries,     /* Entries in the nPMEnodes list          */
 +        int             repeats,        /* Repeat each test this often            */
 +        int             nnodes,         /* Total number of nodes = nPP + nPME     */
 +        int             nr_tprs,        /* Total number of tpr files to test      */
 +        gmx_bool        bThreads,       /* Threads or MPI?                        */
 +        char           *cmd_mpirun,     /* mpirun command string                  */
 +        char           *cmd_np,         /* "-np", "-n", whatever mpirun needs     */
 +        char           *cmd_mdrun,      /* mdrun command string                   */
 +        char           *cmd_args_bench, /* arguments for mdrun in a string        */
 +        const t_filenm *fnm,            /* List of filenames from command line    */
 +        int             nfile,          /* Number of files specified on the cmdl. */
 +        int             presteps,       /* DLB equilibration steps, is checked    */
 +        gmx_int64_t     cpt_steps)      /* Time step counter in the checkpoint    */
 +{
 +    int      i, nr, k, ret, count = 0, totaltests;
 +    int     *nPMEnodes = NULL;
 +    t_perf  *pd        = NULL;
 +    int      cmdline_length;
 +    char    *command, *cmd_stub;
 +    char     buf[STRLEN];
 +    gmx_bool bResetProblem = FALSE;
 +    gmx_bool bFirst        = TRUE;
 +
 +
 +    /* This string array corresponds to the eParselog enum type at the start
 +     * of this file */
 +    const char* ParseLog[] = {
 +        "OK.",
 +        "Logfile not found!",
 +        "No timings, logfile truncated?",
 +        "Run was terminated.",
 +        "Counters were not reset properly.",
 +        "No DD grid found for these settings.",
 +        "TPX version conflict!",
 +        "mdrun was not started in parallel!",
 +        "Number of PP nodes has a prime factor that is too large.",
 +        "An error occured."
 +    };
 +    char        str_PME_f_load[13];
 +
 +
 +    /* Allocate space for the mdrun command line. 100 extra characters should
 +       be more than enough for the -npme etcetera arguments */
 +    cmdline_length =  strlen(cmd_mpirun)
 +        + strlen(cmd_np)
 +        + strlen(cmd_mdrun)
 +        + strlen(cmd_args_bench)
 +        + strlen(tpr_names[0]) + 100;
 +    snew(command, cmdline_length);
 +    snew(cmd_stub, cmdline_length);
 +
 +    /* Construct the part of the command line that stays the same for all tests: */
 +    if (bThreads)
 +    {
 +        sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
 +    }
 +    else
 +    {
 +        sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
 +    }
 +
 +    /* Create a list of numbers of PME nodes to test */
 +    if (npme_fixed < -1)
 +    {
 +        make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes,
 +                       nnodes, minPMEnodes, maxPMEnodes);
 +    }
 +    else
 +    {
 +        *pmeentries  = 1;
 +        snew(nPMEnodes, 1);
 +        nPMEnodes[0] = npme_fixed;
 +        fprintf(stderr, "Will use a fixed number of %d PME-only nodes.\n", nPMEnodes[0]);
 +    }
 +
 +    if (0 == repeats)
 +    {
 +        fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
 +        gmx_ffclose(fp);
 +        finalize(opt2fn("-p", nfile, fnm));
 +        exit(0);
 +    }
 +
 +    /* Allocate one dataset for each tpr input file: */
 +    init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
 +
 +    /*****************************************/
 +    /* Main loop over all tpr files to test: */
 +    /*****************************************/
 +    totaltests = nr_tprs*(*pmeentries)*repeats;
 +    for (k = 0; k < nr_tprs; k++)
 +    {
 +        fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
 +        fprintf(fp, "PME nodes      Gcycles       ns/day        PME/f    Remark\n");
 +        /* Loop over various numbers of PME nodes: */
 +        for (i = 0; i < *pmeentries; i++)
 +        {
 +            pd = &perfdata[k][i];
 +
 +            /* Loop over the repeats for each scenario: */
 +            for (nr = 0; nr < repeats; nr++)
 +            {
 +                pd->nPMEnodes = nPMEnodes[i];
 +
 +                /* Add -npme and -s to the command line and save it. Note that
 +                 * the -passall (if set) options requires cmd_args_bench to be
 +                 * at the end of the command line string */
 +                snew(pd->mdrun_cmd_line, cmdline_length);
 +                sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s",
 +                        cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
 +
 +                /* To prevent that all benchmarks fail due to a show-stopper argument
 +                 * on the mdrun command line, we make a quick check first */
 +                if (bFirst)
 +                {
 +                    make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp, fnm, nfile);
 +                }
 +                bFirst = FALSE;
 +
 +                /* Do a benchmark simulation: */
 +                if (repeats > 1)
 +                {
 +                    sprintf(buf, ", pass %d/%d", nr+1, repeats);
 +                }
 +                else
 +                {
 +                    buf[0] = '\0';
 +                }
 +                fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
 +                        (100.0*count)/totaltests,
 +                        k+1, nr_tprs, i+1, *pmeentries, buf);
 +                make_backup(opt2fn("-err", nfile, fnm));
 +                sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err", nfile, fnm));
 +                fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
 +                gmx_system_call(command);
 +
 +                /* Collect the performance data from the log file; also check stderr
 +                 * for fatal errors */
 +                ret = parse_logfile(opt2fn("-bg", nfile, fnm), opt2fn("-err", nfile, fnm),
 +                                    pd, nr, presteps, cpt_steps, nnodes);
 +                if ((presteps > 0) && (ret == eParselogResetProblem))
 +                {
 +                    bResetProblem = TRUE;
 +                }
 +
 +                if (-1 == pd->nPMEnodes)
 +                {
 +                    sprintf(buf, "(%3d)", pd->guessPME);
 +                }
 +                else
 +                {
 +                    sprintf(buf, "     ");
 +                }
 +
 +                /* Nicer output */
 +                if (pd->PME_f_load[nr] > 0.0)
 +                {
 +                    sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
 +                }
 +                else
 +                {
 +                    sprintf(str_PME_f_load, "%s", "         -  ");
 +                }
 +
 +                /* Write the data we got to disk */
 +                fprintf(fp, "%4d%s %12.3f %12.3f %s    %s", pd->nPMEnodes,
 +                        buf, pd->Gcycles[nr], pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]);
 +                if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound) )
 +                {
 +                    fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log");
 +                }
 +                fprintf(fp, "\n");
 +                fflush(fp);
 +                count++;
 +
 +                /* Do some cleaning up and delete the files we do not need any more */
 +                cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal);
 +
 +                /* If the first run with this number of processors already failed, do not try again: */
 +                if (pd->Gcycles[0] <= 0.0 && repeats > 1)
 +                {
 +                    fprintf(stdout, "Skipping remaining passes of unsuccessful setting, see log file for details.\n");
 +                    count += repeats-(nr+1);
 +                    break;
 +                }
 +            } /* end of repeats loop */
 +        }     /* end of -npme loop */
 +    }         /* end of tpr file loop */
 +
 +    if (bResetProblem)
 +    {
 +        sep_line(fp);
 +        fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. ");
 +        sep_line(fp);
 +    }
 +    sfree(command);
 +    sfree(cmd_stub);
 +}
 +
 +
 +static void check_input(
 +        int             nnodes,
 +        int             repeats,
 +        int            *ntprs,
 +        real           *rmin,
 +        real            rcoulomb,
 +        real           *rmax,
 +        real            maxPMEfraction,
 +        real            minPMEfraction,
 +        int             npme_fixed,
 +        gmx_int64_t     bench_nsteps,
 +        const t_filenm *fnm,
 +        int             nfile,
 +        int             sim_part,
 +        int             presteps,
 +        int             npargs,
 +        t_pargs        *pa)
 +{
 +    int old;
 +
 +
 +    /* Make sure the input file exists */
 +    if (!gmx_fexist(opt2fn("-s", nfile, fnm)))
 +    {
 +        gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm));
 +    }
 +
 +    /* Make sure that the checkpoint file is not overwritten during benchmarking */
 +    if ( (0 == strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm)) ) && (sim_part > 1) )
 +    {
 +        gmx_fatal(FARGS, "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not be identical.\n"
 +                  "The checkpoint input file must not be overwritten during the benchmarks.\n");
 +    }
 +
 +    /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
 +    if (repeats < 0)
 +    {
 +        gmx_fatal(FARGS, "Number of repeats < 0!");
 +    }
 +
 +    /* Check number of nodes */
 +    if (nnodes < 1)
 +    {
 +        gmx_fatal(FARGS, "Number of nodes/threads must be a positive integer.");
 +    }
 +
 +    /* Automatically choose -ntpr if not set */
 +    if (*ntprs < 1)
 +    {
 +        if (nnodes < 16)
 +        {
 +            *ntprs = 1;
 +        }
 +        else
 +        {
 +            *ntprs = 3;
 +            /* Set a reasonable scaling factor for rcoulomb */
 +            if (*rmax <= 0)
 +            {
 +                *rmax = rcoulomb * 1.2;
 +            }
 +        }
 +        fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s");
 +    }
 +    else
 +    {
 +        if (1 == *ntprs)
 +        {
 +            fprintf(stderr, "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
 +        }
 +    }
 +
 +    /* Make shure that rmin <= rcoulomb <= rmax */
 +    if (*rmin <= 0)
 +    {
 +        *rmin = rcoulomb;
 +    }
 +    if (*rmax <= 0)
 +    {
 +        *rmax = rcoulomb;
 +    }
 +    if (!(*rmin <= *rmax) )
 +    {
 +        gmx_fatal(FARGS, "Please choose the Coulomb radii such that rmin <= rmax.\n"
 +                  "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n", *rmin, *rmax, rcoulomb);
 +    }
 +    /* Add test scenarios if rmin or rmax were set */
 +    if (*ntprs <= 2)
 +    {
 +        if (!is_equal(*rmin, rcoulomb) && (*ntprs == 1) )
 +        {
 +            (*ntprs)++;
 +            fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n",
 +                    *rmin, *ntprs);
 +        }
 +        if (!is_equal(*rmax, rcoulomb) && (*ntprs == 1) )
 +        {
 +            (*ntprs)++;
 +            fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n",
 +                    *rmax, *ntprs);
 +        }
 +    }
 +    old = *ntprs;
 +    /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
 +    if (!is_equal(*rmax, rcoulomb) || !is_equal(*rmin, rcoulomb) )
 +    {
 +        *ntprs = max(*ntprs, 2);
 +    }
 +
 +    /* If both rmin, rmax are set, we need 3 tpr files at minimum */
 +    if (!is_equal(*rmax, rcoulomb) && !is_equal(*rmin, rcoulomb) )
 +    {
 +        *ntprs = max(*ntprs, 3);
 +    }
 +
 +    if (old != *ntprs)
 +    {
 +        fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs);
 +    }
 +
 +    if (*ntprs > 1)
 +    {
 +        if (is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax)) /* We have just a single rc */
 +        {
 +            fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
 +                    "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
 +                    "with correspondingly adjusted PME grid settings\n");
 +            *ntprs = 1;
 +        }
 +    }
 +
 +    /* Check whether max and min fraction are within required values */
 +    if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
 +    {
 +        gmx_fatal(FARGS, "-max must be between 0 and 0.5");
 +    }
 +    if (minPMEfraction > 0.5 || minPMEfraction < 0)
 +    {
 +        gmx_fatal(FARGS, "-min must be between 0 and 0.5");
 +    }
 +    if (maxPMEfraction < minPMEfraction)
 +    {
 +        gmx_fatal(FARGS, "-max must be larger or equal to -min");
 +    }
 +
 +    /* Check whether the number of steps - if it was set - has a reasonable value */
 +    if (bench_nsteps < 0)
 +    {
 +        gmx_fatal(FARGS, "Number of steps must be positive.");
 +    }
 +
 +    if (bench_nsteps > 10000 || bench_nsteps < 100)
 +    {
 +        fprintf(stderr, "WARNING: steps=");
 +        fprintf(stderr, "%"GMX_PRId64, bench_nsteps);
 +        fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n", (bench_nsteps < 100) ? "few" : "many");
 +    }
 +
 +    if (presteps < 0)
 +    {
 +        gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
 +    }
 +
 +    /* Check for rcoulomb scaling if more than one .tpr file is tested */
 +    if (*ntprs > 1)
 +    {
 +        if (*rmin/rcoulomb < 0.75 || *rmax/rcoulomb > 1.25)
 +        {
 +            fprintf(stderr, "WARNING: Applying extreme scaling factor. I hope you know what you are doing.\n");
 +        }
 +    }
 +
 +    /* If a fixed number of PME nodes is set we do rcoulomb and PME gird tuning
 +     * only. We need to check whether the requested number of PME-only nodes
 +     * makes sense. */
 +    if (npme_fixed > -1)
 +    {
 +        /* No more than 50% of all nodes can be assigned as PME-only nodes. */
 +        if (2*npme_fixed > nnodes)
 +        {
 +            gmx_fatal(FARGS, "Cannot have more than %d PME-only nodes for a total of %d nodes (you chose %d).\n",
 +                      nnodes/2, nnodes, npme_fixed);
 +        }
 +        if ((npme_fixed > 0) && (5*npme_fixed < nnodes))
 +        {
 +            fprintf(stderr, "WARNING: Only %g percent of the nodes are assigned as PME-only nodes.\n",
 +                    100.0*((real)npme_fixed / (real)nnodes));
 +        }
 +        if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa))
 +        {
 +            fprintf(stderr, "NOTE: The -min, -max, and -npme options have no effect when a\n"
 +                    "      fixed number of PME-only nodes is requested with -fix.\n");
 +        }
 +    }
 +}
 +
 +
 +/* Returns TRUE when "opt" is needed at launch time */
 +static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
 +{
 +    if (0 == strncmp(opt, "-swap", 5))
 +    {
 +        return bSet;
 +    }
 +
 +    /* Apart from the input .tpr and the output log files we need all options that
 +     * were set on the command line and that do not start with -b */
 +    if    (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2)
 +           || 0 == strncmp(opt, "-err", 4) || 0 == strncmp(opt, "-p", 2) )
 +    {
 +        return FALSE;
 +    }
 +
 +    return bSet;
 +}
 +
 +
 +/* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */
 +static gmx_bool is_bench_file(char *opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput)
 +{
 +    /* Apart from the input .tpr, all files starting with "-b" are for
 +     * _b_enchmark files exclusively */
 +    if (0 == strncmp(opt, "-s", 2))
 +    {
 +        return FALSE;
 +    }
 +
 +    if (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2))
 +    {
 +        if (!bOptional || bSet)
 +        {
 +            return TRUE;
 +        }
 +        else
 +        {
 +            return FALSE;
 +        }
 +    }
 +    else
 +    {
 +        if (bIsOutput)
 +        {
 +            return FALSE;
 +        }
 +        else
 +        {
 +            if (bSet) /* These are additional input files like -cpi -ei */
 +            {
 +                return TRUE;
 +            }
 +            else
 +            {
 +                return FALSE;
 +            }
 +        }
 +    }
 +}
 +
 +
 +/* Adds 'buf' to 'str' */
 +static void add_to_string(char **str, char *buf)
 +{
 +    int len;
 +
 +
 +    len = strlen(*str) + strlen(buf) + 1;
 +    srenew(*str, len);
 +    strcat(*str, buf);
 +}
 +
 +
 +/* Create the command line for the benchmark as well as for the real run */
 +static void create_command_line_snippets(
 +        gmx_bool  bAppendFiles,
 +        gmx_bool  bKeepAndNumCPT,
 +        gmx_bool  bResetHWay,
 +        int       presteps,
 +        int       nfile,
 +        t_filenm  fnm[],
 +        char     *cmd_args_bench[],  /* command line arguments for benchmark runs */
 +        char     *cmd_args_launch[], /* command line arguments for simulation run */
 +        char      extra_args[])      /* Add this to the end of the command line */
 +{
 +    int         i;
 +    char       *opt;
 +    const char *name;
 +    char        strbuf[STRLEN];
 +
 +
 +    /* strlen needs at least '\0' as a string: */
 +    snew(*cmd_args_bench, 1);
 +    snew(*cmd_args_launch, 1);
 +    *cmd_args_launch[0] = '\0';
 +    *cmd_args_bench[0]  = '\0';
 +
 +
 +    /*******************************************/
 +    /* 1. Process other command line arguments */
 +    /*******************************************/
 +    if (presteps > 0)
 +    {
 +        /* Add equilibration steps to benchmark options */
 +        sprintf(strbuf, "-resetstep %d ", presteps);
 +        add_to_string(cmd_args_bench, strbuf);
 +    }
 +    /* These switches take effect only at launch time */
 +    if (FALSE == bAppendFiles)
 +    {
 +        add_to_string(cmd_args_launch, "-noappend ");
 +    }
 +    if (bKeepAndNumCPT)
 +    {
 +        add_to_string(cmd_args_launch, "-cpnum ");
 +    }
 +    if (bResetHWay)
 +    {
 +        add_to_string(cmd_args_launch, "-resethway ");
 +    }
 +
 +    /********************/
 +    /* 2. Process files */
 +    /********************/
 +    for (i = 0; i < nfile; i++)
 +    {
 +        opt  = (char *)fnm[i].opt;
 +        name = opt2fn(opt, nfile, fnm);
 +
 +        /* Strbuf contains the options, now let's sort out where we need that */
 +        sprintf(strbuf, "%s %s ", opt, name);
 +
 +        if (is_bench_file(opt, opt2bSet(opt, nfile, fnm), is_optional(&fnm[i]), is_output(&fnm[i])) )
 +        {
 +            /* All options starting with -b* need the 'b' removed,
 +             * therefore overwrite strbuf */
 +            if (0 == strncmp(opt, "-b", 2))
 +            {
 +                sprintf(strbuf, "-%s %s ", &opt[2], name);
 +            }
 +
 +            add_to_string(cmd_args_bench, strbuf);
 +        }
 +
 +        if (is_launch_file(opt, opt2bSet(opt, nfile, fnm)) )
 +        {
 +            add_to_string(cmd_args_launch, strbuf);
 +        }
 +    }
 +
 +    add_to_string(cmd_args_bench, extra_args);
 +    add_to_string(cmd_args_launch, extra_args);
 +}
 +
 +
 +/* Set option opt */
 +static void setopt(const char *opt, int nfile, t_filenm fnm[])
 +{
 +    int i;
 +
 +    for (i = 0; (i < nfile); i++)
 +    {
 +        if (strcmp(opt, fnm[i].opt) == 0)
 +        {
 +            fnm[i].flag |= ffSET;
 +        }
 +    }
 +}
 +
 +
 +/* This routine inspects the tpr file and ...
 + * 1. checks for output files that get triggered by a tpr option. These output
 + *    files are marked as 'set' to allow for a proper cleanup after each
 + *    tuning run.
 + * 2. returns the PME:PP load ratio
 + * 3. returns rcoulomb from the tpr */
 +static float inspect_tpr(int nfile, t_filenm fnm[], real *rcoulomb)
 +{
 +    gmx_bool     bPull;     /* Is pulling requested in .tpr file?             */
 +    gmx_bool     bTpi;      /* Is test particle insertion requested?          */
 +    gmx_bool     bFree;     /* Is a free energy simulation requested?         */
 +    gmx_bool     bNM;       /* Is a normal mode analysis requested?           */
 +    gmx_bool     bSwap;     /* Is water/ion position swapping requested?      */
 +    t_inputrec   ir;
 +    t_state      state;
 +    gmx_mtop_t   mtop;
 +
 +
 +    /* Check tpr file for options that trigger extra output files */
 +    read_tpx_state(opt2fn("-s", nfile, fnm), &ir, &state, NULL, &mtop);
 +    bPull = (epullNO != ir.ePull);
 +    bFree = (efepNO  != ir.efep );
 +    bNM   = (eiNM    == ir.eI   );
 +    bSwap = (eswapNO != ir.eSwapCoords);
 +    bTpi  = EI_TPI(ir.eI);
 +
 +    /* Set these output files on the tuning command-line */
 +    if (bPull)
 +    {
 +        setopt("-pf", nfile, fnm);
 +        setopt("-px", nfile, fnm);
 +    }
 +    if (bFree)
 +    {
 +        setopt("-dhdl", nfile, fnm);
 +    }
 +    if (bTpi)
 +    {
 +        setopt("-tpi", nfile, fnm);
 +        setopt("-tpid", nfile, fnm);
 +    }
 +    if (bNM)
 +    {
 +        setopt("-mtx", nfile, fnm);
 +    }
 +    if (bSwap)
 +    {
 +        setopt("-swap", nfile, fnm);
 +    }
 +
 +    *rcoulomb = ir.rcoulomb;
 +
 +    /* Return the estimate for the number of PME nodes */
 +    return pme_load_estimate(&mtop, &ir, state.box);
 +}
 +
 +
 +static void couple_files_options(int nfile, t_filenm fnm[])
 +{
 +    int      i;
 +    gmx_bool bSet, bBench;
 +    char    *opt;
 +    char     buf[20];
 +
 +
 +    for (i = 0; i < nfile; i++)
 +    {
 +        opt    = (char *)fnm[i].opt;
 +        bSet   = ((fnm[i].flag & ffSET) != 0);
 +        bBench = (0 == strncmp(opt, "-b", 2));
 +
 +        /* Check optional files */
 +        /* If e.g. -eo is set, then -beo also needs to be set */
 +        if (is_optional(&fnm[i]) && bSet && !bBench)
 +        {
 +            sprintf(buf, "-b%s", &opt[1]);
 +            setopt(buf, nfile, fnm);
 +        }
 +        /* If -beo is set, then -eo also needs to be! */
 +        if (is_optional(&fnm[i]) && bSet && bBench)
 +        {
 +            sprintf(buf, "-%s", &opt[2]);
 +            setopt(buf, nfile, fnm);
 +        }
 +    }
 +}
 +
 +
 +#define BENCHSTEPS (1000)
 +
 +int gmx_tune_pme(int argc, char *argv[])
 +{
 +    const char     *desc[] = {
 +        "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of processors/threads, [THISMODULE] systematically",
 +        "times [gmx-mdrun] with various numbers of PME-only nodes and determines",
 +        "which setting is fastest. It will also test whether performance can",
 +        "be enhanced by shifting load from the reciprocal to the real space",
 +        "part of the Ewald sum. ",
 +        "Simply pass your [TT].tpr[tt] file to [THISMODULE] together with other options",
 +        "for [gmx-mdrun] as needed.[PAR]",
 +        "Which executables are used can be set in the environment variables",
 +        "MPIRUN and MDRUN. If these are not present, 'mpirun' and 'mdrun'",
 +        "will be used as defaults. Note that for certain MPI frameworks you",
 +        "need to provide a machine- or hostfile. This can also be passed",
 +        "via the MPIRUN variable, e.g.[PAR]",
 +        "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
 +        "Please call [THISMODULE] with the normal options you would pass to",
 +        "[gmx-mdrun] and add [TT]-np[tt] for the number of processors to perform the",
 +        "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
 +        "to repeat each test several times to get better statistics. [PAR]",
 +        "[THISMODULE] can test various real space / reciprocal space workloads",
 +        "for you. With [TT]-ntpr[tt] you control how many extra [TT].tpr[tt] files will be",
 +        "written with enlarged cutoffs and smaller Fourier grids respectively.",
 +        "Typically, the first test (number 0) will be with the settings from the input",
 +        "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
 +        "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
 +        "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
 +        "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
 +        "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
 +        "if you just seek the optimal number of PME-only nodes; in that case",
 +        "your input [TT].tpr[tt] file will remain unchanged.[PAR]",
 +        "For the benchmark runs, the default of 1000 time steps should suffice for most",
 +        "MD systems. The dynamic load balancing needs about 100 time steps",
 +        "to adapt to local load imbalances, therefore the time step counters",
 +        "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
 +        "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
 +        "From the 'DD' load imbalance entries in the md.log output file you",
 +        "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
 +        "[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
 +        "After calling [gmx-mdrun] several times, detailed performance information",
 +        "is available in the output file [TT]perf.out[tt].",
 +        "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
 +        "(options [TT]-b[tt]*), these will be automatically deleted after each test.[PAR]",
 +        "If you want the simulation to be started automatically with the",
 +        "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
 +    };
 +
 +    int             nnodes         = 1;
 +    int             repeats        = 2;
 +    int             pmeentries     = 0; /* How many values for -npme do we actually test for each tpr file */
 +    real            maxPMEfraction = 0.50;
 +    real            minPMEfraction = 0.25;
 +    int             maxPMEnodes, minPMEnodes;
 +    float           guessPMEratio;                    /* guessed PME:PP ratio based on the tpr file */
 +    float           guessPMEnodes;
 +    int             npme_fixed     = -2;              /* If >= -1, use only this number
 +                                                       * of PME-only nodes                */
 +    int             ntprs          = 0;
 +    real            rmin           = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */
 +    real            rcoulomb       = -1.0;            /* Coulomb radius as set in .tpr file */
 +    gmx_bool        bScaleRvdw     = TRUE;
 +    gmx_int64_t     bench_nsteps   = BENCHSTEPS;
 +    gmx_int64_t     new_sim_nsteps = -1;  /* -1 indicates: not set by the user */
 +    gmx_int64_t     cpt_steps      = 0;   /* Step counter in .cpt input file   */
 +    int             presteps       = 100; /* Do a full cycle reset after presteps steps */
 +    gmx_bool        bOverwrite     = FALSE, bKeepTPR;
 +    gmx_bool        bLaunch        = FALSE;
 +    char           *ExtraArgs      = NULL;
 +    char          **tpr_names      = NULL;
 +    const char     *simulation_tpr = NULL;
 +    int             best_npme, best_tpr;
 +    int             sim_part = 1; /* For benchmarks with checkpoint files */
 +    char            bbuf[STRLEN];
 +
 +    /* Default program names if nothing else is found */
 +    char         *cmd_mpirun = NULL, *cmd_mdrun = NULL;
 +    char         *cmd_args_bench, *cmd_args_launch;
 +    char         *cmd_np = NULL;
 +
 +    t_perf      **perfdata = NULL;
 +    t_inputinfo  *info;
 +    int           i;
 +    FILE         *fp;
 +    t_commrec    *cr;
 +
 +    /* Print out how long the tuning took */
 +    double          seconds;
 +
 +    static t_filenm fnm[] = {
 +        /* g_tune_pme */
 +        { efOUT, "-p",      "perf",     ffWRITE },
 +        { efLOG, "-err",    "bencherr", ffWRITE },
 +        { efTPX, "-so",     "tuned",    ffWRITE },
 +        /* mdrun: */
 +        { efTPX, NULL,      NULL,       ffREAD },
 +        { efTRN, "-o",      NULL,       ffWRITE },
 +        { efCOMPRESSED, "-x", NULL,     ffOPTWR },
 +        { efCPT, "-cpi",    NULL,       ffOPTRD },
 +        { efCPT, "-cpo",    NULL,       ffOPTWR },
 +        { efSTO, "-c",      "confout",  ffWRITE },
 +        { efEDR, "-e",      "ener",     ffWRITE },
 +        { efLOG, "-g",      "md",       ffWRITE },
 +        { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
 +        { efXVG, "-field",  "field",    ffOPTWR },
 +        { efXVG, "-table",  "table",    ffOPTRD },
 +        { efXVG, "-tabletf", "tabletf",   ffOPTRD },
 +        { efXVG, "-tablep", "tablep",   ffOPTRD },
 +        { efXVG, "-tableb", "table",    ffOPTRD },
 +        { efTRX, "-rerun",  "rerun",    ffOPTRD },
 +        { efXVG, "-tpi",    "tpi",      ffOPTWR },
 +        { efXVG, "-tpid",   "tpidist",  ffOPTWR },
 +        { efEDI, "-ei",     "sam",      ffOPTRD },
 +        { efXVG, "-eo",     "edsam",    ffOPTWR },
 +        { efXVG, "-devout", "deviatie", ffOPTWR },
 +        { efXVG, "-runav",  "runaver",  ffOPTWR },
 +        { efXVG, "-px",     "pullx",    ffOPTWR },
 +        { efXVG, "-pf",     "pullf",    ffOPTWR },
 +        { efXVG, "-ro",     "rotation", ffOPTWR },
 +        { efLOG, "-ra",     "rotangles", ffOPTWR },
 +        { efLOG, "-rs",     "rotslabs", ffOPTWR },
 +        { efLOG, "-rt",     "rottorque", ffOPTWR },
 +        { efMTX, "-mtx",    "nm",       ffOPTWR },
 +        { efNDX, "-dn",     "dipole",   ffOPTWR },
 +        { efXVG, "-swap",   "swapions", ffOPTWR },
 +        /* Output files that are deleted after each benchmark run */
 +        { efTRN, "-bo",     "bench",    ffWRITE },
 +        { efXTC, "-bx",     "bench",    ffWRITE },
 +        { efCPT, "-bcpo",   "bench",    ffWRITE },
 +        { efSTO, "-bc",     "bench",    ffWRITE },
 +        { efEDR, "-be",     "bench",    ffWRITE },
 +        { efLOG, "-bg",     "bench",    ffWRITE },
 +        { efXVG, "-beo",    "benchedo", ffOPTWR },
 +        { efXVG, "-bdhdl",  "benchdhdl", ffOPTWR },
 +        { efXVG, "-bfield", "benchfld", ffOPTWR },
 +        { efXVG, "-btpi",   "benchtpi", ffOPTWR },
 +        { efXVG, "-btpid",  "benchtpid", ffOPTWR },
 +        { efXVG, "-bdevout", "benchdev", ffOPTWR },
 +        { efXVG, "-brunav", "benchrnav", ffOPTWR },
 +        { efXVG, "-bpx",    "benchpx",  ffOPTWR },
 +        { efXVG, "-bpf",    "benchpf",  ffOPTWR },
 +        { efXVG, "-bro",    "benchrot", ffOPTWR },
 +        { efLOG, "-bra",    "benchrota", ffOPTWR },
 +        { efLOG, "-brs",    "benchrots", ffOPTWR },
 +        { efLOG, "-brt",    "benchrott", ffOPTWR },
 +        { efMTX, "-bmtx",   "benchn",   ffOPTWR },
 +        { efNDX, "-bdn",    "bench",    ffOPTWR },
 +        { efXVG, "-bswap",  "benchswp", ffOPTWR }
 +    };
 +
 +    gmx_bool        bThreads     = FALSE;
 +
 +    int             nthreads = 1;
 +
 +    const char     *procstring[] =
 +    { NULL, "-np", "-n", "none", NULL };
 +    const char     *npmevalues_opt[] =
 +    { NULL, "auto", "all", "subset", NULL };
 +
 +    gmx_bool     bAppendFiles          = TRUE;
 +    gmx_bool     bKeepAndNumCPT        = FALSE;
 +    gmx_bool     bResetCountersHalfWay = FALSE;
 +    gmx_bool     bBenchmark            = TRUE;
 +
 +    output_env_t oenv = NULL;
 +
 +    t_pargs      pa[] = {
 +        /***********************/
 +        /* g_tune_pme options: */
 +        /***********************/
 +        { "-np",       FALSE, etINT,  {&nnodes},
 +          "Number of nodes to run the tests on (must be > 2 for separate PME nodes)" },
 +        { "-npstring", FALSE, etENUM, {procstring},
 +          "Specify the number of processors to [TT]$MPIRUN[tt] using this string"},
 +        { "-ntmpi",    FALSE, etINT,  {&nthreads},
 +          "Number of MPI-threads to run the tests on (turns MPI & mpirun off)"},
 +        { "-r",        FALSE, etINT,  {&repeats},
 +          "Repeat each test this often" },
 +        { "-max",      FALSE, etREAL, {&maxPMEfraction},
 +          "Max fraction of PME nodes to test with" },
 +        { "-min",      FALSE, etREAL, {&minPMEfraction},
 +          "Min fraction of PME nodes to test with" },
 +        { "-npme",     FALSE, etENUM, {npmevalues_opt},
 +          "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a reasonable subset. "
 +          "Auto neglects -min and -max and chooses reasonable values around a guess for npme derived from the .tpr"},
 +        { "-fix",      FALSE, etINT,  {&npme_fixed},
 +          "If >= -1, do not vary the number of PME-only nodes, instead use this fixed value and only vary rcoulomb and the PME grid spacing."},
 +        { "-rmax",     FALSE, etREAL, {&rmax},
 +          "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid downscaling)" },
 +        { "-rmin",     FALSE, etREAL, {&rmin},
 +          "If >0, minimal rcoulomb for -ntpr>1" },
 +        { "-scalevdw",  FALSE, etBOOL, {&bScaleRvdw},
 +          "Scale rvdw along with rcoulomb"},
 +        { "-ntpr",     FALSE, etINT,  {&ntprs},
 +          "Number of [TT].tpr[tt] files to benchmark. Create this many files with different rcoulomb scaling factors depending on -rmin and -rmax. "
 +          "If < 1, automatically choose the number of [TT].tpr[tt] files to test" },
 +        { "-steps",    FALSE, etINT64, {&bench_nsteps},
 +          "Take timings for this many steps in the benchmark runs" },
 +        { "-resetstep", FALSE, etINT,  {&presteps},
 +          "Let dlb equilibrate this many steps before timings are taken (reset cycle counters after this many steps)" },
 +        { "-simsteps", FALSE, etINT64, {&new_sim_nsteps},
 +          "If non-negative, perform this many steps in the real run (overwrites nsteps from [TT].tpr[tt], add [TT].cpt[tt] steps)" },
 +        { "-launch",   FALSE, etBOOL, {&bLaunch},
 +          "Launch the real simulation after optimization" },
 +        { "-bench",    FALSE, etBOOL, {&bBenchmark},
 +          "Run the benchmarks or just create the input [TT].tpr[tt] files?" },
 +        /******************/
 +        /* mdrun options: */
 +        /******************/
 +        /* We let g_tune_pme parse and understand these options, because we need to
 +         * prevent that they appear on the mdrun command line for the benchmarks */
 +        { "-append",   FALSE, etBOOL, {&bAppendFiles},
 +          "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" },
 +        { "-cpnum",    FALSE, etBOOL, {&bKeepAndNumCPT},
 +          "Keep and number checkpoint files (launch only)" },
 +        { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
 +          "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] (launch only)" }
 +    };
 +
 +#define NFILE asize(fnm)
 +
 +    seconds = gmx_gettime();
 +
 +    if (!parse_common_args(&argc, argv, PCA_NOEXIT_ON_ARGS,
 +                           NFILE, fnm, asize(pa), pa, asize(desc), desc,
 +                           0, NULL, &oenv))
 +    {
 +        return 0;
 +    }
 +
 +    /* Store the remaining unparsed command line entries in a string which
 +     * is then attached to the mdrun command line */
 +    snew(ExtraArgs, 1);
 +    ExtraArgs[0] = '\0';
 +    for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */
 +    {
 +        add_to_string(&ExtraArgs, argv[i]);
 +        add_to_string(&ExtraArgs, " ");
 +    }
 +
 +    if (opt2parg_bSet("-ntmpi", asize(pa), pa))
 +    {
 +        bThreads = TRUE;
 +        if (opt2parg_bSet("-npstring", asize(pa), pa))
 +        {
 +            fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
 +        }
 +
 +        if (nnodes > 1)
 +        {
 +            gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
 +        }
 +        /* and now we just set this; a bit of an ugly hack*/
 +        nnodes = nthreads;
 +    }
 +    /* Check for PME:PP ratio and whether tpr triggers additional output files */
 +    guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb);
 +
 +    /* Automatically set -beo options if -eo is set etc. */
 +    couple_files_options(NFILE, fnm);
 +
 +    /* Construct the command line arguments for benchmark runs
 +     * as well as for the simulation run */
 +    if (bThreads)
 +    {
 +        sprintf(bbuf, " -ntmpi %d ", nthreads);
 +    }
 +    else
 +    {
 +        /* This string will be used for MPI runs and will appear after the
 +         * mpirun command. */
 +        if (strcmp(procstring[0], "none") != 0)
 +        {
 +            sprintf(bbuf, " %s %d ", procstring[0], nnodes);
 +        }
 +        else
 +        {
 +            sprintf(bbuf, " ");
 +        }
 +    }
 +
 +    cmd_np = bbuf;
 +
 +    create_command_line_snippets(bAppendFiles, bKeepAndNumCPT, bResetCountersHalfWay, presteps,
 +                                 NFILE, fnm, &cmd_args_bench, &cmd_args_launch, ExtraArgs);
 +
 +    /* Read in checkpoint file if requested */
 +    sim_part = 1;
 +    if (opt2bSet("-cpi", NFILE, fnm))
 +    {
 +        snew(cr, 1);
 +        cr->duty = DUTY_PP; /* makes the following routine happy */
 +        read_checkpoint_simulation_part(opt2fn("-cpi", NFILE, fnm),
 +                                        &sim_part, &cpt_steps, cr,
 +                                        FALSE, NFILE, fnm, NULL, NULL);
 +        sfree(cr);
 +        sim_part++;
 +        /* sim_part will now be 1 if no checkpoint file was found */
 +        if (sim_part <= 1)
 +        {
 +            gmx_fatal(FARGS, "Checkpoint file %s not found!", opt2fn("-cpi", NFILE, fnm));
 +        }
 +    }
 +
 +    /* Open performance output file and write header info */
 +    fp = gmx_ffopen(opt2fn("-p", NFILE, fnm), "w");
 +
 +    /* Make a quick consistency check of command line parameters */
 +    check_input(nnodes, repeats, &ntprs, &rmin, rcoulomb, &rmax,
 +                maxPMEfraction, minPMEfraction, npme_fixed,
 +                bench_nsteps, fnm, NFILE, sim_part, presteps,
 +                asize(pa), pa);
 +
 +    /* Determine the maximum and minimum number of PME nodes to test,
 +     * the actual list of settings is build in do_the_tests(). */
 +    if ((nnodes > 2) && (npme_fixed < -1))
 +    {
 +        if (0 == strcmp(npmevalues_opt[0], "auto"))
 +        {
 +            /* Determine the npme range automatically based on the PME:PP load guess */
 +            if (guessPMEratio > 1.0)
 +            {
 +                /* More PME than PP work, probably we do not need separate PME nodes at all! */
 +                maxPMEnodes = nnodes/2;
 +                minPMEnodes = nnodes/2;
 +            }
 +            else
 +            {
 +                /* PME : PP load is in the range 0..1, let's test around the guess */
 +                guessPMEnodes = nnodes/(1.0 + 1.0/guessPMEratio);
 +                minPMEnodes   = floor(0.7*guessPMEnodes);
 +                maxPMEnodes   =  ceil(1.6*guessPMEnodes);
 +                maxPMEnodes   = min(maxPMEnodes, nnodes/2);
 +            }
 +        }
 +        else
 +        {
 +            /* Determine the npme range based on user input */
 +            maxPMEnodes = floor(maxPMEfraction*nnodes);
 +            minPMEnodes = max(floor(minPMEfraction*nnodes), 0);
 +            fprintf(stdout, "Will try runs with %d ", minPMEnodes);
 +            if (maxPMEnodes != minPMEnodes)
 +            {
 +                fprintf(stdout, "- %d ", maxPMEnodes);
 +            }
 +            fprintf(stdout, "PME-only nodes.\n  Note that the automatic number of PME-only nodes and no separate PME nodes are always tested.\n");
 +        }
 +    }
 +    else
 +    {
 +        maxPMEnodes = 0;
 +        minPMEnodes = 0;
 +    }
 +
 +    /* Get the commands we need to set up the runs from environment variables */
 +    get_program_paths(bThreads, &cmd_mpirun, &cmd_mdrun);
 +    if (bBenchmark && repeats > 0)
 +    {
 +        check_mdrun_works(bThreads, cmd_mpirun, cmd_np, cmd_mdrun);
 +    }
 +
 +    /* Print some header info to file */
 +    sep_line(fp);
 +    fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
 +    sep_line(fp);
 +    fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion());
 +    if (!bThreads)
 +    {
 +        fprintf(fp, "Number of nodes         : %d\n", nnodes);
 +        fprintf(fp, "The mpirun command is   : %s\n", cmd_mpirun);
 +        if (strcmp(procstring[0], "none") != 0)
 +        {
 +            fprintf(fp, "Passing # of nodes via  : %s\n", procstring[0]);
 +        }
 +        else
 +        {
 +            fprintf(fp, "Not setting number of nodes in system call\n");
 +        }
 +    }
 +    else
 +    {
 +        fprintf(fp, "Number of threads       : %d\n", nnodes);
 +    }
 +
 +    fprintf(fp, "The mdrun  command is   : %s\n", cmd_mdrun);
 +    fprintf(fp, "mdrun args benchmarks   : %s\n", cmd_args_bench);
 +    fprintf(fp, "Benchmark steps         : ");
 +    fprintf(fp, "%"GMX_PRId64, bench_nsteps);
 +    fprintf(fp, "\n");
 +    fprintf(fp, "dlb equilibration steps : %d\n", presteps);
 +    if (sim_part > 1)
 +    {
 +        fprintf(fp, "Checkpoint time step    : ");
 +        fprintf(fp, "%"GMX_PRId64, cpt_steps);
 +        fprintf(fp, "\n");
 +    }
 +    fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
 +
 +    if (new_sim_nsteps >= 0)
 +    {
 +        bOverwrite = TRUE;
 +        fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm));
 +        fprintf(stderr, "%"GMX_PRId64, new_sim_nsteps+cpt_steps);
 +        fprintf(stderr, " steps.\n");
 +        fprintf(fp, "Simulation steps        : ");
 +        fprintf(fp, "%"GMX_PRId64, new_sim_nsteps);
 +        fprintf(fp, "\n");
 +    }
 +    if (repeats > 1)
 +    {
 +        fprintf(fp, "Repeats for each test   : %d\n", repeats);
 +    }
 +
 +    if (npme_fixed >= -1)
 +    {
 +        fprintf(fp, "Fixing -npme at         : %d\n", npme_fixed);
 +    }
 +
 +    fprintf(fp, "Input file              : %s\n", opt2fn("-s", NFILE, fnm));
 +    fprintf(fp, "   PME/PP load estimate : %g\n", guessPMEratio);
 +
 +    /* Allocate memory for the inputinfo struct: */
 +    snew(info, 1);
 +    info->nr_inputfiles = ntprs;
 +    for (i = 0; i < ntprs; i++)
 +    {
 +        snew(info->rcoulomb, ntprs);
 +        snew(info->rvdw, ntprs);
 +        snew(info->rlist, ntprs);
 +        snew(info->rlistlong, ntprs);
 +        snew(info->nkx, ntprs);
 +        snew(info->nky, ntprs);
 +        snew(info->nkz, ntprs);
 +        snew(info->fsx, ntprs);
 +        snew(info->fsy, ntprs);
 +        snew(info->fsz, ntprs);
 +    }
 +    /* Make alternative tpr files to test: */
 +    snew(tpr_names, ntprs);
 +    for (i = 0; i < ntprs; i++)
 +    {
 +        snew(tpr_names[i], STRLEN);
 +    }
 +
 +    /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
 +     * different grids could be found. */
 +    make_benchmark_tprs(opt2fn("-s", NFILE, fnm), tpr_names, bench_nsteps+presteps,
 +                        cpt_steps, rmin, rmax, bScaleRvdw, &ntprs, info, fp);
 +
 +    /********************************************************************************/
 +    /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats  */
 +    /********************************************************************************/
 +    snew(perfdata, ntprs);
 +    if (bBenchmark)
 +    {
 +        do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
 +                     repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
 +                     cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
 +
 +        fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime()-seconds)/60.0);
 +
 +        /* Analyse the results and give a suggestion for optimal settings: */
 +        bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries,
 +                                repeats, info, &best_tpr, &best_npme);
 +
 +        /* Take the best-performing tpr file and enlarge nsteps to original value */
 +        if (bKeepTPR && !bOverwrite)
 +        {
 +            simulation_tpr = opt2fn("-s", NFILE, fnm);
 +        }
 +        else
 +        {
 +            simulation_tpr = opt2fn("-so", NFILE, fnm);
 +            modify_PMEsettings(bOverwrite ? (new_sim_nsteps+cpt_steps) : info->orig_sim_steps,
 +                               info->orig_init_step, tpr_names[best_tpr], simulation_tpr);
 +        }
 +
 +        /* Let's get rid of the temporary benchmark input files */
 +        for (i = 0; i < ntprs; i++)
 +        {
 +            fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]);
 +            remove(tpr_names[i]);
 +        }
 +
 +        /* Now start the real simulation if the user requested it ... */
 +        launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
 +                          cmd_args_launch, simulation_tpr, best_npme);
 +    }
 +    gmx_ffclose(fp);
 +
 +    /* ... or simply print the performance results to screen: */
 +    if (!bLaunch)
 +    {
 +        finalize(opt2fn("-p", NFILE, fnm));
 +    }
 +
 +    return 0;
 +}
index 6a93fff21614f4fd6697213b86561b6a0184886b,0000000000000000000000000000000000000000..3c0880c8c3547743675ce5f6675871051ed6adc4
mode 100644,000000..100644
--- /dev/null
@@@ -1,679 -1,0 +1,686 @@@
-         /* Give up, pick a generic one instead */
-         if (nl->kernelptr_vf == NULL)
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team.
 + * Copyright (c) 2013,2014, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +
 +#include "thread_mpi/threads.h"
 +
 +#include "typedefs.h"
 +#include "txtdump.h"
 +#include "gromacs/utility/smalloc.h"
 +#include "ns.h"
 +#include "vec.h"
 +#include "gromacs/math/utilities.h"
 +#include "macros.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "force.h"
 +#include "names.h"
 +#include "main.h"
 +#include "xvgr.h"
 +#include "gmx_fatal.h"
 +#include "physics.h"
 +#include "force.h"
 +#include "bondf.h"
 +#include "nrnb.h"
 +#include "nonbonded.h"
 +#include "gromacs/simd/simd.h"
 +
 +#include "nb_kernel.h"
 +#include "nb_free_energy.h"
 +#include "nb_generic.h"
 +#include "nb_generic_cg.h"
 +#include "nb_generic_adress.h"
 +
 +/* Different default (c) and SIMD instructions interaction-specific kernels */
 +#include "nb_kernel_c/nb_kernel_c.h"
 +
 +#if (defined GMX_SIMD_X86_SSE2) && !(defined GMX_DOUBLE)
 +#    include "nb_kernel_sse2_single/nb_kernel_sse2_single.h"
 +#endif
 +#if (defined GMX_SIMD_X86_SSE4_1) && !(defined GMX_DOUBLE)
 +#    include "nb_kernel_sse4_1_single/nb_kernel_sse4_1_single.h"
 +#endif
 +#if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
 +#    include "nb_kernel_avx_128_fma_single/nb_kernel_avx_128_fma_single.h"
 +#endif
 +#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER) && !(defined GMX_DOUBLE)
 +#    include "nb_kernel_avx_256_single/nb_kernel_avx_256_single.h"
 +#endif
 +#if (defined GMX_SIMD_X86_SSE2 && defined GMX_DOUBLE)
 +#    include "nb_kernel_sse2_double/nb_kernel_sse2_double.h"
 +#endif
 +#if (defined GMX_SIMD_X86_SSE4_1 && defined GMX_DOUBLE)
 +#    include "nb_kernel_sse4_1_double/nb_kernel_sse4_1_double.h"
 +#endif
 +#if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE)
 +#    include "nb_kernel_avx_128_fma_double/nb_kernel_avx_128_fma_double.h"
 +#endif
 +#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER && defined GMX_DOUBLE)
 +#    include "nb_kernel_avx_256_double/nb_kernel_avx_256_double.h"
 +#endif
 +#if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE)
 +#    include "nb_kernel_sparc64_hpc_ace_double/nb_kernel_sparc64_hpc_ace_double.h"
 +#endif
 +
 +
 +static tMPI_Thread_mutex_t nonbonded_setup_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
 +static gmx_bool            nonbonded_setup_done  = FALSE;
 +
 +
 +void
 +gmx_nonbonded_setup(t_forcerec *   fr,
 +                    gmx_bool       bGenericKernelOnly)
 +{
 +    tMPI_Thread_mutex_lock(&nonbonded_setup_mutex);
 +    /* Here we are guaranteed only one thread made it. */
 +    if (nonbonded_setup_done == FALSE)
 +    {
 +        if (bGenericKernelOnly == FALSE)
 +        {
 +            /* Add the generic kernels to the structure stored statically in nb_kernel.c */
 +            nb_kernel_list_add_kernels(kernellist_c, kernellist_c_size);
 +
 +            if (!(fr != NULL && fr->use_simd_kernels == FALSE))
 +            {
 +                /* Add interaction-specific kernels for different architectures */
 +                /* Single precision */
 +#if (defined GMX_SIMD_X86_SSE2) && !(defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sse2_single, kernellist_sse2_single_size);
 +#endif
 +#if (defined GMX_SIMD_X86_SSE4_1) && !(defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sse4_1_single, kernellist_sse4_1_single_size);
 +#endif
 +#if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_avx_128_fma_single, kernellist_avx_128_fma_single_size);
 +#endif
 +#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER) && !(defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_avx_256_single, kernellist_avx_256_single_size);
 +#endif
 +                /* Double precision */
 +#if (defined GMX_SIMD_X86_SSE2 && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sse2_double, kernellist_sse2_double_size);
 +#endif
 +#if (defined GMX_SIMD_X86_SSE4_1 && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sse4_1_double, kernellist_sse4_1_double_size);
 +#endif
 +#if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_avx_128_fma_double, kernellist_avx_128_fma_double_size);
 +#endif
 +#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_avx_256_double, kernellist_avx_256_double_size);
 +#endif
 +#if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sparc64_hpc_ace_double, kernellist_sparc64_hpc_ace_double_size);
 +#endif
 +                ; /* empty statement to avoid a completely empty block */
 +            }
 +        }
 +        /* Create a hash for faster lookups */
 +        nb_kernel_list_hash_init();
 +
 +        nonbonded_setup_done = TRUE;
 +    }
 +    tMPI_Thread_mutex_unlock(&nonbonded_setup_mutex);
 +}
 +
 +
 +
 +void
 +gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl)
 +{
 +    const char *     elec;
 +    const char *     elec_mod;
 +    const char *     vdw;
 +    const char *     vdw_mod;
 +    const char *     geom;
 +    const char *     other;
 +    const char *     vf;
 +
 +    struct
 +    {
 +        const char *  arch;
 +        int           simd_padding_width;
 +    }
 +    arch_and_padding[] =
 +    {
 +        /* Single precision */
 +#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER) && !(defined GMX_DOUBLE)
 +        { "avx_256_single", 8 },
 +#endif
 +#if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
 +        { "avx_128_fma_single", 4 },
 +#endif
 +#if (defined GMX_SIMD_X86_SSE4_1) && !(defined GMX_DOUBLE)
 +        { "sse4_1_single", 4 },
 +#endif
 +#if (defined GMX_SIMD_X86_SSE2) && !(defined GMX_DOUBLE)
 +        { "sse2_single", 4 },
 +#endif
 +        /* Double precision */
 +#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER && defined GMX_DOUBLE)
 +        { "avx_256_double", 4 },
 +#endif
 +#if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE)
 +        /* Sic. Double precision 2-way SIMD does not require neighbor list padding,
 +         * since the kernels execute a loop unrolled a factor 2, followed by
 +         * a possible single odd-element epilogue.
 +         */
 +        { "avx_128_fma_double", 1 },
 +#endif
 +#if (defined GMX_SIMD_X86_SSE2 && defined GMX_DOUBLE)
 +        /* No padding - see comment above */
 +        { "sse2_double", 1 },
 +#endif
 +#if (defined GMX_SIMD_X86_SSE4_1 && defined GMX_DOUBLE)
 +        /* No padding - see comment above */
 +        { "sse4_1_double", 1 },
 +#endif
 +#if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE)
 +        /* No padding - see comment above */
 +        { "sparc64_hpc_ace_double", 1 },
 +#endif
 +        { "c", 1 },
 +    };
 +    int              narch = asize(arch_and_padding);
 +    int              i;
 +
 +    if (nonbonded_setup_done == FALSE)
 +    {
 +        /* We typically call this setup routine before starting timers,
 +         * but if that has not been done for whatever reason we do it now.
 +         */
 +        gmx_nonbonded_setup(NULL, FALSE);
 +    }
 +
 +    /* Not used yet */
 +    other = "";
 +
 +    nl->kernelptr_vf = NULL;
 +    nl->kernelptr_v  = NULL;
 +    nl->kernelptr_f  = NULL;
 +
 +    elec     = gmx_nbkernel_elec_names[nl->ielec];
 +    elec_mod = eintmod_names[nl->ielecmod];
 +    vdw      = gmx_nbkernel_vdw_names[nl->ivdw];
 +    vdw_mod  = eintmod_names[nl->ivdwmod];
 +    geom     = gmx_nblist_geometry_names[nl->igeometry];
 +
 +    if (nl->type == GMX_NBLIST_INTERACTION_ADRESS)
 +    {
 +        nl->kernelptr_vf       = (void *) gmx_nb_generic_adress_kernel;
 +        nl->kernelptr_f        = (void *) gmx_nb_generic_adress_kernel;
 +        nl->simd_padding_width = 1;
 +        return;
 +    }
 +
 +    if (nl->type == GMX_NBLIST_INTERACTION_FREE_ENERGY)
 +    {
 +        nl->kernelptr_vf       = (void *) gmx_nb_free_energy_kernel;
 +        nl->kernelptr_f        = (void *) gmx_nb_free_energy_kernel;
 +        nl->simd_padding_width = 1;
 +    }
 +    else if (!gmx_strcasecmp_min(geom, "CG-CG"))
 +    {
 +        nl->kernelptr_vf       = (void *) gmx_nb_generic_cg_kernel;
 +        nl->kernelptr_f        = (void *) gmx_nb_generic_cg_kernel;
 +        nl->simd_padding_width = 1;
 +    }
 +    else
 +    {
 +        /* Try to find a specific kernel first */
 +
 +        for (i = 0; i < narch && nl->kernelptr_vf == NULL; i++)
 +        {
 +            nl->kernelptr_vf       = (void *) nb_kernel_list_findkernel(log, arch_and_padding[i].arch, elec, elec_mod, vdw, vdw_mod, geom, other, "PotentialAndForce");
 +            nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
 +        }
 +        for (i = 0; i < narch && nl->kernelptr_f == NULL; i++)
 +        {
 +            nl->kernelptr_f        = (void *) nb_kernel_list_findkernel(log, arch_and_padding[i].arch, elec, elec_mod, vdw, vdw_mod, geom, other, "Force");
 +            nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
 +
 +            /* If there is not force-only optimized kernel, is there a potential & force one? */
 +            if (nl->kernelptr_f == NULL)
 +            {
 +                nl->kernelptr_f        = (void *) nb_kernel_list_findkernel(NULL, arch_and_padding[i].arch, elec, elec_mod, vdw, vdw_mod, geom, other, "PotentialAndForce");
 +                nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
 +            }
 +        }
 +
-                     (*kernelptr)(&(nlist[i]), x, f, fr, mdatoms, &kernel_data, nrnb);
++        /* Give up. If this was a water kernel, leave the pointer as NULL, which
++         * will disable water optimization in NS. If it is a particle kernel, set
++         * the pointer to the generic NB kernel.
++         */
++        if (nl->kernelptr_vf == NULL && !gmx_strcasecmp_min(geom,"Particle-Particle"))
 +        {
 +            nl->kernelptr_vf       = (void *) gmx_nb_generic_kernel;
 +            nl->kernelptr_f        = (void *) gmx_nb_generic_kernel;
 +            nl->simd_padding_width = 1;
 +            if (debug)
 +            {
 +                fprintf(debug,
 +                        "WARNING - Slow generic NB kernel used for neighborlist with\n"
 +                        "    Elec: '%s', Modifier: '%s'\n"
 +                        "    Vdw:  '%s', Modifier: '%s'\n"
 +                        "    Geom: '%s', Other: '%s'\n\n",
 +                        elec, elec_mod, vdw, vdw_mod, geom, other);
 +            }
 +        }
 +    }
 +
 +    return;
 +}
 +
 +void do_nonbonded(t_forcerec *fr,
 +                  rvec x[], rvec f_shortrange[], rvec f_longrange[], t_mdatoms *mdatoms, t_blocka *excl,
 +                  gmx_grppairener_t *grppener,
 +                  t_nrnb *nrnb, real *lambda, real *dvdl,
 +                  int nls, int eNL, int flags)
 +{
 +    t_nblist *        nlist;
 +    int               n, n0, n1, i, i0, i1, sz, range;
 +    t_nblists *       nblists;
 +    nb_kernel_data_t  kernel_data;
 +    nb_kernel_t *     kernelptr = NULL;
 +    rvec *            f;
 +
 +    kernel_data.flags                   = flags;
 +    kernel_data.exclusions              = excl;
 +    kernel_data.lambda                  = lambda;
 +    kernel_data.dvdl                    = dvdl;
 +
 +    if (fr->bAllvsAll)
 +    {
 +        gmx_incons("All-vs-all kernels have not been implemented in version 4.6");
 +        return;
 +    }
 +
 +    if (eNL >= 0)
 +    {
 +        i0 = eNL;
 +        i1 = i0+1;
 +    }
 +    else
 +    {
 +        i0 = 0;
 +        i1 = eNL_NR;
 +    }
 +
 +    if (nls >= 0)
 +    {
 +        n0 = nls;
 +        n1 = nls+1;
 +    }
 +    else
 +    {
 +        n0 = 0;
 +        n1 = fr->nnblists;
 +    }
 +
 +    for (n = n0; (n < n1); n++)
 +    {
 +        nblists = &fr->nblists[n];
 +
 +        kernel_data.table_elec              = &nblists->table_elec;
 +        kernel_data.table_vdw               = &nblists->table_vdw;
 +        kernel_data.table_elec_vdw          = &nblists->table_elec_vdw;
 +
 +        for (range = 0; range < 2; range++)
 +        {
 +            /* Are we doing short/long-range? */
 +            if (range == 0)
 +            {
 +                /* Short-range */
 +                if (!(flags & GMX_NONBONDED_DO_SR))
 +                {
 +                    continue;
 +                }
 +                kernel_data.energygrp_elec          = grppener->ener[egCOULSR];
 +                kernel_data.energygrp_vdw           = grppener->ener[fr->bBHAM ? egBHAMSR : egLJSR];
 +                kernel_data.energygrp_polarization  = grppener->ener[egGB];
 +                nlist = nblists->nlist_sr;
 +                f                                   = f_shortrange;
 +            }
 +            else if (range == 1)
 +            {
 +                /* Long-range */
 +                if (!(flags & GMX_NONBONDED_DO_LR))
 +                {
 +                    continue;
 +                }
 +                kernel_data.energygrp_elec          = grppener->ener[egCOULLR];
 +                kernel_data.energygrp_vdw           = grppener->ener[fr->bBHAM ? egBHAMLR : egLJLR];
 +                kernel_data.energygrp_polarization  = grppener->ener[egGB];
 +                nlist = nblists->nlist_lr;
 +                f                                   = f_longrange;
 +            }
 +
 +            for (i = i0; (i < i1); i++)
 +            {
 +                if (nlist[i].nri > 0)
 +                {
 +                    if (flags & GMX_NONBONDED_DO_POTENTIAL)
 +                    {
 +                        /* Potential and force */
 +                        kernelptr = (nb_kernel_t *)nlist[i].kernelptr_vf;
 +                    }
 +                    else
 +                    {
 +                        /* Force only, no potential */
 +                        kernelptr = (nb_kernel_t *)nlist[i].kernelptr_f;
 +                    }
 +
 +                    if (nlist[i].type != GMX_NBLIST_INTERACTION_FREE_ENERGY && (flags & GMX_NONBONDED_DO_FOREIGNLAMBDA))
 +                    {
 +                        /* We don't need the non-perturbed interactions */
 +                        continue;
 +                    }
++                    /* Neighborlists whose kernelptr==NULL will always be empty */
++                    if(kernelptr != NULL)
++                    {
++                        (*kernelptr)(&(nlist[i]), x, f, fr, mdatoms, &kernel_data, nrnb);
++                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +static void
 +nb_listed_warning_rlimit(const rvec *x, int ai, int aj, int * global_atom_index, real r, real rlimit)
 +{
 +    gmx_warning("Listed nonbonded interaction between particles %d and %d\n"
 +                "at distance %.3f which is larger than the table limit %.3f nm.\n\n"
 +                "This is likely either a 1,4 interaction, or a listed interaction inside\n"
 +                "a smaller molecule you are decoupling during a free energy calculation.\n"
 +                "Since interactions at distances beyond the table cannot be computed,\n"
 +                "they are skipped until they are inside the table limit again. You will\n"
 +                "only see this message once, even if it occurs for several interactions.\n\n"
 +                "IMPORTANT: This should not happen in a stable simulation, so there is\n"
 +                "probably something wrong with your system. Only change the table-extension\n"
 +                "distance in the mdp file if you are really sure that is the reason.\n",
 +                glatnr(global_atom_index, ai), glatnr(global_atom_index, aj), r, rlimit);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "%8f %8f %8f\n%8f %8f %8f\n1-4 (%d,%d) interaction not within cut-off! r=%g. Ignored\n",
 +                x[ai][XX], x[ai][YY], x[ai][ZZ], x[aj][XX], x[aj][YY], x[aj][ZZ],
 +                glatnr(global_atom_index, ai), glatnr(global_atom_index, aj), r);
 +    }
 +}
 +
 +
 +
 +/* This might logically belong better in the nb_generic.c module, but it is only
 + * used in do_nonbonded_listed(), and we want it to be inlined there to avoid an
 + * extra functional call for every single pair listed in the topology.
 + */
 +static real
 +nb_evaluate_single(real r2, real tabscale, real *vftab,
 +                   real qq, real c6, real c12, real *velec, real *vvdw)
 +{
 +    real       rinv, r, rtab, eps, eps2, Y, F, Geps, Heps2, Fp, VVe, FFe, VVd, FFd, VVr, FFr, fscal;
 +    int        ntab;
 +
 +    /* Do the tabulated interactions - first table lookup */
 +    rinv             = gmx_invsqrt(r2);
 +    r                = r2*rinv;
 +    rtab             = r*tabscale;
 +    ntab             = rtab;
 +    eps              = rtab-ntab;
 +    eps2             = eps*eps;
 +    ntab             = 12*ntab;
 +    /* Electrostatics */
 +    Y                = vftab[ntab];
 +    F                = vftab[ntab+1];
 +    Geps             = eps*vftab[ntab+2];
 +    Heps2            = eps2*vftab[ntab+3];
 +    Fp               = F+Geps+Heps2;
 +    VVe              = Y+eps*Fp;
 +    FFe              = Fp+Geps+2.0*Heps2;
 +    /* Dispersion */
 +    Y                = vftab[ntab+4];
 +    F                = vftab[ntab+5];
 +    Geps             = eps*vftab[ntab+6];
 +    Heps2            = eps2*vftab[ntab+7];
 +    Fp               = F+Geps+Heps2;
 +    VVd              = Y+eps*Fp;
 +    FFd              = Fp+Geps+2.0*Heps2;
 +    /* Repulsion */
 +    Y                = vftab[ntab+8];
 +    F                = vftab[ntab+9];
 +    Geps             = eps*vftab[ntab+10];
 +    Heps2            = eps2*vftab[ntab+11];
 +    Fp               = F+Geps+Heps2;
 +    VVr              = Y+eps*Fp;
 +    FFr              = Fp+Geps+2.0*Heps2;
 +
 +    *velec           = qq*VVe;
 +    *vvdw            = c6*VVd+c12*VVr;
 +
 +    fscal            = -(qq*FFe+c6*FFd+c12*FFr)*tabscale*rinv;
 +
 +    return fscal;
 +}
 +
 +
 +real
 +do_nonbonded_listed(int ftype, int nbonds,
 +                    const t_iatom iatoms[], const t_iparams iparams[],
 +                    const rvec x[], rvec f[], rvec fshift[],
 +                    const t_pbc *pbc, const t_graph *g,
 +                    real *lambda, real *dvdl,
 +                    const t_mdatoms *md,
 +                    const t_forcerec *fr, gmx_grppairener_t *grppener,
 +                    int *global_atom_index)
 +{
 +    int              ielec, ivdw;
 +    real             qq, c6, c12;
 +    rvec             dx;
 +    ivec             dt;
 +    int              i, j, itype, ai, aj, gid;
 +    int              fshift_index;
 +    real             r2, rinv;
 +    real             fscal, velec, vvdw;
 +    real *           energygrp_elec;
 +    real *           energygrp_vdw;
 +    static gmx_bool  warned_rlimit = FALSE;
 +    /* Free energy stuff */
 +    gmx_bool         bFreeEnergy;
 +    real             LFC[2], LFV[2], DLF[2], lfac_coul[2], lfac_vdw[2], dlfac_coul[2], dlfac_vdw[2];
 +    real             qqB, c6B, c12B, sigma2_def, sigma2_min;
 +
 +
 +    switch (ftype)
 +    {
 +        case F_LJ14:
 +        case F_LJC14_Q:
 +            energygrp_elec = grppener->ener[egCOUL14];
 +            energygrp_vdw  = grppener->ener[egLJ14];
 +            break;
 +        case F_LJC_PAIRS_NB:
 +            energygrp_elec = grppener->ener[egCOULSR];
 +            energygrp_vdw  = grppener->ener[egLJSR];
 +            break;
 +        default:
 +            energygrp_elec = NULL; /* Keep compiler happy */
 +            energygrp_vdw  = NULL; /* Keep compiler happy */
 +            gmx_fatal(FARGS, "Unknown function type %d in do_nonbonded14", ftype);
 +            break;
 +    }
 +
 +    if (fr->efep != efepNO)
 +    {
 +        /* Lambda factor for state A=1-lambda and B=lambda */
 +        LFC[0] = 1.0 - lambda[efptCOUL];
 +        LFV[0] = 1.0 - lambda[efptVDW];
 +        LFC[1] = lambda[efptCOUL];
 +        LFV[1] = lambda[efptVDW];
 +
 +        /*derivative of the lambda factor for state A and B */
 +        DLF[0] = -1;
 +        DLF[1] = 1;
 +
 +        /* precalculate */
 +        sigma2_def = pow(fr->sc_sigma6_def, 1.0/3.0);
 +        sigma2_min = pow(fr->sc_sigma6_min, 1.0/3.0);
 +
 +        for (i = 0; i < 2; i++)
 +        {
 +            lfac_coul[i]  = (fr->sc_power == 2 ? (1-LFC[i])*(1-LFC[i]) : (1-LFC[i]));
 +            dlfac_coul[i] = DLF[i]*fr->sc_power/fr->sc_r_power*(fr->sc_power == 2 ? (1-LFC[i]) : 1);
 +            lfac_vdw[i]   = (fr->sc_power == 2 ? (1-LFV[i])*(1-LFV[i]) : (1-LFV[i]));
 +            dlfac_vdw[i]  = DLF[i]*fr->sc_power/fr->sc_r_power*(fr->sc_power == 2 ? (1-LFV[i]) : 1);
 +        }
 +    }
 +    else
 +    {
 +        sigma2_min = sigma2_def = 0;
 +    }
 +
 +    bFreeEnergy = FALSE;
 +    for (i = 0; (i < nbonds); )
 +    {
 +        itype = iatoms[i++];
 +        ai    = iatoms[i++];
 +        aj    = iatoms[i++];
 +        gid   = GID(md->cENER[ai], md->cENER[aj], md->nenergrp);
 +
 +        /* Get parameters */
 +        switch (ftype)
 +        {
 +            case F_LJ14:
 +                bFreeEnergy =
 +                    (fr->efep != efepNO &&
 +                     ((md->nPerturbed && (md->bPerturbed[ai] || md->bPerturbed[aj])) ||
 +                      iparams[itype].lj14.c6A != iparams[itype].lj14.c6B ||
 +                      iparams[itype].lj14.c12A != iparams[itype].lj14.c12B));
 +                qq               = md->chargeA[ai]*md->chargeA[aj]*fr->epsfac*fr->fudgeQQ;
 +                c6               = iparams[itype].lj14.c6A;
 +                c12              = iparams[itype].lj14.c12A;
 +                break;
 +            case F_LJC14_Q:
 +                qq               = iparams[itype].ljc14.qi*iparams[itype].ljc14.qj*fr->epsfac*iparams[itype].ljc14.fqq;
 +                c6               = iparams[itype].ljc14.c6;
 +                c12              = iparams[itype].ljc14.c12;
 +                break;
 +            case F_LJC_PAIRS_NB:
 +                qq               = iparams[itype].ljcnb.qi*iparams[itype].ljcnb.qj*fr->epsfac;
 +                c6               = iparams[itype].ljcnb.c6;
 +                c12              = iparams[itype].ljcnb.c12;
 +                break;
 +            default:
 +                /* Cannot happen since we called gmx_fatal() above in this case */
 +                qq = c6 = c12 = 0; /* Keep compiler happy */
 +                break;
 +        }
 +
 +        /* To save flops in the optimized kernels, c6/c12 have 6.0/12.0 derivative prefactors
 +         * included in the general nfbp array now. This means the tables are scaled down by the
 +         * same factor, so when we use the original c6/c12 parameters from iparams[] they must
 +         * be scaled up.
 +         */
 +        c6  *= 6.0;
 +        c12 *= 12.0;
 +
 +        /* Do we need to apply full periodic boundary conditions? */
 +        if (fr->bMolPBC == TRUE)
 +        {
 +            fshift_index = pbc_dx_aiuc(pbc, x[ai], x[aj], dx);
 +        }
 +        else
 +        {
 +            fshift_index = CENTRAL;
 +            rvec_sub(x[ai], x[aj], dx);
 +        }
 +        r2           = norm2(dx);
 +
 +        if (r2 >= fr->tab14.r*fr->tab14.r)
 +        {
 +            if (warned_rlimit == FALSE)
 +            {
 +                nb_listed_warning_rlimit(x, ai, aj, global_atom_index, sqrt(r2), fr->tab14.r);
 +                warned_rlimit = TRUE;
 +            }
 +            continue;
 +        }
 +
 +        if (bFreeEnergy)
 +        {
 +            /* Currently free energy is only supported for F_LJ14, so no need to check for that if we got here */
 +            qqB              = md->chargeB[ai]*md->chargeB[aj]*fr->epsfac*fr->fudgeQQ;
 +            c6B              = iparams[itype].lj14.c6B*6.0;
 +            c12B             = iparams[itype].lj14.c12B*12.0;
 +
 +            fscal            = nb_free_energy_evaluate_single(r2, fr->sc_r_power, fr->sc_alphacoul, fr->sc_alphavdw,
 +                                                              fr->tab14.scale, fr->tab14.data, qq, c6, c12, qqB, c6B, c12B,
 +                                                              LFC, LFV, DLF, lfac_coul, lfac_vdw, dlfac_coul, dlfac_vdw,
 +                                                              fr->sc_sigma6_def, fr->sc_sigma6_min, sigma2_def, sigma2_min, &velec, &vvdw, dvdl);
 +        }
 +        else
 +        {
 +            /* Evaluate tabulated interaction without free energy */
 +            fscal            = nb_evaluate_single(r2, fr->tab14.scale, fr->tab14.data, qq, c6, c12, &velec, &vvdw);
 +        }
 +
 +        energygrp_elec[gid]  += velec;
 +        energygrp_vdw[gid]   += vvdw;
 +        svmul(fscal, dx, dx);
 +
 +        /* Add the forces */
 +        rvec_inc(f[ai], dx);
 +        rvec_dec(f[aj], dx);
 +
 +        if (g)
 +        {
 +            /* Correct the shift forces using the graph */
 +            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
 +            fshift_index = IVEC2IS(dt);
 +        }
 +        if (fshift_index != CENTRAL)
 +        {
 +            rvec_inc(fshift[fshift_index], dx);
 +            rvec_dec(fshift[CENTRAL], dx);
 +        }
 +    }
 +    return 0.0;
 +}
index fc82a2d455ba1a7e9ec7e49cf7e08f658a221404,0000000000000000000000000000000000000000..185e9c3ab1bdab605c97e7e426714f37fee16059
mode 100644,000000..100644
--- /dev/null
@@@ -1,9819 -1,0 +1,9844 @@@
-                                   gmx_bool bMaster, ivec npulse)
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <time.h>
 +#include <math.h>
 +#include <string.h>
 +#include <stdlib.h>
 +#include <assert.h>
 +
 +#include "typedefs.h"
 +#include "gromacs/utility/smalloc.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "vec.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "nrnb.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "constr.h"
 +#include "mdatoms.h"
 +#include "names.h"
 +#include "force.h"
 +#include "pme.h"
 +#include "mdrun.h"
 +#include "nsgrid.h"
 +#include "shellfc.h"
 +#include "mtop_util.h"
 +#include "gmx_ga2la.h"
 +#include "macros.h"
 +#include "nbnxn_search.h"
 +#include "bondf.h"
 +#include "gmx_omp_nthreads.h"
 +#include "gpu_utils.h"
 +
 +#include "gromacs/fileio/futil.h"
 +#include "gromacs/fileio/gmxfio.h"
 +#include "gromacs/fileio/pdbio.h"
 +#include "gromacs/timing/wallcycle.h"
 +#include "gromacs/utility/gmxmpi.h"
 +#include "gromacs/swap/swapcoords.h"
 +#include "gromacs/utility/qsort_threadsafe.h"
 +#include "gromacs/pulling/pull.h"
 +#include "gromacs/pulling/pull_rotation.h"
 +#include "gromacs/imd/imd.h"
 +
 +#define DDRANK(dd, rank)    (rank)
 +#define DDMASTERRANK(dd)   (dd->masterrank)
 +
 +typedef struct gmx_domdec_master
 +{
 +    /* The cell boundaries */
 +    real **cell_x;
 +    /* The global charge group division */
 +    int   *ncg;    /* Number of home charge groups for each node */
 +    int   *index;  /* Index of nnodes+1 into cg */
 +    int   *cg;     /* Global charge group index */
 +    int   *nat;    /* Number of home atoms for each node. */
 +    int   *ibuf;   /* Buffer for communication */
 +    rvec  *vbuf;   /* Buffer for state scattering and gathering */
 +} gmx_domdec_master_t;
 +
 +typedef struct
 +{
 +    /* The numbers of charge groups to send and receive for each cell
 +     * that requires communication, the last entry contains the total
 +     * number of atoms that needs to be communicated.
 +     */
 +    int  nsend[DD_MAXIZONE+2];
 +    int  nrecv[DD_MAXIZONE+2];
 +    /* The charge groups to send */
 +    int *index;
 +    int  nalloc;
 +    /* The atom range for non-in-place communication */
 +    int  cell2at0[DD_MAXIZONE];
 +    int  cell2at1[DD_MAXIZONE];
 +} gmx_domdec_ind_t;
 +
 +typedef struct
 +{
 +    int               np;       /* Number of grid pulses in this dimension */
 +    int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 +    gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 +    int               np_nalloc;
 +    gmx_bool          bInPlace; /* Can we communicate in place?            */
 +} gmx_domdec_comm_dim_t;
 +
 +typedef struct
 +{
 +    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 +    real     *cell_f;      /* State var.: cell boundaries, box relative      */
 +    real     *old_cell_f;  /* Temp. var.: old cell size                      */
 +    real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 +    real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 +    real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 +    real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 +    gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 +    real     *buf_ncd;     /* Temp. var.                                     */
 +} gmx_domdec_root_t;
 +
 +#define DD_NLOAD_MAX 9
 +
 +/* Here floats are accurate enough, since these variables
 + * only influence the load balancing, not the actual MD results.
 + */
 +typedef struct
 +{
 +    int    nload;
 +    float *load;
 +    float  sum;
 +    float  max;
 +    float  sum_m;
 +    float  cvol_min;
 +    float  mdf;
 +    float  pme;
 +    int    flags;
 +} gmx_domdec_load_t;
 +
 +typedef struct
 +{
 +    int  nsc;
 +    int  ind_gl;
 +    int  ind;
 +} gmx_cgsort_t;
 +
 +typedef struct
 +{
 +    gmx_cgsort_t *sort;
 +    gmx_cgsort_t *sort2;
 +    int           sort_nalloc;
 +    gmx_cgsort_t *sort_new;
 +    int           sort_new_nalloc;
 +    int          *ibuf;
 +    int           ibuf_nalloc;
 +} gmx_domdec_sort_t;
 +
 +typedef struct
 +{
 +    rvec *v;
 +    int   nalloc;
 +} vec_rvec_t;
 +
 +/* This enum determines the order of the coordinates.
 + * ddnatHOME and ddnatZONE should be first and second,
 + * the others can be ordered as wanted.
 + */
 +enum {
 +    ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 +};
 +
 +enum {
 +    edlbAUTO, edlbNO, edlbYES, edlbNR
 +};
 +const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 +
 +typedef struct
 +{
 +    int      dim;       /* The dimension                                          */
 +    gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 +    int      nslab;     /* The number of PME slabs in this dimension              */
 +    real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 +    int     *pp_min;    /* The minimum pp node location, size nslab               */
 +    int     *pp_max;    /* The maximum pp node location,size nslab                */
 +    int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 +} gmx_ddpme_t;
 +
 +typedef struct
 +{
 +    real min0;    /* The minimum bottom of this zone                        */
 +    real max1;    /* The maximum top of this zone                           */
 +    real min1;    /* The minimum top of this zone                           */
 +    real mch0;    /* The maximum bottom communicaton height for this zone   */
 +    real mch1;    /* The maximum top communicaton height for this zone      */
 +    real p1_0;    /* The bottom value of the first cell in this zone        */
 +    real p1_1;    /* The top value of the first cell in this zone           */
 +} gmx_ddzone_t;
 +
 +typedef struct
 +{
 +    gmx_domdec_ind_t ind;
 +    int             *ibuf;
 +    int              ibuf_nalloc;
 +    vec_rvec_t       vbuf;
 +    int              nsend;
 +    int              nat;
 +    int              nsend_zone;
 +} dd_comm_setup_work_t;
 +
 +typedef struct gmx_domdec_comm
 +{
 +    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 +     * unless stated otherwise.
 +     */
 +
 +    /* The number of decomposition dimensions for PME, 0: no PME */
 +    int         npmedecompdim;
 +    /* The number of nodes doing PME (PP/PME or only PME) */
 +    int         npmenodes;
 +    int         npmenodes_x;
 +    int         npmenodes_y;
 +    /* The communication setup including the PME only nodes */
 +    gmx_bool    bCartesianPP_PME;
 +    ivec        ntot;
 +    int         cartpmedim;
 +    int        *pmenodes;          /* size npmenodes                         */
 +    int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 +                                    * but with bCartesianPP_PME              */
 +    gmx_ddpme_t ddpme[2];
 +
 +    /* The DD particle-particle nodes only */
 +    gmx_bool bCartesianPP;
 +    int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 +
 +    /* The global charge groups */
 +    t_block cgs_gl;
 +
 +    /* Should we sort the cgs */
 +    int                nstSortCG;
 +    gmx_domdec_sort_t *sort;
 +
 +    /* Are there charge groups? */
 +    gmx_bool bCGs;
 +
 +    /* Are there bonded and multi-body interactions between charge groups? */
 +    gmx_bool bInterCGBondeds;
 +    gmx_bool bInterCGMultiBody;
 +
 +    /* Data for the optional bonded interaction atom communication range */
 +    gmx_bool  bBondComm;
 +    t_blocka *cglink;
 +    char     *bLocalCG;
 +
 +    /* The DLB option */
 +    int      eDLB;
 +    /* Are we actually using DLB? */
 +    gmx_bool bDynLoadBal;
 +
 +    /* Cell sizes for static load balancing, first index cartesian */
 +    real **slb_frac;
 +
 +    /* The width of the communicated boundaries */
 +    real     cutoff_mbody;
 +    real     cutoff;
 +    /* The minimum cell size (including triclinic correction) */
 +    rvec     cellsize_min;
 +    /* For dlb, for use with edlbAUTO */
 +    rvec     cellsize_min_dlb;
 +    /* The lower limit for the DD cell size with DLB */
 +    real     cellsize_limit;
 +    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 +    gmx_bool bVacDLBNoLimit;
 +
 +    /* With PME load balancing we set limits on DLB */
 +    gmx_bool bPMELoadBalDLBLimits;
 +    /* DLB needs to take into account that we want to allow this maximum
 +     * cut-off (for PME load balancing), this could limit cell boundaries.
 +     */
 +    real PMELoadBal_max_cutoff;
 +
 +    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 +    ivec tric_dir;
 +    /* box0 and box_size are required with dim's without pbc and -gcom */
 +    rvec box0;
 +    rvec box_size;
 +
 +    /* The cell boundaries */
 +    rvec cell_x0;
 +    rvec cell_x1;
 +
 +    /* The old location of the cell boundaries, to check cg displacements */
 +    rvec old_cell_x0;
 +    rvec old_cell_x1;
 +
 +    /* The communication setup and charge group boundaries for the zones */
 +    gmx_domdec_zones_t zones;
 +
 +    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 +     * cell boundaries of neighboring cells for dynamic load balancing.
 +     */
 +    gmx_ddzone_t zone_d1[2];
 +    gmx_ddzone_t zone_d2[2][2];
 +
 +    /* The coordinate/force communication setup and indices */
 +    gmx_domdec_comm_dim_t cd[DIM];
 +    /* The maximum number of cells to communicate with in one dimension */
 +    int                   maxpulse;
 +
 +    /* Which cg distribution is stored on the master node */
 +    int master_cg_ddp_count;
 +
 +    /* The number of cg's received from the direct neighbors */
 +    int  zone_ncg1[DD_MAXZONE];
 +
 +    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 +    int  nat[ddnatNR];
 +
 +    /* Array for signalling if atoms have moved to another domain */
 +    int  *moved;
 +    int   moved_nalloc;
 +
 +    /* Communication buffer for general use */
 +    int  *buf_int;
 +    int   nalloc_int;
 +
 +    /* Communication buffer for general use */
 +    vec_rvec_t vbuf;
 +
 +    /* Temporary storage for thread parallel communication setup */
 +    int                   nth;
 +    dd_comm_setup_work_t *dth;
 +
 +    /* Communication buffers only used with multiple grid pulses */
 +    int       *buf_int2;
 +    int        nalloc_int2;
 +    vec_rvec_t vbuf2;
 +
 +    /* Communication buffers for local redistribution */
 +    int  **cggl_flag;
 +    int    cggl_flag_nalloc[DIM*2];
 +    rvec **cgcm_state;
 +    int    cgcm_state_nalloc[DIM*2];
 +
 +    /* Cell sizes for dynamic load balancing */
 +    gmx_domdec_root_t **root;
 +    real               *cell_f_row;
 +    real                cell_f0[DIM];
 +    real                cell_f1[DIM];
 +    real                cell_f_max0[DIM];
 +    real                cell_f_min1[DIM];
 +
 +    /* Stuff for load communication */
 +    gmx_bool           bRecordLoad;
 +    gmx_domdec_load_t *load;
 +    int                nrank_gpu_shared;
 +#ifdef GMX_MPI
 +    MPI_Comm          *mpi_comm_load;
 +    MPI_Comm           mpi_comm_gpu_shared;
 +#endif
 +
 +    /* Maximum DLB scaling per load balancing step in percent */
 +    int dlb_scale_lim;
 +
 +    /* Cycle counters */
 +    float  cycl[ddCyclNr];
 +    int    cycl_n[ddCyclNr];
 +    float  cycl_max[ddCyclNr];
 +    /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 +    int    eFlop;
 +    double flop;
 +    int    flop_n;
 +    /* Have often have did we have load measurements */
 +    int    n_load_have;
 +    /* Have often have we collected the load measurements */
 +    int    n_load_collect;
 +
 +    /* Statistics */
 +    double sum_nat[ddnatNR-ddnatZONE];
 +    int    ndecomp;
 +    int    nload;
 +    double load_step;
 +    double load_sum;
 +    double load_max;
 +    ivec   load_lim;
 +    double load_mdf;
 +    double load_pme;
 +
 +    /* The last partition step */
 +    gmx_int64_t partition_step;
 +
 +    /* Debugging */
 +    int  nstDDDump;
 +    int  nstDDDumpGrid;
 +    int  DD_debug;
 +} gmx_domdec_comm_t;
 +
 +/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_CGIBS 2
 +
 +/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_FLAG_NRCG  65535
 +#define DD_FLAG_FW(d) (1<<(16+(d)*2))
 +#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 +
 +/* Zone permutation required to obtain consecutive charge groups
 + * for neighbor searching.
 + */
 +static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 +
 +/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 + * components see only j zones with that component 0.
 + */
 +
 +/* The DD zone order */
 +static const ivec dd_zo[DD_MAXZONE] =
 +{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 +
 +/* The 3D setup */
 +#define dd_z3n  8
 +#define dd_zp3n 4
 +static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 +
 +/* The 2D setup */
 +#define dd_z2n  4
 +#define dd_zp2n 2
 +static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 +
 +/* The 1D setup */
 +#define dd_z1n  2
 +#define dd_zp1n 1
 +static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 +
 +/* Factors used to avoid problems due to rounding issues */
 +#define DD_CELL_MARGIN       1.0001
 +#define DD_CELL_MARGIN2      1.00005
 +/* Factor to account for pressure scaling during nstlist steps */
 +#define DD_PRES_SCALE_MARGIN 1.02
 +
 +/* Allowed performance loss before we DLB or warn */
 +#define DD_PERF_LOSS 0.05
 +
 +#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 +
 +/* Use separate MPI send and receive commands
 + * when nnodes <= GMX_DD_NNODES_SENDRECV.
 + * This saves memory (and some copying for small nnodes).
 + * For high parallelization scatter and gather calls are used.
 + */
 +#define GMX_DD_NNODES_SENDRECV 4
 +
 +
 +/*
 +   #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 +
 +   static void index2xyz(ivec nc,int ind,ivec xyz)
 +   {
 +   xyz[XX] = ind % nc[XX];
 +   xyz[YY] = (ind / nc[XX]) % nc[YY];
 +   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 +   }
 + */
 +
 +/* This order is required to minimize the coordinate communication in PME
 + * which uses decomposition in the x direction.
 + */
 +#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 +
 +static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 +{
 +    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 +    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 +    xyz[ZZ] = ind % nc[ZZ];
 +}
 +
 +static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 +{
 +    int ddindex;
 +    int ddnodeid = -1;
 +
 +    ddindex = dd_index(dd->nc, c);
 +    if (dd->comm->bCartesianPP_PME)
 +    {
 +        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 +    }
 +    else if (dd->comm->bCartesianPP)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddnodeid = ddindex;
 +    }
 +
 +    return ddnodeid;
 +}
 +
 +static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 +{
 +    return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 +}
 +
 +int ddglatnr(gmx_domdec_t *dd, int i)
 +{
 +    int atnr;
 +
 +    if (dd == NULL)
 +    {
 +        atnr = i + 1;
 +    }
 +    else
 +    {
 +        if (i >= dd->comm->nat[ddnatNR-1])
 +        {
 +            gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 +        }
 +        atnr = dd->gatindex[i] + 1;
 +    }
 +
 +    return atnr;
 +}
 +
 +t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->cgs_gl;
 +}
 +
 +static void vec_rvec_init(vec_rvec_t *v)
 +{
 +    v->nalloc = 0;
 +    v->v      = NULL;
 +}
 +
 +static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 +{
 +    if (n > v->nalloc)
 +    {
 +        v->nalloc = over_alloc_dd(n);
 +        srenew(v->v, v->nalloc);
 +    }
 +}
 +
 +void dd_store_state(gmx_domdec_t *dd, t_state *state)
 +{
 +    int i;
 +
 +    if (state->ddp_count != dd->ddp_count)
 +    {
 +        gmx_incons("The state does not the domain decomposition state");
 +    }
 +
 +    state->ncg_gl = dd->ncg_home;
 +    if (state->ncg_gl > state->cg_gl_nalloc)
 +    {
 +        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 +        srenew(state->cg_gl, state->cg_gl_nalloc);
 +    }
 +    for (i = 0; i < state->ncg_gl; i++)
 +    {
 +        state->cg_gl[i] = dd->index_gl[i];
 +    }
 +
 +    state->ddp_count_cg_gl = dd->ddp_count;
 +}
 +
 +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->zones;
 +}
 +
 +void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 +                      int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int                 izone, d, dim;
 +
 +    zones = &dd->comm->zones;
 +
 +    izone = 0;
 +    while (icg >= zones->izone[izone].cg1)
 +    {
 +        izone++;
 +    }
 +
 +    if (izone == 0)
 +    {
 +        *jcg0 = icg;
 +    }
 +    else if (izone < zones->nizone)
 +    {
 +        *jcg0 = zones->izone[izone].jcg0;
 +    }
 +    else
 +    {
 +        gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 +                  icg, izone, zones->nizone);
 +    }
 +
 +    *jcg1 = zones->izone[izone].jcg1;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim         = dd->dim[d];
 +        shift0[dim] = zones->izone[izone].shift0[dim];
 +        shift1[dim] = zones->izone[izone].shift1[dim];
 +        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 +        {
 +            /* A conservative approach, this can be optimized */
 +            shift0[dim] -= 1;
 +            shift1[dim] += 1;
 +        }
 +    }
 +}
 +
 +int dd_natoms_vsite(gmx_domdec_t *dd)
 +{
 +    return dd->comm->nat[ddnatVSITE];
 +}
 +
 +void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 +{
 +    *at_start = dd->comm->nat[ddnatCON-1];
 +    *at_end   = dd->comm->nat[ddnatCON];
 +}
 +
 +void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 +    gmx_bool               bPBC, bScrew;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    nzone   = 1;
 +    nat_tot = dd->nat_home;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (bPBC)
 +        {
 +            copy_rvec(box[dd->dim[d]], shift);
 +        }
 +        cd = &comm->cd[d];
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            ind   = &cd->ind[p];
 +            index = ind->index;
 +            n     = 0;
 +            if (!bPBC)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        copy_rvec(x[j], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* We need to shift the coordinates */
 +                        rvec_add(x[j], shift, buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* Shift x */
 +                        buf[n][XX] = x[j][XX] + shift[XX];
 +                        /* Rotate y and z.
 +                         * This operation requires a special shift force
 +                         * treatment, which is performed in calc_vir.
 +                         */
 +                        buf[n][YY] = box[YY][YY] - x[j][YY];
 +                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 +                        n++;
 +                    }
 +                }
 +            }
 +
 +            if (cd->bInPlace)
 +            {
 +                rbuf = x + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = comm->vbuf2.v;
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_rvec(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(rbuf[j], x[i]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    rvec                  *buf, *sbuf;
 +    ivec                   vis;
 +    int                    is;
 +    gmx_bool               bPBC, bScrew;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    n       = 0;
 +    nzone   = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (fshift == NULL && !bScrew)
 +        {
 +            bPBC = FALSE;
 +        }
 +        /* Determine which shift vector we need */
 +        clear_ivec(vis);
 +        vis[dd->dim[d]] = 1;
 +        is              = IVEC2IS(vis);
 +
 +        cd = &comm->cd[d];
 +        for (p = cd->np-1; p >= 0; p--)
 +        {
 +            ind      = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = f + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = comm->vbuf2.v;
 +                j    = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(f[i], sbuf[j]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        rvec_inc(f[j], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        rvec_inc(f[j], buf[n]);
 +                        /* Add this force to the shift force */
 +                        rvec_inc(fshift[is], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* Rotate the force */
 +                        f[j][XX] += buf[n][XX];
 +                        f[j][YY] -= buf[n][YY];
 +                        f[j][ZZ] -= buf[n][ZZ];
 +                        if (fshift)
 +                        {
 +                            /* Add this force to the shift force */
 +                            rvec_inc(fshift[is], buf[n]);
 +                        }
 +                        n++;
 +                    }
 +                }
 +            }
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    real                  *buf, *rbuf;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = &comm->vbuf.v[0][0];
 +
 +    nzone   = 1;
 +    nat_tot = dd->nat_home;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        cd = &comm->cd[d];
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            ind   = &cd->ind[p];
 +            index = ind->index;
 +            n     = 0;
 +            for (i = 0; i < ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for (j = at0; j < at1; j++)
 +                {
 +                    buf[n] = v[j];
 +                    n++;
 +                }
 +            }
 +
 +            if (cd->bInPlace)
 +            {
 +                rbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = &comm->vbuf2.v[0][0];
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_real(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        v[i] = rbuf[j];
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    real                  *buf, *sbuf;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = &comm->vbuf.v[0][0];
 +
 +    n       = 0;
 +    nzone   = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        cd = &comm->cd[d];
 +        for (p = cd->np-1; p >= 0; p--)
 +        {
 +            ind      = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = &comm->vbuf2.v[0][0];
 +                j    = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        sbuf[j] = v[i];
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_real(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            for (i = 0; i < ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for (j = at0; j < at1; j++)
 +                {
 +                    v[j] += buf[n];
 +                    n++;
 +                }
 +            }
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 +{
 +    fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 +            d, i, j,
 +            zone->min0, zone->max1,
 +            zone->mch0, zone->mch0,
 +            zone->p1_0, zone->p1_1);
 +}
 +
 +
 +#define DDZONECOMM_MAXZONE  5
 +#define DDZONECOMM_BUFSIZE  3
 +
 +static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 +                               int ddimind, int direction,
 +                               gmx_ddzone_t *buf_s, int n_s,
 +                               gmx_ddzone_t *buf_r, int n_r)
 +{
 +#define ZBS  DDZONECOMM_BUFSIZE
 +    rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 +    rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 +    int  i;
 +
 +    for (i = 0; i < n_s; i++)
 +    {
 +        vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
 +        vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
 +        vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
 +        vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
 +        vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
 +        vbuf_s[i*ZBS+1][2] = 0;
 +        vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
 +        vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
 +        vbuf_s[i*ZBS+2][2] = 0;
 +    }
 +
 +    dd_sendrecv_rvec(dd, ddimind, direction,
 +                     vbuf_s, n_s*ZBS,
 +                     vbuf_r, n_r*ZBS);
 +
 +    for (i = 0; i < n_r; i++)
 +    {
 +        buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
 +        buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
 +        buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
 +        buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
 +        buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
 +        buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
 +        buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
 +    }
 +
 +#undef ZBS
 +}
 +
 +static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 +                          rvec cell_ns_x0, rvec cell_ns_x1)
 +{
 +    int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
 +    gmx_ddzone_t      *zp;
 +    gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
 +    rvec               extr_s[2], extr_r[2];
 +    rvec               dh;
 +    real               dist_d, c = 0, det;
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bPBC, bUse;
 +
 +    comm = dd->comm;
 +
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        dim      = dd->dim[d];
 +        zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 +        zp->min0 = cell_ns_x0[dim];
 +        zp->max1 = cell_ns_x1[dim];
 +        zp->min1 = cell_ns_x1[dim];
 +        zp->mch0 = cell_ns_x0[dim];
 +        zp->mch1 = cell_ns_x1[dim];
 +        zp->p1_0 = cell_ns_x0[dim];
 +        zp->p1_1 = cell_ns_x1[dim];
 +    }
 +
 +    for (d = dd->ndim-2; d >= 0; d--)
 +    {
 +        dim  = dd->dim[d];
 +        bPBC = (dim < ddbox->npbcdim);
 +
 +        /* Use an rvec to store two reals */
 +        extr_s[d][0] = comm->cell_f0[d+1];
 +        extr_s[d][1] = comm->cell_f1[d+1];
 +        extr_s[d][2] = comm->cell_f1[d+1];
 +
 +        pos = 0;
 +        /* Store the extremes in the backward sending buffer,
 +         * so the get updated separately from the forward communication.
 +         */
 +        for (d1 = d; d1 < dd->ndim-1; d1++)
 +        {
 +            /* We invert the order to be able to use the same loop for buf_e */
 +            buf_s[pos].min0 = extr_s[d1][1];
 +            buf_s[pos].max1 = extr_s[d1][0];
 +            buf_s[pos].min1 = extr_s[d1][2];
 +            buf_s[pos].mch0 = 0;
 +            buf_s[pos].mch1 = 0;
 +            /* Store the cell corner of the dimension we communicate along */
 +            buf_s[pos].p1_0 = comm->cell_x0[dim];
 +            buf_s[pos].p1_1 = 0;
 +            pos++;
 +        }
 +
 +        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 +        pos++;
 +
 +        if (dd->ndim == 3 && d == 0)
 +        {
 +            buf_s[pos] = comm->zone_d2[0][1];
 +            pos++;
 +            buf_s[pos] = comm->zone_d1[0];
 +            pos++;
 +        }
 +
 +        /* We only need to communicate the extremes
 +         * in the forward direction
 +         */
 +        npulse = comm->cd[d].np;
 +        if (bPBC)
 +        {
 +            /* Take the minimum to avoid double communication */
 +            npulse_min = min(npulse, dd->nc[dim]-1-npulse);
 +        }
 +        else
 +        {
 +            /* Without PBC we should really not communicate over
 +             * the boundaries, but implementing that complicates
 +             * the communication setup and therefore we simply
 +             * do all communication, but ignore some data.
 +             */
 +            npulse_min = npulse;
 +        }
 +        for (p = 0; p < npulse_min; p++)
 +        {
 +            /* Communicate the extremes forward */
 +            bUse = (bPBC || dd->ci[dim] > 0);
 +
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             extr_s+d, dd->ndim-d-1,
 +                             extr_r+d, dd->ndim-d-1);
 +
 +            if (bUse)
 +            {
 +                for (d1 = d; d1 < dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
 +                    extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
 +                    extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
 +                }
 +            }
 +        }
 +
 +        buf_size = pos;
 +        for (p = 0; p < npulse; p++)
 +        {
 +            /* Communicate all the zone information backward */
 +            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
 +
 +            dd_sendrecv_ddzone(dd, d, dddirBackward,
 +                               buf_s, buf_size,
 +                               buf_r, buf_size);
 +
 +            clear_rvec(dh);
 +            if (p > 0)
 +            {
 +                for (d1 = d+1; d1 < dd->ndim; d1++)
 +                {
 +                    /* Determine the decrease of maximum required
 +                     * communication height along d1 due to the distance along d,
 +                     * this avoids a lot of useless atom communication.
 +                     */
 +                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 +
 +                    if (ddbox->tric_dir[dim])
 +                    {
 +                        /* c is the off-diagonal coupling between the cell planes
 +                         * along directions d and d1.
 +                         */
 +                        c = ddbox->v[dim][dd->dim[d1]][dim];
 +                    }
 +                    else
 +                    {
 +                        c = 0;
 +                    }
 +                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
 +                    if (det > 0)
 +                    {
 +                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
 +                    }
 +                    else
 +                    {
 +                        /* A negative value signals out of range */
 +                        dh[d1] = -1;
 +                    }
 +                }
 +            }
 +
 +            /* Accumulate the extremes over all pulses */
 +            for (i = 0; i < buf_size; i++)
 +            {
 +                if (p == 0)
 +                {
 +                    buf_e[i] = buf_r[i];
 +                }
 +                else
 +                {
 +                    if (bUse)
 +                    {
 +                        buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
 +                        buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
 +                        buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
 +                    }
 +
 +                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
 +                    {
 +                        d1 = 1;
 +                    }
 +                    else
 +                    {
 +                        d1 = d + 1;
 +                    }
 +                    if (bUse && dh[d1] >= 0)
 +                    {
 +                        buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
 +                        buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
 +                    }
 +                }
 +                /* Copy the received buffer to the send buffer,
 +                 * to pass the data through with the next pulse.
 +                 */
 +                buf_s[i] = buf_r[i];
 +            }
 +            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
 +                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
 +            {
 +                /* Store the extremes */
 +                pos = 0;
 +
 +                for (d1 = d; d1 < dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
 +                    extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
 +                    extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
 +                    pos++;
 +                }
 +
 +                if (d == 1 || (d == 0 && dd->ndim == 3))
 +                {
 +                    for (i = d; i < 2; i++)
 +                    {
 +                        comm->zone_d2[1-d][i] = buf_e[pos];
 +                        pos++;
 +                    }
 +                }
 +                if (d == 0)
 +                {
 +                    comm->zone_d1[1] = buf_e[pos];
 +                    pos++;
 +                }
 +            }
 +        }
 +    }
 +
 +    if (dd->ndim >= 2)
 +    {
 +        dim = dd->dim[1];
 +        for (i = 0; i < 2; i++)
 +        {
 +            if (debug)
 +            {
 +                print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
 +            }
 +            cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 +            cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        dim = dd->dim[2];
 +        for (i = 0; i < 2; i++)
 +        {
 +            for (j = 0; j < 2; j++)
 +            {
 +                if (debug)
 +                {
 +                    print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
 +                }
 +                cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 +                cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
 +            }
 +        }
 +    }
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        comm->cell_f_max0[d] = extr_s[d-1][0];
 +        comm->cell_f_min1[d] = extr_s[d-1][1];
 +        if (debug)
 +        {
 +            fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
 +                    d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
 +        }
 +    }
 +}
 +
 +static void dd_collect_cg(gmx_domdec_t *dd,
 +                          t_state      *state_local)
 +{
 +    gmx_domdec_master_t *ma = NULL;
 +    int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
 +    t_block             *cgs_gl;
 +
 +    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
 +    {
 +        /* The master has the correct distribution */
 +        return;
 +    }
 +
 +    if (state_local->ddp_count == dd->ddp_count)
 +    {
 +        ncg_home = dd->ncg_home;
 +        cg       = dd->index_gl;
 +        nat_home = dd->nat_home;
 +    }
 +    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        ncg_home = state_local->ncg_gl;
 +        cg       = state_local->cg_gl;
 +        nat_home = 0;
 +        for (i = 0; i < ncg_home; i++)
 +        {
 +            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
 +        }
 +    }
 +    else
 +    {
 +        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
 +    }
 +
 +    buf2[0] = dd->ncg_home;
 +    buf2[1] = dd->nat_home;
 +    if (DDMASTER(dd))
 +    {
 +        ma   = dd->ma;
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    /* Collect the charge group and atom counts on the master */
 +    dd_gather(dd, 2*sizeof(int), buf2, ibuf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma->index[0] = 0;
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ncg[i]     = ma->ibuf[2*i];
 +            ma->nat[i]     = ma->ibuf[2*i+1];
 +            ma->index[i+1] = ma->index[i] + ma->ncg[i];
 +
 +        }
 +        /* Make byte counts and indices */
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "Initial charge group distribution: ");
 +            for (i = 0; i < dd->nnodes; i++)
 +            {
 +                fprintf(debug, " %d", ma->ncg[i]);
 +            }
 +            fprintf(debug, "\n");
 +        }
 +    }
 +
 +    /* Collect the charge group indices on the master */
 +    dd_gatherv(dd,
 +               dd->ncg_home*sizeof(int), dd->index_gl,
 +               DDMASTER(dd) ? ma->ibuf : NULL,
 +               DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +               DDMASTER(dd) ? ma->cg : NULL);
 +
 +    dd->comm->master_cg_ddp_count = state_local->ddp_count;
 +}
 +
 +static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
 +                                    rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +    t_block             *cgs_gl;
 +
 +    ma = dd->ma;
 +
 +    if (!DDMASTER(dd))
 +    {
 +#ifdef GMX_MPI
 +        MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
 +                 dd->rank, dd->mpi_comm_all);
 +#endif
 +    }
 +    else
 +    {
 +        /* Copy the master coordinates to the global array */
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +        {
 +            for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(lv[a++], v[c]);
 +            }
 +        }
 +
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf, nalloc);
 +                }
 +#ifdef GMX_MPI
 +                MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
 +                         n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
 +#endif
 +                a = 0;
 +                for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +                {
 +                    for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(buf[a++], v[c]);
 +                    }
 +                }
 +            }
 +        }
 +        sfree(buf);
 +    }
 +}
 +
 +static void get_commbuffer_counts(gmx_domdec_t *dd,
 +                                  int **counts, int **disps)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n;
 +
 +    ma = dd->ma;
 +
 +    /* Make the rvec count and displacment arrays */
 +    *counts  = ma->ibuf;
 +    *disps   = ma->ibuf + dd->nnodes;
 +    for (n = 0; n < dd->nnodes; n++)
 +    {
 +        (*counts)[n] = ma->nat[n]*sizeof(rvec);
 +        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
 +    }
 +}
 +
 +static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
 +                                   rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                 *rcounts = NULL, *disps = NULL;
 +    int                  n, i, c, a;
 +    rvec                *buf = NULL;
 +    t_block             *cgs_gl;
 +
 +    ma = dd->ma;
 +
 +    if (DDMASTER(dd))
 +    {
 +        get_commbuffer_counts(dd, &rcounts, &disps);
 +
 +        buf = ma->vbuf;
 +    }
 +
 +    dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        a = 0;
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +            {
 +                for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(buf[a++], v[c]);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +void dd_collect_vec(gmx_domdec_t *dd,
 +                    t_state *state_local, rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    dd_collect_cg(dd, state_local);
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_collect_vec_sendrecv(dd, lv, v);
 +    }
 +    else
 +    {
 +        dd_collect_vec_gatherv(dd, lv, v);
 +    }
 +}
 +
 +
 +void dd_collect_state(gmx_domdec_t *dd,
 +                      t_state *state_local, t_state *state)
 +{
 +    int est, i, j, nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            state->lambda[i] = state_local->lambda[i];
 +        }
 +        state->fep_state = state_local->fep_state;
 +        state->veta      = state_local->veta;
 +        state->vol0      = state_local->vol0;
 +        copy_mat(state_local->box, state->box);
 +        copy_mat(state_local->boxv, state->boxv);
 +        copy_mat(state_local->svir_prev, state->svir_prev);
 +        copy_mat(state_local->fvir_prev, state->fvir_prev);
 +        copy_mat(state_local->pres_prev, state->pres_prev);
 +
 +        for (i = 0; i < state_local->ngtc; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
 +                state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
 +            }
 +            state->therm_integral[i] = state_local->therm_integral[i];
 +        }
 +        for (i = 0; i < state_local->nnhpres; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
 +                state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    dd_collect_vec(dd, state_local, state_local->x, state->x);
 +                    break;
 +                case estV:
 +                    dd_collect_vec(dd, state_local, state_local->v, state->v);
 +                    break;
 +                case estSDX:
 +                    dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
 +                    break;
 +                case estCGP:
 +                    dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_collect_state");
 +            }
 +        }
 +    }
 +}
 +
 +static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
 +{
 +    int est;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
 +    }
 +
 +    state->nalloc = over_alloc_dd(nalloc);
 +
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    srenew(state->x, state->nalloc);
 +                    break;
 +                case estV:
 +                    srenew(state->v, state->nalloc);
 +                    break;
 +                case estSDX:
 +                    srenew(state->sd_X, state->nalloc);
 +                    break;
 +                case estCGP:
 +                    srenew(state->cg_p, state->nalloc);
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No reallocation required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_realloc_state");
 +            }
 +        }
 +    }
 +
 +    if (f != NULL)
 +    {
 +        srenew(*f, state->nalloc);
 +    }
 +}
 +
 +static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
 +                               int nalloc)
 +{
 +    if (nalloc > fr->cg_nalloc)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
 +        }
 +        fr->cg_nalloc = over_alloc_dd(nalloc);
 +        srenew(fr->cginfo, fr->cg_nalloc);
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            srenew(fr->cg_cm, fr->cg_nalloc);
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
 +    {
 +        /* We don't use charge groups, we use x in state to set up
 +         * the atom communication.
 +         */
 +        dd_realloc_state(state, f, nalloc);
 +    }
 +}
 +
 +static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
 +                                       rvec *v, rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf, nalloc);
 +                }
 +                /* Use lv as a temporary buffer */
 +                a = 0;
 +                for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +                {
 +                    for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(v[c], buf[a++]);
 +                    }
 +                }
 +                if (a != ma->nat[n])
 +                {
 +                    gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
 +                              a, ma->nat[n]);
 +                }
 +
 +#ifdef GMX_MPI
 +                MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
 +                         DDRANK(dd, n), n, dd->mpi_comm_all);
 +#endif
 +            }
 +        }
 +        sfree(buf);
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +        {
 +            for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(v[c], lv[a++]);
 +            }
 +        }
 +    }
 +    else
 +    {
 +#ifdef GMX_MPI
 +        MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
 +                 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
 +#endif
 +    }
 +}
 +
 +static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
 +                                       rvec *v, rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                 *scounts = NULL, *disps = NULL;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +
 +        get_commbuffer_counts(dd, &scounts, &disps);
 +
 +        buf = ma->vbuf;
 +        a   = 0;
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +            {
 +                for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(v[c], buf[a++]);
 +                }
 +            }
 +        }
 +    }
 +
 +    dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
 +}
 +
 +static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
 +{
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_distribute_vec_sendrecv(dd, cgs, v, lv);
 +    }
 +    else
 +    {
 +        dd_distribute_vec_scatterv(dd, cgs, v, lv);
 +    }
 +}
 +
 +static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
 +{
 +    int i;
 +    dd_bcast(dd, sizeof(int), &dfhist->bEquil);
 +    dd_bcast(dd, sizeof(int), &dfhist->nlambda);
 +    dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
 +
 +    if (dfhist->nlambda > 0)
 +    {
 +        int nlam = dfhist->nlambda;
 +        dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
 +
 +        for (i = 0; i < nlam; i++)
 +        {
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
 +        }
 +    }
 +}
 +
 +static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
 +                                t_state *state, t_state *state_local,
 +                                rvec **f)
 +{
 +    int  i, j, nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            state_local->lambda[i] = state->lambda[i];
 +        }
 +        state_local->fep_state = state->fep_state;
 +        state_local->veta      = state->veta;
 +        state_local->vol0      = state->vol0;
 +        copy_mat(state->box, state_local->box);
 +        copy_mat(state->box_rel, state_local->box_rel);
 +        copy_mat(state->boxv, state_local->boxv);
 +        copy_mat(state->svir_prev, state_local->svir_prev);
 +        copy_mat(state->fvir_prev, state_local->fvir_prev);
 +        copy_df_history(&state_local->dfhist, &state->dfhist);
 +        for (i = 0; i < state_local->ngtc; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
 +                state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
 +            }
 +            state_local->therm_integral[i] = state->therm_integral[i];
 +        }
 +        for (i = 0; i < state_local->nnhpres; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
 +                state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
 +    dd_bcast(dd, sizeof(int), &state_local->fep_state);
 +    dd_bcast(dd, sizeof(real), &state_local->veta);
 +    dd_bcast(dd, sizeof(real), &state_local->vol0);
 +    dd_bcast(dd, sizeof(state_local->box), state_local->box);
 +    dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
 +    dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
 +    dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
 +    dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
 +    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
 +    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
 +    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
 +    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
 +    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
 +
 +    /* communicate df_history -- required for restarting from checkpoint */
 +    dd_distribute_dfhist(dd, &state_local->dfhist);
 +
 +    if (dd->nat_home > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local, f, dd->nat_home);
 +    }
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +                case estX:
 +                    dd_distribute_vec(dd, cgs, state->x, state_local->x);
 +                    break;
 +                case estV:
 +                    dd_distribute_vec(dd, cgs, state->v, state_local->v);
 +                    break;
 +                case estSDX:
 +                    dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
 +                    break;
 +                case estCGP:
 +                    dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* Not implemented yet */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_distribute_state");
 +            }
 +        }
 +    }
 +}
 +
 +static char dim2char(int dim)
 +{
 +    char c = '?';
 +
 +    switch (dim)
 +    {
 +        case XX: c = 'X'; break;
 +        case YY: c = 'Y'; break;
 +        case ZZ: c = 'Z'; break;
 +        default: gmx_fatal(FARGS, "Unknown dim %d", dim);
 +    }
 +
 +    return c;
 +}
 +
 +static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
 +                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
 +{
 +    rvec   grid_s[2], *grid_r = NULL, cx, r;
 +    char   fname[STRLEN], format[STRLEN], buf[22];
 +    FILE  *out;
 +    int    a, i, d, z, y, x;
 +    matrix tric;
 +    real   vol;
 +
 +    copy_rvec(dd->comm->cell_x0, grid_s[0]);
 +    copy_rvec(dd->comm->cell_x1, grid_s[1]);
 +
 +    if (DDMASTER(dd))
 +    {
 +        snew(grid_r, 2*dd->nnodes);
 +    }
 +
 +    dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            for (i = 0; i < DIM; i++)
 +            {
 +                if (d == i)
 +                {
 +                    tric[d][i] = 1;
 +                }
 +                else
 +                {
 +                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
 +                    {
 +                        tric[d][i] = box[i][d]/box[i][i];
 +                    }
 +                    else
 +                    {
 +                        tric[d][i] = 0;
 +                    }
 +                }
 +            }
 +        }
 +        sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
 +        sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
 +        out = gmx_fio_fopen(fname, "w");
 +        gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
 +        a = 1;
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
 +            for (d = 0; d < DIM; d++)
 +            {
 +                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
 +            }
 +            for (z = 0; z < 2; z++)
 +            {
 +                for (y = 0; y < 2; y++)
 +                {
 +                    for (x = 0; x < 2; x++)
 +                    {
 +                        cx[XX] = grid_r[i*2+x][XX];
 +                        cx[YY] = grid_r[i*2+y][YY];
 +                        cx[ZZ] = grid_r[i*2+z][ZZ];
 +                        mvmul(tric, cx, r);
 +                        fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
 +                                ' ', 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
 +                    }
 +                }
 +            }
 +            for (d = 0; d < DIM; d++)
 +            {
 +                for (x = 0; x < 4; x++)
 +                {
 +                    switch (d)
 +                    {
 +                        case 0: y = 1 + i*8 + 2*x; break;
 +                        case 1: y = 1 + i*8 + 2*x - (x % 2); break;
 +                        case 2: y = 1 + i*8 + x; break;
 +                    }
 +                    fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
 +                }
 +            }
 +        }
 +        gmx_fio_fclose(out);
 +        sfree(grid_r);
 +    }
 +}
 +
 +void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
 +                  gmx_mtop_t *mtop, t_commrec *cr,
 +                  int natoms, rvec x[], matrix box)
 +{
 +    char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
 +    FILE         *out;
 +    int           i, ii, resnr, c;
 +    char         *atomname, *resname;
 +    real          b;
 +    gmx_domdec_t *dd;
 +
 +    dd = cr->dd;
 +    if (natoms == -1)
 +    {
 +        natoms = dd->comm->nat[ddnatVSITE];
 +    }
 +
 +    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
 +
 +    sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
 +    sprintf(format4, "%s%s\n", get_pdbformat4(), "%6.2f%6.2f");
 +
 +    out = gmx_fio_fopen(fname, "w");
 +
 +    fprintf(out, "TITLE     %s\n", title);
 +    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
 +    for (i = 0; i < natoms; i++)
 +    {
 +        ii = dd->gatindex[i];
 +        gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
 +        if (i < dd->comm->nat[ddnatZONE])
 +        {
 +            c = 0;
 +            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
 +            {
 +                c++;
 +            }
 +            b = c;
 +        }
 +        else if (i < dd->comm->nat[ddnatVSITE])
 +        {
 +            b = dd->comm->zones.n;
 +        }
 +        else
 +        {
 +            b = dd->comm->zones.n + 1;
 +        }
 +        fprintf(out, strlen(atomname) < 4 ? format : format4,
 +                "ATOM", (ii+1)%100000,
 +                atomname, resname, ' ', resnr%10000, ' ',
 +                10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
 +    }
 +    fprintf(out, "TER\n");
 +
 +    gmx_fio_fclose(out);
 +}
 +
 +real dd_cutoff_mbody(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                di;
 +    real               r;
 +
 +    comm = dd->comm;
 +
 +    r = -1;
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm->cutoff_mbody > 0)
 +        {
 +            r = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            /* cutoff_mbody=0 means we do not have DLB */
 +            r = comm->cellsize_min[dd->dim[0]];
 +            for (di = 1; di < dd->ndim; di++)
 +            {
 +                r = min(r, comm->cellsize_min[dd->dim[di]]);
 +            }
 +            if (comm->bBondComm)
 +            {
 +                r = max(r, comm->cutoff_mbody);
 +            }
 +            else
 +            {
 +                r = min(r, comm->cutoff);
 +            }
 +        }
 +    }
 +
 +    return r;
 +}
 +
 +real dd_cutoff_twobody(gmx_domdec_t *dd)
 +{
 +    real r_mb;
 +
 +    r_mb = dd_cutoff_mbody(dd);
 +
 +    return max(dd->comm->cutoff, r_mb);
 +}
 +
 +
 +static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
 +{
 +    int nc, ntot;
 +
 +    nc   = dd->nc[dd->comm->cartpmedim];
 +    ntot = dd->comm->ntot[dd->comm->cartpmedim];
 +    copy_ivec(coord, coord_pme);
 +    coord_pme[dd->comm->cartpmedim] =
 +        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 +}
 +
 +static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
 +{
 +    /* Here we assign a PME node to communicate with this DD node
 +     * by assuming that the major index of both is x.
 +     * We add cr->npmenodes/2 to obtain an even distribution.
 +     */
 +    return (ddindex*npme + npme/2)/ndd;
 +}
 +
 +static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 +{
 +    return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
 +}
 +
 +static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
 +{
 +    return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
 +}
 +
 +static int *dd_pmenodes(t_commrec *cr)
 +{
 +    int *pmenodes;
 +    int  n, i, p0, p1;
 +
 +    snew(pmenodes, cr->npmenodes);
 +    n = 0;
 +    for (i = 0; i < cr->dd->nnodes; i++)
 +    {
 +        p0 = cr_ddindex2pmeindex(cr, i);
 +        p1 = cr_ddindex2pmeindex(cr, i+1);
 +        if (i+1 == cr->dd->nnodes || p1 > p0)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
 +            }
 +            pmenodes[n] = i + 1 + n;
 +            n++;
 +        }
 +    }
 +
 +    return pmenodes;
 +}
 +
 +static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
 +{
 +    gmx_domdec_t *dd;
 +    ivec          coords, coords_pme, nc;
 +    int           slab;
 +
 +    dd = cr->dd;
 +    /*
 +       if (dd->comm->bCartesian) {
 +       gmx_ddindex2xyz(dd->nc,ddindex,coords);
 +       dd_coords2pmecoords(dd,coords,coords_pme);
 +       copy_ivec(dd->ntot,nc);
 +       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 +       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 +
 +       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 +       } else {
 +       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 +       }
 +     */
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
 +
 +    return slab;
 +}
 +
 +static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
 +{
 +    gmx_domdec_comm_t *comm;
 +    ivec               coords;
 +    int                ddindex, nodeid = -1;
 +
 +    comm = cr->dd->comm;
 +
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddindex = dd_index(cr->dd->nc, coords);
 +        if (comm->bCartesianPP)
 +        {
 +            nodeid = comm->ddindex2simnodeid[ddindex];
 +        }
 +        else
 +        {
 +            if (comm->pmenodes)
 +            {
 +                nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 +            }
 +            else
 +            {
 +                nodeid = ddindex;
 +            }
 +        }
 +    }
 +
 +    return nodeid;
 +}
 +
 +static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    ivec               coord, coord_pme;
 +    int                i;
 +    int                pmenode = -1;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    /* This assumes a uniform x domain decomposition grid cell size */
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 +        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            /* This is a PP node */
 +            dd_cart_coord2pmecoord(dd, coord, coord_pme);
 +            MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 +        }
 +#endif
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (sim_nodeid < dd->nnodes)
 +        {
 +            pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 +        }
 +    }
 +    else
 +    {
 +        /* This assumes DD cells with identical x coordinates
 +         * are numbered sequentially.
 +         */
 +        if (dd->comm->pmenodes == NULL)
 +        {
 +            if (sim_nodeid < dd->nnodes)
 +            {
 +                /* The DD index equals the nodeid */
 +                pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 +            }
 +        }
 +        else
 +        {
 +            i = 0;
 +            while (sim_nodeid > dd->comm->pmenodes[i])
 +            {
 +                i++;
 +            }
 +            if (sim_nodeid < dd->comm->pmenodes[i])
 +            {
 +                pmenode = dd->comm->pmenodes[i];
 +            }
 +        }
 +    }
 +
 +    return pmenode;
 +}
 +
 +void get_pme_nnodes(const gmx_domdec_t *dd,
 +                    int *npmenodes_x, int *npmenodes_y)
 +{
 +    if (dd != NULL)
 +    {
 +        *npmenodes_x = dd->comm->npmenodes_x;
 +        *npmenodes_y = dd->comm->npmenodes_y;
 +    }
 +    else
 +    {
 +        *npmenodes_x = 1;
 +        *npmenodes_y = 1;
 +    }
 +}
 +
 +gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
 +{
 +    gmx_bool bPMEOnlyNode;
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
 +    }
 +    else
 +    {
 +        bPMEOnlyNode = FALSE;
 +    }
 +
 +    return bPMEOnlyNode;
 +}
 +
 +void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
 +                     int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
 +{
 +    gmx_domdec_t *dd;
 +    int           x, y, z;
 +    ivec          coord, coord_pme;
 +
 +    dd = cr->dd;
 +
 +    snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 +
 +    *nmy_ddnodes = 0;
 +    for (x = 0; x < dd->nc[XX]; x++)
 +    {
 +        for (y = 0; y < dd->nc[YY]; y++)
 +        {
 +            for (z = 0; z < dd->nc[ZZ]; z++)
 +            {
 +                if (dd->comm->bCartesianPP_PME)
 +                {
 +                    coord[XX] = x;
 +                    coord[YY] = y;
 +                    coord[ZZ] = z;
 +                    dd_cart_coord2pmecoord(dd, coord, coord_pme);
 +                    if (dd->ci[XX] == coord_pme[XX] &&
 +                        dd->ci[YY] == coord_pme[YY] &&
 +                        dd->ci[ZZ] == coord_pme[ZZ])
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
 +                    }
 +                }
 +                else
 +                {
 +                    /* The slab corresponds to the nodeid in the PME group */
 +                    if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* The last PP-only node is the peer node */
 +    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Receive coordinates from PP nodes:");
 +        for (x = 0; x < *nmy_ddnodes; x++)
 +        {
 +            fprintf(debug, " %d", (*my_ddnodes)[x]);
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static gmx_bool receive_vir_ener(t_commrec *cr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                pmenode, coords[DIM], rank;
 +    gmx_bool           bReceive;
 +
 +    bReceive = TRUE;
 +    if (cr->npmenodes < cr->dd->nnodes)
 +    {
 +        comm = cr->dd->comm;
 +        if (comm->bCartesianPP_PME)
 +        {
 +            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +#ifdef GMX_MPI
 +            MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 +            coords[comm->cartpmedim]++;
 +            if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
 +            {
 +                MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 +                if (dd_simnode2pmenode(cr, rank) == pmenode)
 +                {
 +                    /* This is not the last PP node for pmenode */
 +                    bReceive = FALSE;
 +                }
 +            }
 +#endif
 +        }
 +        else
 +        {
 +            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +            if (cr->sim_nodeid+1 < cr->nnodes &&
 +                dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
 +            {
 +                /* This is not the last PP node for pmenode */
 +                bReceive = FALSE;
 +            }
 +        }
 +    }
 +
 +    return bReceive;
 +}
 +
 +static void set_zones_ncg_home(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int                 i;
 +
 +    zones = &dd->comm->zones;
 +
 +    zones->cg_range[0] = 0;
 +    for (i = 1; i < zones->n+1; i++)
 +    {
 +        zones->cg_range[i] = dd->ncg_home;
 +    }
 +    /* zone_ncg1[0] should always be equal to ncg_home */
 +    dd->comm->zone_ncg1[0] = dd->ncg_home;
 +}
 +
 +static void rebuild_cgindex(gmx_domdec_t *dd,
 +                            const int *gcgs_index, t_state *state)
 +{
 +    int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
 +
 +    ind        = state->cg_gl;
 +    dd_cg_gl   = dd->index_gl;
 +    cgindex    = dd->cgindex;
 +    nat        = 0;
 +    cgindex[0] = nat;
 +    for (i = 0; i < state->ncg_gl; i++)
 +    {
 +        cgindex[i]  = nat;
 +        cg_gl       = ind[i];
 +        dd_cg_gl[i] = cg_gl;
 +        nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
 +    }
 +    cgindex[i] = nat;
 +
 +    dd->ncg_home = state->ncg_gl;
 +    dd->nat_home = nat;
 +
 +    set_zones_ncg_home(dd);
 +}
 +
 +static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
 +{
 +    while (cg >= cginfo_mb->cg_end)
 +    {
 +        cginfo_mb++;
 +    }
 +
 +    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
 +}
 +
 +static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
 +                          t_forcerec *fr, char *bLocalCG)
 +{
 +    cginfo_mb_t *cginfo_mb;
 +    int         *cginfo;
 +    int          cg;
 +
 +    if (fr != NULL)
 +    {
 +        cginfo_mb = fr->cginfo_mb;
 +        cginfo    = fr->cginfo;
 +
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
 +        }
 +    }
 +
 +    if (bLocalCG != NULL)
 +    {
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            bLocalCG[index_gl[cg]] = TRUE;
 +        }
 +    }
 +}
 +
 +static void make_dd_indices(gmx_domdec_t *dd,
 +                            const int *gcgs_index, int cg_start)
 +{
 +    int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
 +    int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
 +    gmx_ga2la_t *ga2la;
 +    char        *bLocalCG;
 +    gmx_bool     bCGs;
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +
 +    if (dd->nat_tot > dd->gatindex_nalloc)
 +    {
 +        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
 +        srenew(dd->gatindex, dd->gatindex_nalloc);
 +    }
 +
 +    nzone      = dd->comm->zones.n;
 +    zone2cg    = dd->comm->zones.cg_range;
 +    zone_ncg1  = dd->comm->zone_ncg1;
 +    index_gl   = dd->index_gl;
 +    gatindex   = dd->gatindex;
 +    bCGs       = dd->comm->bCGs;
 +
 +    if (zone2cg[1] != dd->ncg_home)
 +    {
 +        gmx_incons("dd->ncg_zone is not up to date");
 +    }
 +
 +    /* Make the local to global and global to local atom index */
 +    a = dd->cgindex[cg_start];
 +    for (zone = 0; zone < nzone; zone++)
 +    {
 +        if (zone == 0)
 +        {
 +            cg0 = cg_start;
 +        }
 +        else
 +        {
 +            cg0 = zone2cg[zone];
 +        }
 +        cg1    = zone2cg[zone+1];
 +        cg1_p1 = cg0 + zone_ncg1[zone];
 +
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            zone1 = zone;
 +            if (cg >= cg1_p1)
 +            {
 +                /* Signal that this cg is from more than one pulse away */
 +                zone1 += nzone;
 +            }
 +            cg_gl = index_gl[cg];
 +            if (bCGs)
 +            {
 +                for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
 +                {
 +                    gatindex[a] = a_gl;
 +                    ga2la_set(dd->ga2la, a_gl, a, zone1);
 +                    a++;
 +                }
 +            }
 +            else
 +            {
 +                gatindex[a] = cg_gl;
 +                ga2la_set(dd->ga2la, cg_gl, a, zone1);
 +                a++;
 +            }
 +        }
 +    }
 +}
 +
 +static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
 +                          const char *where)
 +{
 +    int ncg, i, ngl, nerr;
 +
 +    nerr = 0;
 +    if (bLocalCG == NULL)
 +    {
 +        return nerr;
 +    }
 +    for (i = 0; i < dd->ncg_tot; i++)
 +    {
 +        if (!bLocalCG[dd->index_gl[i]])
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
 +            nerr++;
 +        }
 +    }
 +    ngl = 0;
 +    for (i = 0; i < ncg_sys; i++)
 +    {
 +        if (bLocalCG[i])
 +        {
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->ncg_tot)
 +    {
 +        fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
 +        nerr++;
 +    }
 +
 +    return nerr;
 +}
 +
 +static void check_index_consistency(gmx_domdec_t *dd,
 +                                    int natoms_sys, int ncg_sys,
 +                                    const char *where)
 +{
 +    int   nerr, ngl, i, a, cell;
 +    int  *have;
 +
 +    nerr = 0;
 +
 +    if (dd->comm->DD_debug > 1)
 +    {
 +        snew(have, natoms_sys);
 +        for (a = 0; a < dd->nat_tot; a++)
 +        {
 +            if (have[dd->gatindex[a]] > 0)
 +            {
 +                fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
 +            }
 +            else
 +            {
 +                have[dd->gatindex[a]] = a + 1;
 +            }
 +        }
 +        sfree(have);
 +    }
 +
 +    snew(have, dd->nat_tot);
 +
 +    ngl  = 0;
 +    for (i = 0; i < natoms_sys; i++)
 +    {
 +        if (ga2la_get(dd->ga2la, i, &a, &cell))
 +        {
 +            if (a >= dd->nat_tot)
 +            {
 +                fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
 +                nerr++;
 +            }
 +            else
 +            {
 +                have[a] = 1;
 +                if (dd->gatindex[a] != i)
 +                {
 +                    fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
 +                    nerr++;
 +                }
 +            }
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->nat_tot)
 +    {
 +        fprintf(stderr,
 +                "DD node %d, %s: %d global atom indices, %d local atoms\n",
 +                dd->rank, where, ngl, dd->nat_tot);
 +    }
 +    for (a = 0; a < dd->nat_tot; a++)
 +    {
 +        if (have[a] == 0)
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: local atom %d, global %d has no global index\n",
 +                    dd->rank, where, a+1, dd->gatindex[a]+1);
 +        }
 +    }
 +    sfree(have);
 +
 +    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
 +
 +    if (nerr > 0)
 +    {
 +        gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
 +                  dd->rank, where, nerr);
 +    }
 +}
 +
 +static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
 +{
 +    int   i;
 +    char *bLocalCG;
 +
 +    if (a_start == 0)
 +    {
 +        /* Clear the whole list without searching */
 +        ga2la_clear(dd->ga2la);
 +    }
 +    else
 +    {
 +        for (i = a_start; i < dd->nat_tot; i++)
 +        {
 +            ga2la_del(dd->ga2la, dd->gatindex[i]);
 +        }
 +    }
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +    if (bLocalCG)
 +    {
 +        for (i = cg_start; i < dd->ncg_tot; i++)
 +        {
 +            bLocalCG[dd->index_gl[i]] = FALSE;
 +        }
 +    }
 +
 +    dd_clear_local_vsite_indices(dd);
 +
 +    if (dd->constraints)
 +    {
 +        dd_clear_local_constraint_indices(dd);
 +    }
 +}
 +
 +/* This function should be used for moving the domain boudaries during DLB,
 + * for obtaining the minimum cell size. It checks the initially set limit
 + * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
 + * and, possibly, a longer cut-off limit set for PME load balancing.
 + */
 +static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
 +{
 +    real cellsize_min;
 +
 +    cellsize_min = comm->cellsize_min[dim];
 +
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        /* The cut-off might have changed, e.g. by PME load balacning,
 +         * from the value used to set comm->cellsize_min, so check it.
 +         */
 +        cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
 +
 +        if (comm->bPMELoadBalDLBLimits)
 +        {
 +            /* Check for the cut-off limit set by the PME load balancing */
 +            cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
 +        }
 +    }
 +
 +    return cellsize_min;
 +}
 +
 +static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
 +                            int dim_ind)
 +{
 +    real grid_jump_limit;
 +
 +    /* The distance between the boundaries of cells at distance
 +     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
 +     * and by the fact that cells should not be shifted by more than
 +     * half their size, such that cg's only shift by one cell
 +     * at redecomposition.
 +     */
 +    grid_jump_limit = comm->cellsize_limit;
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        if (comm->bPMELoadBalDLBLimits)
 +        {
 +            cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
 +        }
 +        grid_jump_limit = max(grid_jump_limit,
 +                              cutoff/comm->cd[dim_ind].np);
 +    }
 +
 +    return grid_jump_limit;
 +}
 +
 +static gmx_bool check_grid_jump(gmx_int64_t     step,
 +                                gmx_domdec_t   *dd,
 +                                real            cutoff,
 +                                gmx_ddbox_t    *ddbox,
 +                                gmx_bool        bFatal)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim;
 +    real               limit, bfac;
 +    gmx_bool           bInvalid;
 +
 +    bInvalid = FALSE;
 +
 +    comm = dd->comm;
 +
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        dim   = dd->dim[d];
 +        limit = grid_jump_limit(comm, cutoff, d);
 +        bfac  = ddbox->box_size[dim];
 +        if (ddbox->tric_dir[dim])
 +        {
 +            bfac *= ddbox->skew_fac[dim];
 +        }
 +        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
 +                                                              (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
 +        {
 +            bInvalid = TRUE;
 +
 +            if (bFatal)
 +            {
 +                char buf[22];
 +
 +                /* This error should never be triggered under normal
 +                 * circumstances, but you never know ...
 +                 */
 +                gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
 +                          gmx_step_str(step, buf),
 +                          dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +            }
 +        }
 +    }
 +
 +    return bInvalid;
 +}
 +
 +static int dd_load_count(gmx_domdec_comm_t *comm)
 +{
 +    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
 +}
 +
 +static float dd_force_load(gmx_domdec_comm_t *comm)
 +{
 +    float load;
 +
 +    if (comm->eFlop)
 +    {
 +        load = comm->flop;
 +        if (comm->eFlop > 1)
 +        {
 +            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
 +        }
 +    }
 +    else
 +    {
 +        load = comm->cycl[ddCyclF];
 +        if (comm->cycl_n[ddCyclF] > 1)
 +        {
 +            /* Subtract the maximum of the last n cycle counts
 +             * to get rid of possible high counts due to other sources,
 +             * for instance system activity, that would otherwise
 +             * affect the dynamic load balancing.
 +             */
 +            load -= comm->cycl_max[ddCyclF];
 +        }
 +
 +#ifdef GMX_MPI
 +        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
 +        {
 +            float gpu_wait, gpu_wait_sum;
 +
 +            gpu_wait = comm->cycl[ddCyclWaitGPU];
 +            if (comm->cycl_n[ddCyclF] > 1)
 +            {
 +                /* We should remove the WaitGPU time of the same MD step
 +                 * as the one with the maximum F time, since the F time
 +                 * and the wait time are not independent.
 +                 * Furthermore, the step for the max F time should be chosen
 +                 * the same on all ranks that share the same GPU.
 +                 * But to keep the code simple, we remove the average instead.
 +                 * The main reason for artificially long times at some steps
 +                 * is spurious CPU activity or MPI time, so we don't expect
 +                 * that changes in the GPU wait time matter a lot here.
 +                 */
 +                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
 +            }
 +            /* Sum the wait times over the ranks that share the same GPU */
 +            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
 +                          comm->mpi_comm_gpu_shared);
 +            /* Replace the wait time by the average over the ranks */
 +            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
 +        }
 +#endif
 +    }
 +
 +    return load;
 +}
 +
 +static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                i;
 +
 +    comm = dd->comm;
 +
 +    snew(*dim_f, dd->nc[dim]+1);
 +    (*dim_f)[0] = 0;
 +    for (i = 1; i < dd->nc[dim]; i++)
 +    {
 +        if (comm->slb_frac[dim])
 +        {
 +            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
 +        }
 +        else
 +        {
 +            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
 +        }
 +    }
 +    (*dim_f)[dd->nc[dim]] = 1;
 +}
 +
 +static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
 +{
 +    int  pmeindex, slab, nso, i;
 +    ivec xyz;
 +
 +    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
 +    {
 +        ddpme->dim = YY;
 +    }
 +    else
 +    {
 +        ddpme->dim = dimind;
 +    }
 +    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
 +
 +    ddpme->nslab = (ddpme->dim == 0 ?
 +                    dd->comm->npmenodes_x :
 +                    dd->comm->npmenodes_y);
 +
 +    if (ddpme->nslab <= 1)
 +    {
 +        return;
 +    }
 +
 +    nso = dd->comm->npmenodes/ddpme->nslab;
 +    /* Determine for each PME slab the PP location range for dimension dim */
 +    snew(ddpme->pp_min, ddpme->nslab);
 +    snew(ddpme->pp_max, ddpme->nslab);
 +    for (slab = 0; slab < ddpme->nslab; slab++)
 +    {
 +        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
 +        ddpme->pp_max[slab] = 0;
 +    }
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ddindex2xyz(dd->nc, i, xyz);
 +        /* For y only use our y/z slab.
 +         * This assumes that the PME x grid size matches the DD grid size.
 +         */
 +        if (dimind == 0 || xyz[XX] == dd->ci[XX])
 +        {
 +            pmeindex = ddindex2pmeindex(dd, i);
 +            if (dimind == 0)
 +            {
 +                slab = pmeindex/nso;
 +            }
 +            else
 +            {
 +                slab = pmeindex % ddpme->nslab;
 +            }
 +            ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
 +            ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
 +        }
 +    }
 +
 +    set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
 +}
 +
 +int dd_pme_maxshift_x(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == XX)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +int dd_pme_maxshift_y(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == YY)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
 +    {
 +        return dd->comm->ddpme[1].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
 +                             gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                nc, ns, s;
 +    int               *xmin, *xmax;
 +    real               range, pme_boundary;
 +    int                sh;
 +
 +    comm = dd->comm;
 +    nc   = dd->nc[ddpme->dim];
 +    ns   = ddpme->nslab;
 +
 +    if (!ddpme->dim_match)
 +    {
 +        /* PP decomposition is not along dim: the worst situation */
 +        sh = ns/2;
 +    }
 +    else if (ns <= 3 || (bUniform && ns == nc))
 +    {
 +        /* The optimal situation */
 +        sh = 1;
 +    }
 +    else
 +    {
 +        /* We need to check for all pme nodes which nodes they
 +         * could possibly need to communicate with.
 +         */
 +        xmin = ddpme->pp_min;
 +        xmax = ddpme->pp_max;
 +        /* Allow for atoms to be maximally 2/3 times the cut-off
 +         * out of their DD cell. This is a reasonable balance between
 +         * between performance and support for most charge-group/cut-off
 +         * combinations.
 +         */
 +        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
 +        /* Avoid extra communication when we are exactly at a boundary */
 +        range *= 0.999;
 +
 +        sh = 1;
 +        for (s = 0; s < ns; s++)
 +        {
 +            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
 +            pme_boundary = (real)s/ns;
 +            while (sh+1 < ns &&
 +                   ((s-(sh+1) >= 0 &&
 +                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
 +                    (s-(sh+1) <  0 &&
 +                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
 +            {
 +                sh++;
 +            }
 +            pme_boundary = (real)(s+1)/ns;
 +            while (sh+1 < ns &&
 +                   ((s+(sh+1) <  ns &&
 +                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
 +                    (s+(sh+1) >= ns &&
 +                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
 +            {
 +                sh++;
 +            }
 +        }
 +    }
 +
 +    ddpme->maxshift = sh;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "PME slab communication range for dim %d is %d\n",
 +                ddpme->dim, ddpme->maxshift);
 +    }
 +}
 +
 +static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int d, dim;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        if (dim < ddbox->nboundeddim &&
 +            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
 +            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
 +        {
 +            gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
 +                      dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
 +                      dd->nc[dim], dd->comm->cellsize_limit);
 +        }
 +    }
 +}
 +
++enum { setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY };
++
++/* Set the domain boundaries. Use for static (or no) load balancing,
++ * and also for the starting state for dynamic load balancing.
++ * setmode determine if and where the boundaries are stored, use enum above.
++ * Returns the number communication pulses in npulse.
++ */
 +static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
-             if (bMaster)
-             {
-                 for (j = 0; j < dd->nc[d]+1; j++)
-                 {
-                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
-                 }
-             }
-             else
++                                  int setmode, ivec npulse)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, j;
 +    rvec               cellsize_min;
 +    real              *cell_x, cell_dx, cellsize;
 +
 +    comm = dd->comm;
 +
 +    for (d = 0; d < DIM; d++)
 +    {
 +        cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
 +        npulse[d]       = 1;
 +        if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
 +        {
 +            /* Uniform grid */
 +            cell_dx = ddbox->box_size[d]/dd->nc[d];
-                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
-                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
++            switch (setmode)
 +            {
-             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
++                case setcellsizeslbMASTER:
++                    for (j = 0; j < dd->nc[d]+1; j++)
++                    {
++                        dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
++                    }
++                    break;
++                case setcellsizeslbLOCAL:
++                    comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
++                    comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
++                    break;
++                default:
++                    break;
 +            }
 +            cellsize = cell_dx*ddbox->skew_fac[d];
-             if (bMaster)
++            while (cellsize*npulse[d] < comm->cutoff)
 +            {
 +                npulse[d]++;
 +            }
 +            cellsize_min[d] = cellsize;
 +        }
 +        else
 +        {
 +            /* Statically load balanced grid */
 +            /* Also when we are not doing a master distribution we determine
 +             * all cell borders in a loop to obtain identical values
 +             * to the master distribution case and to determine npulse.
 +             */
-             if (!bMaster)
++            if (setmode == setcellsizeslbMASTER)
 +            {
 +                cell_x = dd->ma->cell_x[d];
 +            }
 +            else
 +            {
 +                snew(cell_x, dd->nc[d]+1);
 +            }
 +            cell_x[0] = ddbox->box0[d];
 +            for (j = 0; j < dd->nc[d]; j++)
 +            {
 +                cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
 +                cell_x[j+1] = cell_x[j] + cell_dx;
 +                cellsize    = cell_dx*ddbox->skew_fac[d];
 +                while (cellsize*npulse[d] < comm->cutoff &&
 +                       npulse[d] < dd->nc[d]-1)
 +                {
 +                    npulse[d]++;
 +                }
 +                cellsize_min[d] = min(cellsize_min[d], cellsize);
 +            }
-             gmx_fatal_collective(FARGS, NULL, dd,
-                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
-                                  dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
-                                  comm->cutoff,
-                                  dd->nc[d], dd->nc[d],
-                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
++            if (setmode == setcellsizeslbLOCAL)
 +            {
 +                comm->cell_x0[d] = cell_x[dd->ci[d]];
 +                comm->cell_x1[d] = cell_x[dd->ci[d]+1];
++            }
++            if (setmode != setcellsizeslbMASTER)
++            {
 +                sfree(cell_x);
 +            }
 +        }
 +        /* The following limitation is to avoid that a cell would receive
 +         * some of its own home charge groups back over the periodic boundary.
 +         * Double charge groups cause trouble with the global indices.
 +         */
 +        if (d < ddbox->npbcdim &&
 +            dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
 +        {
-         set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse);
++            char error_string[STRLEN];
++
++            sprintf(error_string,
++                     "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
++                    dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
++                    comm->cutoff,
++                    dd->nc[d], dd->nc[d],
++                    dd->nnodes > dd->nc[d] ? "cells" : "processors");
++
++            if (setmode == setcellsizeslbLOCAL)
++            {
++                gmx_fatal_collective(FARGS, NULL, dd, error_string);
++            }
++            else
++            {
++                gmx_fatal(FARGS, error_string);
++            }
 +        }
 +    }
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        copy_rvec(cellsize_min, comm->cellsize_min);
 +    }
 +
 +    for (d = 0; d < comm->npmedecompdim; d++)
 +    {
 +        set_pme_maxshift(dd, &comm->ddpme[d],
 +                         comm->slb_frac[dd->dim[d]] == NULL, ddbox,
 +                         comm->ddpme[d].slb_dim_f);
 +    }
 +}
 +
 +
 +static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
 +                                                  int d, int dim, gmx_domdec_root_t *root,
 +                                                  gmx_ddbox_t *ddbox,
 +                                                  gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ncd, i, j, nmin, nmin_old;
 +    gmx_bool           bLimLo, bLimHi;
 +    real              *cell_size;
 +    real               fac, halfway, cellsize_limit_f_i, region_size;
 +    gmx_bool           bPBC, bLastHi = FALSE;
 +    int                nrange[] = {range[0], range[1]};
 +
 +    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
 +
 +    comm = dd->comm;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
 +    }
 +
 +    /* First we need to check if the scaling does not make cells
 +     * smaller than the smallest allowed size.
 +     * We need to do this iteratively, since if a cell is too small,
 +     * it needs to be enlarged, which makes all the other cells smaller,
 +     * which could in turn make another cell smaller than allowed.
 +     */
 +    for (i = range[0]; i < range[1]; i++)
 +    {
 +        root->bCellMin[i] = FALSE;
 +    }
 +    nmin = 0;
 +    do
 +    {
 +        nmin_old = nmin;
 +        /* We need the total for normalization */
 +        fac = 0;
 +        for (i = range[0]; i < range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                fac += cell_size[i];
 +            }
 +        }
 +        fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
 +        /* Determine the cell boundaries */
 +        for (i = range[0]; i < range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                cell_size[i] *= fac;
 +                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
 +                {
 +                    cellsize_limit_f_i = 0;
 +                }
 +                else
 +                {
 +                    cellsize_limit_f_i = cellsize_limit_f;
 +                }
 +                if (cell_size[i] < cellsize_limit_f_i)
 +                {
 +                    root->bCellMin[i] = TRUE;
 +                    cell_size[i]      = cellsize_limit_f_i;
 +                    nmin++;
 +                }
 +            }
 +            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
 +        }
 +    }
 +    while (nmin > nmin_old);
 +
 +    i            = range[1]-1;
 +    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
 +    /* For this check we should not use DD_CELL_MARGIN,
 +     * but a slightly smaller factor,
 +     * since rounding could get use below the limit.
 +     */
 +    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
 +    {
 +        char buf[22];
 +        gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
 +                  gmx_step_str(step, buf),
 +                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
 +                  ncd, comm->cellsize_min[dim]);
 +    }
 +
 +    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
 +
 +    if (!bUniform)
 +    {
 +        /* Check if the boundary did not displace more than halfway
 +         * each of the cells it bounds, as this could cause problems,
 +         * especially when the differences between cell sizes are large.
 +         * If changes are applied, they will not make cells smaller
 +         * than the cut-off, as we check all the boundaries which
 +         * might be affected by a change and if the old state was ok,
 +         * the cells will at most be shrunk back to their old size.
 +         */
 +        for (i = range[0]+1; i < range[1]; i++)
 +        {
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
 +            if (root->cell_f[i] < halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for (j = i+1; j < range[1]; j++)
 +                {
 +                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
 +                    {
 +                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
 +                    }
 +                }
 +            }
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
 +            if (root->cell_f[i] > halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for (j = i-1; j >= range[0]+1; j--)
 +                {
 +                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
 +                    {
 +                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* nrange is defined as [lower, upper) range for new call to enforce_limits */
 +    /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
 +     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
 +     * for a and b nrange is used */
 +    if (d > 0)
 +    {
 +        /* Take care of the staggering of the cell boundaries */
 +        if (bUniform)
 +        {
 +            for (i = range[0]; i < range[1]; i++)
 +            {
 +                root->cell_f_max0[i] = root->cell_f[i];
 +                root->cell_f_min1[i] = root->cell_f[i+1];
 +            }
 +        }
 +        else
 +        {
 +            for (i = range[0]+1; i < range[1]; i++)
 +            {
 +                bLimLo = (root->cell_f[i] < root->bound_min[i]);
 +                bLimHi = (root->cell_f[i] > root->bound_max[i]);
 +                if (bLimLo && bLimHi)
 +                {
 +                    /* Both limits violated, try the best we can */
 +                    /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
 +                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
 +                    nrange[0]       = range[0];
 +                    nrange[1]       = i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    nrange[0] = i;
 +                    nrange[1] = range[1];
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    return;
 +                }
 +                else if (bLimLo)
 +                {
 +                    /* root->cell_f[i] = root->bound_min[i]; */
 +                    nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
 +                    bLastHi   = FALSE;
 +                }
 +                else if (bLimHi && !bLastHi)
 +                {
 +                    bLastHi = TRUE;
 +                    if (nrange[1] < range[1])   /* found a LimLo before */
 +                    {
 +                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                        nrange[0] = nrange[1];
 +                    }
 +                    root->cell_f[i] = root->bound_max[i];
 +                    nrange[1]       = i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                    nrange[0] = i;
 +                    nrange[1] = range[1];
 +                }
 +            }
 +            if (nrange[1] < range[1])   /* found last a LimLo */
 +            {
 +                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                nrange[0] = nrange[1];
 +                nrange[1] = range[1];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +            else if (nrange[0] > range[0]) /* found at least one LimHi */
 +            {
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
 +                                       int d, int dim, gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                       gmx_bool bUniform, gmx_int64_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ncd, d1, i, j, pos;
 +    real              *cell_size;
 +    real               load_aver, load_i, imbalance, change, change_max, sc;
 +    real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
 +    real               change_limit;
 +    real               relax = 0.5;
 +    gmx_bool           bPBC;
 +    int                range[] = { 0, 0 };
 +
 +    comm = dd->comm;
 +
 +    /* Convert the maximum change from the input percentage to a fraction */
 +    change_limit = comm->dlb_scale_lim*0.01;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    /* Store the original boundaries */
 +    for (i = 0; i < ncd+1; i++)
 +    {
 +        root->old_cell_f[i] = root->cell_f[i];
 +    }
 +    if (bUniform)
 +    {
 +        for (i = 0; i < ncd; i++)
 +        {
 +            cell_size[i] = 1.0/ncd;
 +        }
 +    }
 +    else if (dd_load_count(comm))
 +    {
 +        load_aver  = comm->load[d].sum_m/ncd;
 +        change_max = 0;
 +        for (i = 0; i < ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change     = -relax*imbalance;
 +            change_max = max(change_max, max(change, -change));
 +        }
 +        /* Limit the amount of scaling.
 +         * We need to use the same rescaling for all cells in one row,
 +         * otherwise the load balancing might not converge.
 +         */
 +        sc = relax;
 +        if (change_max > change_limit)
 +        {
 +            sc *= change_limit/change_max;
 +        }
 +        for (i = 0; i < ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change       = -sc*imbalance;
 +            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
 +        }
 +    }
 +
 +    cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
 +    cellsize_limit_f *= DD_CELL_MARGIN;
 +    dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
 +    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
 +    if (ddbox->tric_dir[dim])
 +    {
 +        cellsize_limit_f /= ddbox->skew_fac[dim];
 +        dist_min_f       /= ddbox->skew_fac[dim];
 +    }
 +    if (bDynamicBox && d > 0)
 +    {
 +        dist_min_f *= DD_PRES_SCALE_MARGIN;
 +    }
 +    if (d > 0 && !bUniform)
 +    {
 +        /* Make sure that the grid is not shifted too much */
 +        for (i = 1; i < ncd; i++)
 +        {
 +            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
 +            {
 +                gmx_incons("Inconsistent DD boundary staggering limits!");
 +            }
 +            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
 +            space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
 +            if (space > 0)
 +            {
 +                root->bound_min[i] += 0.5*space;
 +            }
 +            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
 +            space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
 +            if (space < 0)
 +            {
 +                root->bound_max[i] += 0.5*space;
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug,
 +                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
 +                        d, i,
 +                        root->cell_f_max0[i-1] + dist_min_f,
 +                        root->bound_min[i], root->cell_f[i], root->bound_max[i],
 +                        root->cell_f_min1[i] - dist_min_f);
 +            }
 +        }
 +    }
 +    range[1]          = ncd;
 +    root->cell_f[0]   = 0;
 +    root->cell_f[ncd] = 1;
 +    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
 +
 +
 +    /* After the checks above, the cells should obey the cut-off
 +     * restrictions, but it does not hurt to check.
 +     */
 +    for (i = 0; i < ncd; i++)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
 +                    dim, i, root->cell_f[i], root->cell_f[i+1]);
 +        }
 +
 +        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
 +            root->cell_f[i+1] - root->cell_f[i] <
 +            cellsize_limit_f/DD_CELL_MARGIN)
 +        {
 +            char buf[22];
 +            fprintf(stderr,
 +                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
 +                    gmx_step_str(step, buf), dim2char(dim), i,
 +                    (root->cell_f[i+1] - root->cell_f[i])
 +                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
 +        }
 +    }
 +
 +    pos = ncd + 1;
 +    /* Store the cell boundaries of the lower dimensions at the end */
 +    for (d1 = 0; d1 < d; d1++)
 +    {
 +        root->cell_f[pos++] = comm->cell_f0[d1];
 +        root->cell_f[pos++] = comm->cell_f1[d1];
 +    }
 +
 +    if (d < comm->npmedecompdim)
 +    {
 +        /* The master determines the maximum shift for
 +         * the coordinate communication between separate PME nodes.
 +         */
 +        set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
 +    }
 +    root->cell_f[pos++] = comm->ddpme[0].maxshift;
 +    if (d >= 1)
 +    {
 +        root->cell_f[pos++] = comm->ddpme[1].maxshift;
 +    }
 +}
 +
 +static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
 +                                             gmx_ddbox_t *ddbox, int dimind)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim;
 +
 +    comm = dd->comm;
 +
 +    /* Set the cell dimensions */
 +    dim                = dd->dim[dimind];
 +    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
 +    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
 +    if (dim >= ddbox->nboundeddim)
 +    {
 +        comm->cell_x0[dim] += ddbox->box0[dim];
 +        comm->cell_x1[dim] += ddbox->box0[dim];
 +    }
 +}
 +
 +static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                         int d, int dim, real *cell_f_row,
 +                                         gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d1, dim1, pos;
 +
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    /* Each node would only need to know two fractions,
 +     * but it is probably cheaper to broadcast the whole array.
 +     */
 +    MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
 +              0, comm->mpi_comm_load[d]);
 +#endif
 +    /* Copy the fractions for this dimension from the buffer */
 +    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
 +    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
 +    /* The whole array was communicated, so set the buffer position */
 +    pos = dd->nc[dim] + 1;
 +    for (d1 = 0; d1 <= d; d1++)
 +    {
 +        if (d1 < d)
 +        {
 +            /* Copy the cell fractions of the lower dimensions */
 +            comm->cell_f0[d1] = cell_f_row[pos++];
 +            comm->cell_f1[d1] = cell_f_row[pos++];
 +        }
 +        relative_to_absolute_cell_bounds(dd, ddbox, d1);
 +    }
 +    /* Convert the communicated shift from float to int */
 +    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    if (d >= 1)
 +    {
 +        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    }
 +}
 +
 +static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
 +                                         gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                         gmx_bool bUniform, gmx_int64_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim, d1;
 +    gmx_bool           bRowMember, bRowRoot;
 +    real              *cell_f_row;
 +
 +    comm = dd->comm;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim        = dd->dim[d];
 +        bRowMember = TRUE;
 +        bRowRoot   = TRUE;
 +        for (d1 = d; d1 < dd->ndim; d1++)
 +        {
 +            if (dd->ci[dd->dim[d1]] > 0)
 +            {
 +                if (d1 != d)
 +                {
 +                    bRowMember = FALSE;
 +                }
 +                bRowRoot = FALSE;
 +            }
 +        }
 +        if (bRowMember)
 +        {
 +            if (bRowRoot)
 +            {
 +                set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
 +                                           ddbox, bDynamicBox, bUniform, step);
 +                cell_f_row = comm->root[d]->cell_f;
 +            }
 +            else
 +            {
 +                cell_f_row = comm->cell_f_row;
 +            }
 +            distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
 +        }
 +    }
 +}
 +
 +static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int d;
 +
 +    /* This function assumes the box is static and should therefore
 +     * not be called when the box has changed since the last
 +     * call to dd_partition_system.
 +     */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        relative_to_absolute_cell_bounds(dd, ddbox, d);
 +    }
 +}
 +
 +
 +
 +static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                  gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
 +                                  gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim;
 +
 +    comm = dd->comm;
 +
 +    if (bDoDLB)
 +    {
 +        wallcycle_start(wcycle, ewcDDCOMMBOUND);
 +        set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
 +        wallcycle_stop(wcycle, ewcDDCOMMBOUND);
 +    }
 +    else if (bDynamicBox)
 +    {
 +        set_dd_cell_sizes_dlb_nochange(dd, ddbox);
 +    }
 +
 +    /* Set the dimensions for which no DD is used */
 +    for (dim = 0; dim < DIM; dim++)
 +    {
 +        if (dd->nc[dim] == 1)
 +        {
 +            comm->cell_x0[dim] = 0;
 +            comm->cell_x1[dim] = ddbox->box_size[dim];
 +            if (dim >= ddbox->nboundeddim)
 +            {
 +                comm->cell_x0[dim] += ddbox->box0[dim];
 +                comm->cell_x1[dim] += ddbox->box0[dim];
 +            }
 +        }
 +    }
 +}
 +
 +static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
 +{
 +    int                    d, np, i;
 +    gmx_domdec_comm_dim_t *cd;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        cd = &dd->comm->cd[d];
 +        np = npulse[dd->dim[d]];
 +        if (np > cd->np_nalloc)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
 +                        dim2char(dd->dim[d]), np);
 +            }
 +            if (DDMASTER(dd) && cd->np_nalloc > 0)
 +            {
 +                fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
 +            }
 +            srenew(cd->ind, np);
 +            for (i = cd->np_nalloc; i < np; i++)
 +            {
 +                cd->ind[i].index  = NULL;
 +                cd->ind[i].nalloc = 0;
 +            }
 +            cd->np_nalloc = np;
 +        }
 +        cd->np = np;
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes(gmx_domdec_t *dd,
 +                              gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                              gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
 +                              gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d;
 +    ivec               npulse;
 +
 +    comm = dd->comm;
 +
 +    /* Copy the old cell boundaries for the cg displacement check */
 +    copy_rvec(comm->cell_x0, comm->old_cell_x0);
 +    copy_rvec(comm->cell_x1, comm->old_cell_x1);
 +
 +    if (comm->bDynLoadBal)
 +    {
 +        if (DDMASTER(dd))
 +        {
 +            check_box_size(dd, ddbox);
 +        }
 +        set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
 +    }
 +    else
 +    {
-         set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse);
++        set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
 +        realloc_comm_ind(dd, npulse);
 +    }
 +
 +    if (debug)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
 +                    d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
 +        }
 +    }
 +}
 +
 +static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,
 +                                  rvec cell_ns_x0, rvec cell_ns_x1,
 +                                  gmx_int64_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim_ind, dim;
 +
 +    comm = dd->comm;
 +
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Without PBC we don't have restrictions on the outer cells */
 +        if (!(dim >= ddbox->npbcdim &&
 +              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
 +            comm->bDynLoadBal &&
 +            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
 +            comm->cellsize_min[dim])
 +        {
 +            char buf[22];
 +            gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
 +                      gmx_step_str(step, buf), dim2char(dim),
 +                      comm->cell_x1[dim] - comm->cell_x0[dim],
 +                      ddbox->skew_fac[dim],
 +                      dd->comm->cellsize_min[dim],
 +                      dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +        }
 +    }
 +
 +    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 +    {
 +        /* Communicate the boundaries and update cell_ns_x0/1 */
 +        dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
 +        if (dd->bGridJump && dd->ndim > 1)
 +        {
 +            check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
 +        }
 +    }
 +}
 +
 +static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
 +{
 +    if (YY < npbcdim)
 +    {
 +        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
 +    }
 +    else
 +    {
 +        tcm[YY][XX] = 0;
 +    }
 +    if (ZZ < npbcdim)
 +    {
 +        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
 +        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
 +    }
 +    else
 +    {
 +        tcm[ZZ][XX] = 0;
 +        tcm[ZZ][YY] = 0;
 +    }
 +}
 +
 +static void check_screw_box(matrix box)
 +{
 +    /* Mathematical limitation */
 +    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
 +    {
 +        gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
 +    }
 +
 +    /* Limitation due to the asymmetry of the eighth shell method */
 +    if (box[ZZ][YY] != 0)
 +    {
 +        gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
 +    }
 +}
 +
 +static void distribute_cg(FILE *fplog, gmx_int64_t step,
 +                          matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
 +                          gmx_domdec_t *dd)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                **tmp_ind = NULL, *tmp_nalloc = NULL;
 +    int                  i, icg, j, k, k0, k1, d, npbcdim;
 +    matrix               tcm;
 +    rvec                 box_size, cg_cm;
 +    ivec                 ind;
 +    real                 nrcg, inv_ncg, pos_d;
 +    atom_id             *cgindex;
 +    gmx_bool             bUnbounded, bScrew;
 +
 +    ma = dd->ma;
 +
 +    if (tmp_ind == NULL)
 +    {
 +        snew(tmp_nalloc, dd->nnodes);
 +        snew(tmp_ind, dd->nnodes);
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
 +            snew(tmp_ind[i], tmp_nalloc[i]);
 +        }
 +    }
 +
 +    /* Clear the count */
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ma->ncg[i] = 0;
 +        ma->nat[i] = 0;
 +    }
 +
 +    make_tric_corr_matrix(dd->npbcdim, box, tcm);
 +
 +    cgindex = cgs->index;
 +
 +    /* Compute the center of geometry for all charge groups */
 +    for (icg = 0; icg < cgs->nr; icg++)
 +    {
 +        k0      = cgindex[icg];
 +        k1      = cgindex[icg+1];
 +        nrcg    = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(pos[k0], cg_cm);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +
 +            clear_rvec(cg_cm);
 +            for (k = k0; (k < k1); k++)
 +            {
 +                rvec_inc(cg_cm, pos[k]);
 +            }
 +            for (d = 0; (d < DIM); d++)
 +            {
 +                cg_cm[d] *= inv_ncg;
 +            }
 +        }
 +        /* Put the charge group in the box and determine the cell index */
 +        for (d = DIM-1; d >= 0; d--)
 +        {
 +            pos_d = cg_cm[d];
 +            if (d < dd->npbcdim)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                if (tric_dir[d] && dd->nc[d] > 1)
 +                {
 +                    /* Use triclinic coordintates for this dimension */
 +                    for (j = d+1; j < DIM; j++)
 +                    {
 +                        pos_d += cg_cm[j]*tcm[j][d];
 +                    }
 +                }
 +                while (pos_d >= box[d][d])
 +                {
 +                    pos_d -= box[d][d];
 +                    rvec_dec(cg_cm, box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_dec(pos[k], box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +                while (pos_d < 0)
 +                {
 +                    pos_d += box[d][d];
 +                    rvec_inc(cg_cm, box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_inc(pos[k], box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +            }
 +            /* This could be done more efficiently */
 +            ind[d] = 0;
 +            while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
 +            {
 +                ind[d]++;
 +            }
 +        }
 +        i = dd_index(dd->nc, ind);
 +        if (ma->ncg[i] == tmp_nalloc[i])
 +        {
 +            tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
 +            srenew(tmp_ind[i], tmp_nalloc[i]);
 +        }
 +        tmp_ind[i][ma->ncg[i]] = icg;
 +        ma->ncg[i]++;
 +        ma->nat[i] += cgindex[icg+1] - cgindex[icg];
 +    }
 +
 +    k1 = 0;
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ma->index[i] = k1;
 +        for (k = 0; k < ma->ncg[i]; k++)
 +        {
 +            ma->cg[k1++] = tmp_ind[i][k];
 +        }
 +    }
 +    ma->index[dd->nnodes] = k1;
 +
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        sfree(tmp_ind[i]);
 +    }
 +    sfree(tmp_ind);
 +    sfree(tmp_nalloc);
 +
 +    if (fplog)
 +    {
 +        char buf[22];
 +        fprintf(fplog, "Charge group distribution at step %s:",
 +                gmx_step_str(step, buf));
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            fprintf(fplog, " %d", ma->ncg[i]);
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +}
 +
 +static void get_cg_distribution(FILE *fplog, gmx_int64_t step, gmx_domdec_t *dd,
 +                                t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
 +                                rvec pos[])
 +{
 +    gmx_domdec_master_t *ma = NULL;
 +    ivec                 npulse;
 +    int                  i, cg_gl;
 +    int                 *ibuf, buf2[2] = { 0, 0 };
 +    gmx_bool             bMaster = DDMASTER(dd);
++
 +    if (bMaster)
 +    {
 +        ma = dd->ma;
 +
 +        if (dd->bScrewPBC)
 +        {
 +            check_screw_box(box);
 +        }
 +
-         set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
++        set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
 +
 +        distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[2*i]   = ma->ncg[i];
 +            ma->ibuf[2*i+1] = ma->nat[i];
 +        }
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
 +
 +    dd->ncg_home = buf2[0];
 +    dd->nat_home = buf2[1];
 +    dd->ncg_tot  = dd->ncg_home;
 +    dd->nat_tot  = dd->nat_home;
 +    if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
 +    {
 +        dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(dd->index_gl, dd->cg_nalloc);
 +        srenew(dd->cgindex, dd->cg_nalloc+1);
 +    }
 +    if (bMaster)
 +    {
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +    }
 +
 +    dd_scatterv(dd,
 +                DDMASTER(dd) ? ma->ibuf : NULL,
 +                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +                DDMASTER(dd) ? ma->cg : NULL,
 +                dd->ncg_home*sizeof(int), dd->index_gl);
 +
 +    /* Determine the home charge group sizes */
 +    dd->cgindex[0] = 0;
 +    for (i = 0; i < dd->ncg_home; i++)
 +    {
 +        cg_gl            = dd->index_gl[i];
 +        dd->cgindex[i+1] =
 +            dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Home charge groups:\n");
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            fprintf(debug, " %d", dd->index_gl[i]);
 +            if (i % 10 == 9)
 +            {
 +                fprintf(debug, "\n");
 +            }
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static int compact_and_copy_vec_at(int ncg, int *move,
 +                                   int *cgindex,
 +                                   int nvec, int vec,
 +                                   rvec *src, gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m, icg, i, i0, i1, nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +
 +    home_pos = 0;
 +
 +    for (m = 0; m < DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +
 +    i0 = 0;
 +    for (icg = 0; icg < ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m  = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                for (i = i0; i < i1; i++)
 +                {
 +                    copy_rvec(src[i], src[home_pos++]);
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Copy to the communication buffer */
 +            nrcg        = i1 - i0;
 +            pos_vec[m] += 1 + vec*nrcg;
 +            for (i = i0; i < i1; i++)
 +            {
 +                copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
 +            }
 +            pos_vec[m] += (nvec - vec - 1)*nrcg;
 +        }
 +        if (!bCompact)
 +        {
 +            home_pos += i1 - i0;
 +        }
 +        i0 = i1;
 +    }
 +
 +    return home_pos;
 +}
 +
 +static int compact_and_copy_vec_cg(int ncg, int *move,
 +                                   int *cgindex,
 +                                   int nvec, rvec *src, gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m, icg, i0, i1, nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +
 +    home_pos = 0;
 +
 +    for (m = 0; m < DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +
 +    i0 = 0;
 +    for (icg = 0; icg < ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m  = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                copy_rvec(src[icg], src[home_pos++]);
 +            }
 +        }
 +        else
 +        {
 +            nrcg = i1 - i0;
 +            /* Copy to the communication buffer */
 +            copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
 +            pos_vec[m] += 1 + nrcg*nvec;
 +        }
 +        i0 = i1;
 +    }
 +    if (!bCompact)
 +    {
 +        home_pos = ncg;
 +    }
 +
 +    return home_pos;
 +}
 +
 +static int compact_ind(int ncg, int *move,
 +                       int *index_gl, int *cgindex,
 +                       int *gatindex,
 +                       gmx_ga2la_t ga2la, char *bLocalCG,
 +                       int *cginfo)
 +{
 +    int cg, nat, a0, a1, a, a_gl;
 +    int home_pos;
 +
 +    home_pos = 0;
 +    nat      = 0;
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        a0 = cgindex[cg];
 +        a1 = cgindex[cg+1];
 +        if (move[cg] == -1)
 +        {
 +            /* Compact the home arrays in place.
 +             * Anything that can be done here avoids access to global arrays.
 +             */
 +            cgindex[home_pos] = nat;
 +            for (a = a0; a < a1; a++)
 +            {
 +                a_gl          = gatindex[a];
 +                gatindex[nat] = a_gl;
 +                /* The cell number stays 0, so we don't need to set it */
 +                ga2la_change_la(ga2la, a_gl, nat);
 +                nat++;
 +            }
 +            index_gl[home_pos] = index_gl[cg];
 +            cginfo[home_pos]   = cginfo[cg];
 +            /* The charge group remains local, so bLocalCG does not change */
 +            home_pos++;
 +        }
 +        else
 +        {
 +            /* Clear the global indices */
 +            for (a = a0; a < a1; a++)
 +            {
 +                ga2la_del(ga2la, gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +        }
 +    }
 +    cgindex[home_pos] = nat;
 +
 +    return home_pos;
 +}
 +
 +static void clear_and_mark_ind(int ncg, int *move,
 +                               int *index_gl, int *cgindex, int *gatindex,
 +                               gmx_ga2la_t ga2la, char *bLocalCG,
 +                               int *cell_index)
 +{
 +    int cg, a0, a1, a;
 +
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            a0 = cgindex[cg];
 +            a1 = cgindex[cg+1];
 +            /* Clear the global indices */
 +            for (a = a0; a < a1; a++)
 +            {
 +                ga2la_del(ga2la, gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +            /* Signal that this cg has moved using the ns cell index.
 +             * Here we set it to -1. fill_grid will change it
 +             * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
 +             */
 +            cell_index[cg] = -1;
 +        }
 +    }
 +}
 +
 +static void print_cg_move(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_int64_t step, int cg, int dim, int dir,
 +                          gmx_bool bHaveLimitdAndCMOld, real limitd,
 +                          rvec cm_old, rvec cm_new, real pos_d)
 +{
 +    gmx_domdec_comm_t *comm;
 +    char               buf[22];
 +
 +    comm = dd->comm;
 +
 +    fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
 +                ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
 +    }
 +    else
 +    {
 +        fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
 +                ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
 +    }
 +    fprintf(fplog, "distance out of cell %f\n",
 +            dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
 +                cm_old[XX], cm_old[YY], cm_old[ZZ]);
 +    }
 +    fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
 +            cm_new[XX], cm_new[YY], cm_new[ZZ]);
 +    fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
 +    fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->cell_x0[dim], comm->cell_x1[dim]);
 +}
 +
 +static void cg_move_error(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_int64_t step, int cg, int dim, int dir,
 +                          gmx_bool bHaveLimitdAndCMOld, real limitd,
 +                          rvec cm_old, rvec cm_new, real pos_d)
 +{
 +    if (fplog)
 +    {
 +        print_cg_move(fplog, dd, step, cg, dim, dir,
 +                      bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
 +    }
 +    print_cg_move(stderr, dd, step, cg, dim, dir,
 +                  bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
 +    gmx_fatal(FARGS,
 +              "A charge group moved too far between two domain decomposition steps\n"
 +              "This usually means that your system is not well equilibrated");
 +}
 +
 +static void rotate_state_atom(t_state *state, int a)
 +{
 +    int est;
 +
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    /* Rotate the complete state; for a rectangular box only */
 +                    state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
 +                    state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
 +                    break;
 +                case estV:
 +                    state->v[a][YY] = -state->v[a][YY];
 +                    state->v[a][ZZ] = -state->v[a][ZZ];
 +                    break;
 +                case estSDX:
 +                    state->sd_X[a][YY] = -state->sd_X[a][YY];
 +                    state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
 +                    break;
 +                case estCGP:
 +                    state->cg_p[a][YY] = -state->cg_p[a][YY];
 +                    state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* These are distances, so not affected by rotation */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in rotate_state_atom");
 +            }
 +        }
 +    }
 +}
 +
 +static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
 +{
 +    if (natoms > comm->moved_nalloc)
 +    {
 +        /* Contents should be preserved here */
 +        comm->moved_nalloc = over_alloc_dd(natoms);
 +        srenew(comm->moved, comm->moved_nalloc);
 +    }
 +
 +    return comm->moved;
 +}
 +
 +static void calc_cg_move(FILE *fplog, gmx_int64_t step,
 +                         gmx_domdec_t *dd,
 +                         t_state *state,
 +                         ivec tric_dir, matrix tcm,
 +                         rvec cell_x0, rvec cell_x1,
 +                         rvec limitd, rvec limit0, rvec limit1,
 +                         const int *cgindex,
 +                         int cg_start, int cg_end,
 +                         rvec *cg_cm,
 +                         int *move)
 +{
 +    int      npbcdim;
 +    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
 +    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
 +    int      flag;
 +    gmx_bool bScrew;
 +    ivec     dev;
 +    real     inv_ncg, pos_d;
 +    rvec     cm_new;
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for (cg = cg_start; cg < cg_end; cg++)
 +    {
 +        k0   = cgindex[cg];
 +        k1   = cgindex[cg+1];
 +        nrcg = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(state->x[k0], cm_new);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +
 +            clear_rvec(cm_new);
 +            for (k = k0; (k < k1); k++)
 +            {
 +                rvec_inc(cm_new, state->x[k]);
 +            }
 +            for (d = 0; (d < DIM); d++)
 +            {
 +                cm_new[d] = inv_ncg*cm_new[d];
 +            }
 +        }
 +
 +        clear_ivec(dev);
 +        /* Do pbc and check DD cell boundary crossings */
 +        for (d = DIM-1; d >= 0; d--)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                /* Determine the location of this cg in lattice coordinates */
 +                pos_d = cm_new[d];
 +                if (tric_dir[d])
 +                {
 +                    for (d2 = d+1; d2 < DIM; d2++)
 +                    {
 +                        pos_d += cm_new[d2]*tcm[d2][d];
 +                    }
 +                }
 +                /* Put the charge group in the triclinic unit-cell */
 +                if (pos_d >= cell_x1[d])
 +                {
 +                    if (pos_d >= limit1[d])
 +                    {
 +                        cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
 +                                      cg_cm[cg], cm_new, pos_d);
 +                    }
 +                    dev[d] = 1;
 +                    if (dd->ci[d] == dd->nc[d] - 1)
 +                    {
 +                        rvec_dec(cm_new, state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for (k = k0; (k < k1); k++)
 +                        {
 +                            rvec_dec(state->x[k], state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state, k);
 +                            }
 +                        }
 +                    }
 +                }
 +                else if (pos_d < cell_x0[d])
 +                {
 +                    if (pos_d < limit0[d])
 +                    {
 +                        cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
 +                                      cg_cm[cg], cm_new, pos_d);
 +                    }
 +                    dev[d] = -1;
 +                    if (dd->ci[d] == 0)
 +                    {
 +                        rvec_inc(cm_new, state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for (k = k0; (k < k1); k++)
 +                        {
 +                            rvec_inc(state->x[k], state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state, k);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            else if (d < npbcdim)
 +            {
 +                /* Put the charge group in the rectangular unit-cell */
 +                while (cm_new[d] >= state->box[d][d])
 +                {
 +                    rvec_dec(cm_new, state->box[d]);
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_dec(state->x[k], state->box[d]);
 +                    }
 +                }
 +                while (cm_new[d] < 0)
 +                {
 +                    rvec_inc(cm_new, state->box[d]);
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_inc(state->x[k], state->box[d]);
 +                    }
 +                }
 +            }
 +        }
 +
 +        copy_rvec(cm_new, cg_cm[cg]);
 +
 +        /* Determine where this cg should go */
 +        flag = 0;
 +        mc   = -1;
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            dim = dd->dim[d];
 +            if (dev[dim] == 1)
 +            {
 +                flag |= DD_FLAG_FW(d);
 +                if (mc == -1)
 +                {
 +                    mc = d*2;
 +                }
 +            }
 +            else if (dev[dim] == -1)
 +            {
 +                flag |= DD_FLAG_BW(d);
 +                if (mc == -1)
 +                {
 +                    if (dd->nc[dim] > 2)
 +                    {
 +                        mc = d*2 + 1;
 +                    }
 +                    else
 +                    {
 +                        mc = d*2;
 +                    }
 +                }
 +            }
 +        }
 +        /* Temporarily store the flag in move */
 +        move[cg] = mc + flag;
 +    }
 +}
 +
 +static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
 +                               gmx_domdec_t *dd, ivec tric_dir,
 +                               t_state *state, rvec **f,
 +                               t_forcerec *fr,
 +                               gmx_bool bCompact,
 +                               t_nrnb *nrnb,
 +                               int *ncg_stay_home,
 +                               int *ncg_moved)
 +{
 +    int               *move;
 +    int                npbcdim;
 +    int                ncg[DIM*2], nat[DIM*2];
 +    int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
 +    int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
 +    int                sbuf[2], rbuf[2];
 +    int                home_pos_cg, home_pos_at, buf_pos;
 +    int                flag;
 +    gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
 +    gmx_bool           bScrew;
 +    ivec               dev;
 +    real               inv_ncg, pos_d;
 +    matrix             tcm;
 +    rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
 +    atom_id           *cgindex;
 +    cginfo_mb_t       *cginfo_mb;
 +    gmx_domdec_comm_t *comm;
 +    int               *moved;
 +    int                nthread, thread;
 +
 +    if (dd->bScrewPBC)
 +    {
 +        check_screw_box(state->box);
 +    }
 +
 +    comm  = dd->comm;
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        cg_cm = fr->cg_cm;
 +    }
 +
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i))
 +        {
 +            switch (i)
 +            {
 +                case estX: /* Always present */ break;
 +                case estV:   bV   = (state->flags & (1<<i)); break;
 +                case estSDX: bSDX = (state->flags & (1<<i)); break;
 +                case estCGP: bCGP = (state->flags & (1<<i)); break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No processing required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
 +            }
 +        }
 +    }
 +
 +    if (dd->ncg_tot > comm->nalloc_int)
 +    {
 +        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
 +        srenew(comm->buf_int, comm->nalloc_int);
 +    }
 +    move = comm->buf_int;
 +
 +    /* Clear the count */
 +    for (c = 0; c < dd->ndim*2; c++)
 +    {
 +        ncg[c] = 0;
 +        nat[c] = 0;
 +    }
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for (d = 0; (d < DIM); d++)
 +    {
 +        limitd[d] = dd->comm->cellsize_min[d];
 +        if (d >= npbcdim && dd->ci[d] == 0)
 +        {
 +            cell_x0[d] = -GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x0[d] = comm->cell_x0[d];
 +        }
 +        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
 +        {
 +            cell_x1[d] = GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x1[d] = comm->cell_x1[d];
 +        }
 +        if (d < npbcdim)
 +        {
 +            limit0[d] = comm->old_cell_x0[d] - limitd[d];
 +            limit1[d] = comm->old_cell_x1[d] + limitd[d];
 +        }
 +        else
 +        {
 +            /* We check after communication if a charge group moved
 +             * more than one cell. Set the pre-comm check limit to float_max.
 +             */
 +            limit0[d] = -GMX_FLOAT_MAX;
 +            limit1[d] =  GMX_FLOAT_MAX;
 +        }
 +    }
 +
 +    make_tric_corr_matrix(npbcdim, state->box, tcm);
 +
 +    cgindex = dd->cgindex;
 +
 +    nthread = gmx_omp_nthreads_get(emntDomdec);
 +
 +    /* Compute the center of geometry for all home charge groups
 +     * and put them in the box and determine where they should go.
 +     */
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
 +                     cell_x0, cell_x1, limitd, limit0, limit1,
 +                     cgindex,
 +                     ( thread   *dd->ncg_home)/nthread,
 +                     ((thread+1)*dd->ncg_home)/nthread,
 +                     fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
 +                     move);
 +    }
 +
 +    for (cg = 0; cg < dd->ncg_home; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            mc       = move[cg];
 +            flag     = mc & ~DD_FLAG_NRCG;
 +            mc       = mc & DD_FLAG_NRCG;
 +            move[cg] = mc;
 +
 +            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +            {
 +                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +            }
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
 +            /* We store the cg size in the lower 16 bits
 +             * and the place where the charge group should go
 +             * in the next 6 bits. This saves some communication volume.
 +             */
 +            nrcg = cgindex[cg+1] - cgindex[cg];
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
 +            ncg[mc] += 1;
 +            nat[mc] += nrcg;
 +        }
 +    }
 +
 +    inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +    inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
 +
 +    *ncg_moved = 0;
 +    for (i = 0; i < dd->ndim*2; i++)
 +    {
 +        *ncg_moved += ncg[i];
 +    }
 +
 +    nvec = 1;
 +    if (bV)
 +    {
 +        nvec++;
 +    }
 +    if (bSDX)
 +    {
 +        nvec++;
 +    }
 +    if (bCGP)
 +    {
 +        nvec++;
 +    }
 +
 +    /* Make sure the communication buffers are large enough */
 +    for (mc = 0; mc < dd->ndim*2; mc++)
 +    {
 +        nvr = ncg[mc] + nat[mc]*nvec;
 +        if (nvr > comm->cgcm_state_nalloc[mc])
 +        {
 +            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
 +            srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
 +        }
 +    }
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            /* Recalculating cg_cm might be cheaper than communicating,
 +             * but that could give rise to rounding issues.
 +             */
 +            home_pos_cg =
 +                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
 +                                        nvec, cg_cm, comm, bCompact);
 +            break;
 +        case ecutsVERLET:
 +            /* Without charge groups we send the moved atom coordinates
 +             * over twice. This is so the code below can be used without
 +             * many conditionals for both for with and without charge groups.
 +             */
 +            home_pos_cg =
 +                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
 +                                        nvec, state->x, comm, FALSE);
 +            if (bCompact)
 +            {
 +                home_pos_cg -= *ncg_moved;
 +            }
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            home_pos_cg = 0;
 +    }
 +
 +    vec         = 0;
 +    home_pos_at =
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->x, comm, bCompact);
 +    if (bV)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->v, comm, bCompact);
 +    }
 +    if (bSDX)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->sd_X, comm, bCompact);
 +    }
 +    if (bCGP)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->cg_p, comm, bCompact);
 +    }
 +
 +    if (bCompact)
 +    {
 +        compact_ind(dd->ncg_home, move,
 +                    dd->index_gl, dd->cgindex, dd->gatindex,
 +                    dd->ga2la, comm->bLocalCG,
 +                    fr->cginfo);
 +    }
 +    else
 +    {
 +        if (fr->cutoff_scheme == ecutsVERLET)
 +        {
 +            moved = get_moved(comm, dd->ncg_home);
 +
 +            for (k = 0; k < dd->ncg_home; k++)
 +            {
 +                moved[k] = 0;
 +            }
 +        }
 +        else
 +        {
 +            moved = fr->ns.grid->cell_index;
 +        }
 +
 +        clear_and_mark_ind(dd->ncg_home, move,
 +                           dd->index_gl, dd->cgindex, dd->gatindex,
 +                           dd->ga2la, comm->bLocalCG,
 +                           moved);
 +    }
 +
 +    cginfo_mb = fr->cginfo_mb;
 +
 +    *ncg_stay_home = home_pos_cg;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim      = dd->dim[d];
 +        ncg_recv = 0;
 +        nat_recv = 0;
 +        nvr      = 0;
 +        for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
 +        {
 +            cdd = d*2 + dir;
 +            /* Communicate the cg and atom counts */
 +            sbuf[0] = ncg[cdd];
 +            sbuf[1] = nat[cdd];
 +            if (debug)
 +            {
 +                fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
 +                        d, dir, sbuf[0], sbuf[1]);
 +            }
 +            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
 +
 +            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
 +            {
 +                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
 +                srenew(comm->buf_int, comm->nalloc_int);
 +            }
 +
 +            /* Communicate the charge group indices, sizes and flags */
 +            dd_sendrecv_int(dd, d, dir,
 +                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
 +                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
 +
 +            nvs = ncg[cdd] + nat[cdd]*nvec;
 +            i   = rbuf[0]  + rbuf[1] *nvec;
 +            vec_rvec_check_alloc(&comm->vbuf, nvr+i);
 +
 +            /* Communicate cgcm and state */
 +            dd_sendrecv_rvec(dd, d, dir,
 +                             comm->cgcm_state[cdd], nvs,
 +                             comm->vbuf.v+nvr, i);
 +            ncg_recv += rbuf[0];
 +            nat_recv += rbuf[1];
 +            nvr      += i;
 +        }
 +
 +        /* Process the received charge groups */
 +        buf_pos = 0;
 +        for (cg = 0; cg < ncg_recv; cg++)
 +        {
 +            flag = comm->buf_int[cg*DD_CGIBS+1];
 +
 +            if (dim >= npbcdim && dd->nc[dim] > 2)
 +            {
 +                /* No pbc in this dim and more than one domain boundary.
 +                 * We do a separate check if a charge group didn't move too far.
 +                 */
 +                if (((flag & DD_FLAG_FW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
 +                    ((flag & DD_FLAG_BW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
 +                {
 +                    cg_move_error(fplog, dd, step, cg, dim,
 +                                  (flag & DD_FLAG_FW(d)) ? 1 : 0,
 +                                  FALSE, 0,
 +                                  comm->vbuf.v[buf_pos],
 +                                  comm->vbuf.v[buf_pos],
 +                                  comm->vbuf.v[buf_pos][dim]);
 +                }
 +            }
 +
 +            mc = -1;
 +            if (d < dd->ndim-1)
 +            {
 +                /* Check which direction this cg should go */
 +                for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
 +                {
 +                    if (dd->bGridJump)
 +                    {
 +                        /* The cell boundaries for dimension d2 are not equal
 +                         * for each cell row of the lower dimension(s),
 +                         * therefore we might need to redetermine where
 +                         * this cg should go.
 +                         */
 +                        dim2 = dd->dim[d2];
 +                        /* If this cg crosses the box boundary in dimension d2
 +                         * we can use the communicated flag, so we do not
 +                         * have to worry about pbc.
 +                         */
 +                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
 +                               (flag & DD_FLAG_FW(d2))) ||
 +                              (dd->ci[dim2] == 0 &&
 +                               (flag & DD_FLAG_BW(d2)))))
 +                        {
 +                            /* Clear the two flags for this dimension */
 +                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
 +                            /* Determine the location of this cg
 +                             * in lattice coordinates
 +                             */
 +                            pos_d = comm->vbuf.v[buf_pos][dim2];
 +                            if (tric_dir[dim2])
 +                            {
 +                                for (d3 = dim2+1; d3 < DIM; d3++)
 +                                {
 +                                    pos_d +=
 +                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
 +                                }
 +                            }
 +                            /* Check of we are not at the box edge.
 +                             * pbc is only handled in the first step above,
 +                             * but this check could move over pbc while
 +                             * the first step did not due to different rounding.
 +                             */
 +                            if (pos_d >= cell_x1[dim2] &&
 +                                dd->ci[dim2] != dd->nc[dim2]-1)
 +                            {
 +                                flag |= DD_FLAG_FW(d2);
 +                            }
 +                            else if (pos_d < cell_x0[dim2] &&
 +                                     dd->ci[dim2] != 0)
 +                            {
 +                                flag |= DD_FLAG_BW(d2);
 +                            }
 +                            comm->buf_int[cg*DD_CGIBS+1] = flag;
 +                        }
 +                    }
 +                    /* Set to which neighboring cell this cg should go */
 +                    if (flag & DD_FLAG_FW(d2))
 +                    {
 +                        mc = d2*2;
 +                    }
 +                    else if (flag & DD_FLAG_BW(d2))
 +                    {
 +                        if (dd->nc[dd->dim[d2]] > 2)
 +                        {
 +                            mc = d2*2+1;
 +                        }
 +                        else
 +                        {
 +                            mc = d2*2;
 +                        }
 +                    }
 +                }
 +            }
 +
 +            nrcg = flag & DD_FLAG_NRCG;
 +            if (mc == -1)
 +            {
 +                if (home_pos_cg+1 > dd->cg_nalloc)
 +                {
 +                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
 +                    srenew(dd->index_gl, dd->cg_nalloc);
 +                    srenew(dd->cgindex, dd->cg_nalloc+1);
 +                }
 +                /* Set the global charge group index and size */
 +                dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
 +                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
 +                /* Copy the state from the buffer */
 +                dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
 +                if (fr->cutoff_scheme == ecutsGROUP)
 +                {
 +                    cg_cm = fr->cg_cm;
 +                    copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
 +                }
 +                buf_pos++;
 +
 +                /* Set the cginfo */
 +                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
 +                                                   dd->index_gl[home_pos_cg]);
 +                if (comm->bLocalCG)
 +                {
 +                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
 +                }
 +
 +                if (home_pos_at+nrcg > state->nalloc)
 +                {
 +                    dd_realloc_state(state, f, home_pos_at+nrcg);
 +                }
 +                for (i = 0; i < nrcg; i++)
 +                {
 +                    copy_rvec(comm->vbuf.v[buf_pos++],
 +                              state->x[home_pos_at+i]);
 +                }
 +                if (bV)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->v[home_pos_at+i]);
 +                    }
 +                }
 +                if (bSDX)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->sd_X[home_pos_at+i]);
 +                    }
 +                }
 +                if (bCGP)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->cg_p[home_pos_at+i]);
 +                    }
 +                }
 +                home_pos_cg += 1;
 +                home_pos_at += nrcg;
 +            }
 +            else
 +            {
 +                /* Reallocate the buffers if necessary  */
 +                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +                {
 +                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                    srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +                }
 +                nvr = ncg[mc] + nat[mc]*nvec;
 +                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
 +                {
 +                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
 +                    srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
 +                }
 +                /* Copy from the receive to the send buffers */
 +                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
 +                       comm->buf_int + cg*DD_CGIBS,
 +                       DD_CGIBS*sizeof(int));
 +                memcpy(comm->cgcm_state[mc][nvr],
 +                       comm->vbuf.v[buf_pos],
 +                       (1+nrcg*nvec)*sizeof(rvec));
 +                buf_pos += 1 + nrcg*nvec;
 +                ncg[mc] += 1;
 +                nat[mc] += nrcg;
 +            }
 +        }
 +    }
 +
 +    /* With sorting (!bCompact) the indices are now only partially up to date
 +     * and ncg_home and nat_home are not the real count, since there are
 +     * "holes" in the arrays for the charge groups that moved to neighbors.
 +     */
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        moved = get_moved(comm, home_pos_cg);
 +
 +        for (i = dd->ncg_home; i < home_pos_cg; i++)
 +        {
 +            moved[i] = 0;
 +        }
 +    }
 +    dd->ncg_home = home_pos_cg;
 +    dd->nat_home = home_pos_at;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Finished repartitioning: cgs moved out %d, new home %d\n",
 +                *ncg_moved, dd->ncg_home-*ncg_moved);
 +
 +    }
 +}
 +
 +void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
 +{
 +    dd->comm->cycl[ddCycl] += cycles;
 +    dd->comm->cycl_n[ddCycl]++;
 +    if (cycles > dd->comm->cycl_max[ddCycl])
 +    {
 +        dd->comm->cycl_max[ddCycl] = cycles;
 +    }
 +}
 +
 +static double force_flop_count(t_nrnb *nrnb)
 +{
 +    int         i;
 +    double      sum;
 +    const char *name;
 +
 +    sum = 0;
 +    for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
 +    {
 +        /* To get closer to the real timings, we half the count
 +         * for the normal loops and again half it for water loops.
 +         */
 +        name = nrnb_str(i);
 +        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*0.25*cost_nrnb(i);
 +        }
 +        else
 +        {
 +            sum += nrnb->n[i]*0.50*cost_nrnb(i);
 +        }
 +    }
 +    for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
 +    {
 +        name = nrnb_str(i);
 +        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*cost_nrnb(i);
 +        }
 +    }
 +    for (i = eNR_BONDS; i <= eNR_WALLS; i++)
 +    {
 +        sum += nrnb->n[i]*cost_nrnb(i);
 +    }
 +
 +    return sum;
 +}
 +
 +void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop -= force_flop_count(nrnb);
 +    }
 +}
 +void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop += force_flop_count(nrnb);
 +        dd->comm->flop_n++;
 +    }
 +}
 +
 +static void clear_dd_cycle_counts(gmx_domdec_t *dd)
 +{
 +    int i;
 +
 +    for (i = 0; i < ddCyclNr; i++)
 +    {
 +        dd->comm->cycl[i]     = 0;
 +        dd->comm->cycl_n[i]   = 0;
 +        dd->comm->cycl_max[i] = 0;
 +    }
 +    dd->comm->flop   = 0;
 +    dd->comm->flop_n = 0;
 +}
 +
 +static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_load_t *load;
 +    gmx_domdec_root_t *root = NULL;
 +    int                d, dim, cid, i, pos;
 +    float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
 +    gmx_bool           bSepPME;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "get_load_distribution start\n");
 +    }
 +
 +    wallcycle_start(wcycle, ewcDDCOMMLOAD);
 +
 +    comm = dd->comm;
 +
 +    bSepPME = (dd->pme_nodeid >= 0);
 +
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        dim = dd->dim[d];
 +        /* Check if we participate in the communication in this dimension */
 +        if (d == dd->ndim-1 ||
 +            (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
 +        {
 +            load = &comm->load[d];
 +            if (dd->bGridJump)
 +            {
 +                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
 +            }
 +            pos = 0;
 +            if (d == dd->ndim-1)
 +            {
 +                sbuf[pos++] = dd_force_load(comm);
 +                sbuf[pos++] = sbuf[0];
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = sbuf[0];
 +                    sbuf[pos++] = cell_frac;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 +                    sbuf[pos++] = comm->cycl[ddCyclPME];
 +                }
 +            }
 +            else
 +            {
 +                sbuf[pos++] = comm->load[d+1].sum;
 +                sbuf[pos++] = comm->load[d+1].max;
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].sum_m;
 +                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
 +                    sbuf[pos++] = comm->load[d+1].flags;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].mdf;
 +                    sbuf[pos++] = comm->load[d+1].pme;
 +                }
 +            }
 +            load->nload = pos;
 +            /* Communicate a row in DD direction d.
 +             * The communicators are setup such that the root always has rank 0.
 +             */
 +#ifdef GMX_MPI
 +            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
 +                       load->load, load->nload*sizeof(float), MPI_BYTE,
 +                       0, comm->mpi_comm_load[d]);
 +#endif
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* We are the root, process this row */
 +                if (comm->bDynLoadBal)
 +                {
 +                    root = comm->root[d];
 +                }
 +                load->sum      = 0;
 +                load->max      = 0;
 +                load->sum_m    = 0;
 +                load->cvol_min = 1;
 +                load->flags    = 0;
 +                load->mdf      = 0;
 +                load->pme      = 0;
 +                pos            = 0;
 +                for (i = 0; i < dd->nc[dim]; i++)
 +                {
 +                    load->sum += load->load[pos++];
 +                    load->max  = max(load->max, load->load[pos]);
 +                    pos++;
 +                    if (dd->bGridJump)
 +                    {
 +                        if (root->bLimited)
 +                        {
 +                            /* This direction could not be load balanced properly,
 +                             * therefore we need to use the maximum iso the average load.
 +                             */
 +                            load->sum_m = max(load->sum_m, load->load[pos]);
 +                        }
 +                        else
 +                        {
 +                            load->sum_m += load->load[pos];
 +                        }
 +                        pos++;
 +                        load->cvol_min = min(load->cvol_min, load->load[pos]);
 +                        pos++;
 +                        if (d < dd->ndim-1)
 +                        {
 +                            load->flags = (int)(load->load[pos++] + 0.5);
 +                        }
 +                        if (d > 0)
 +                        {
 +                            root->cell_f_max0[i] = load->load[pos++];
 +                            root->cell_f_min1[i] = load->load[pos++];
 +                        }
 +                    }
 +                    if (bSepPME)
 +                    {
 +                        load->mdf = max(load->mdf, load->load[pos]);
 +                        pos++;
 +                        load->pme = max(load->pme, load->load[pos]);
 +                        pos++;
 +                    }
 +                }
 +                if (comm->bDynLoadBal && root->bLimited)
 +                {
 +                    load->sum_m *= dd->nc[dim];
 +                    load->flags |= (1<<d);
 +                }
 +            }
 +        }
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        comm->nload      += dd_load_count(comm);
 +        comm->load_step  += comm->cycl[ddCyclStep];
 +        comm->load_sum   += comm->load[0].sum;
 +        comm->load_max   += comm->load[0].max;
 +        if (comm->bDynLoadBal)
 +        {
 +            for (d = 0; d < dd->ndim; d++)
 +            {
 +                if (comm->load[0].flags & (1<<d))
 +                {
 +                    comm->load_lim[d]++;
 +                }
 +            }
 +        }
 +        if (bSepPME)
 +        {
 +            comm->load_mdf += comm->load[0].mdf;
 +            comm->load_pme += comm->load[0].pme;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle, ewcDDCOMMLOAD);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "get_load_distribution finished\n");
 +    }
 +}
 +
 +static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
 +{
 +    /* Return the relative performance loss on the total run time
 +     * due to the force calculation load imbalance.
 +     */
 +    if (dd->comm->nload > 0)
 +    {
 +        return
 +            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
 +            (dd->comm->load_step*dd->nnodes);
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    char               buf[STRLEN];
 +    int                npp, npme, nnodes, d, limp;
 +    float              imbal, pme_f_ratio, lossf, lossp = 0;
 +    gmx_bool           bLim;
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = dd->comm;
 +    if (DDMASTER(dd) && comm->nload > 0)
 +    {
 +        npp    = dd->nnodes;
 +        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
 +        nnodes = npp + npme;
 +        imbal  = comm->load_max*npp/comm->load_sum - 1;
 +        lossf  = dd_force_imb_perf_loss(dd);
 +        sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
 +        fprintf(fplog, "%s", buf);
 +        fprintf(stderr, "\n");
 +        fprintf(stderr, "%s", buf);
 +        sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
 +        fprintf(fplog, "%s", buf);
 +        fprintf(stderr, "%s", buf);
 +        bLim = FALSE;
 +        if (comm->bDynLoadBal)
 +        {
 +            sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
 +            for (d = 0; d < dd->ndim; d++)
 +            {
 +                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
 +                sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
 +                if (limp >= 50)
 +                {
 +                    bLim = TRUE;
 +                }
 +            }
 +            sprintf(buf+strlen(buf), "\n");
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +        }
 +        if (npme > 0)
 +        {
 +            pme_f_ratio = comm->load_pme/comm->load_mdf;
 +            lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
 +            if (lossp <= 0)
 +            {
 +                lossp *= (float)npme/(float)nnodes;
 +            }
 +            else
 +            {
 +                lossp *= (float)npp/(float)nnodes;
 +            }
 +            sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +            sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(stderr, "\n");
 +
 +        if (lossf >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
 +                    "      in the domain decomposition.\n", lossf*100);
 +            if (!comm->bDynLoadBal)
 +            {
 +                sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
 +            }
 +            else if (bLim)
 +            {
 +                sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
 +            }
 +            fprintf(fplog, "%s\n", buf);
 +            fprintf(stderr, "%s\n", buf);
 +        }
 +        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
 +                    "      had %s work to do than the PP nodes.\n"
 +                    "      You might want to %s the number of PME nodes\n"
 +                    "      or %s the cut-off and the grid spacing.\n",
 +                    fabs(lossp*100),
 +                    (lossp < 0) ? "less"     : "more",
 +                    (lossp < 0) ? "decrease" : "increase",
 +                    (lossp < 0) ? "decrease" : "increase");
 +            fprintf(fplog, "%s\n", buf);
 +            fprintf(stderr, "%s\n", buf);
 +        }
 +    }
 +}
 +
 +static float dd_vol_min(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].cvol_min*dd->nnodes;
 +}
 +
 +static gmx_bool dd_load_flags(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].flags;
 +}
 +
 +static float dd_f_imbal(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
 +}
 +
 +float dd_pme_f_ratio(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->cycl_n[ddCyclPME] > 0)
 +    {
 +        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
 +    }
 +    else
 +    {
 +        return -1.0;
 +    }
 +}
 +
 +static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
 +{
 +    int  flags, d;
 +    char buf[22];
 +
 +    flags = dd_load_flags(dd);
 +    if (flags)
 +    {
 +        fprintf(fplog,
 +                "DD  load balancing is limited by minimum cell size in dimension");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            if (flags & (1<<d))
 +            {
 +                fprintf(fplog, " %c", dim2char(dd->dim[d]));
 +            }
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +    fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(fplog, "  vol min/aver %5.3f%c",
 +                dd_vol_min(dd), flags ? '!' : ' ');
 +    }
 +    fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
 +    }
 +    fprintf(fplog, "\n\n");
 +}
 +
 +static void dd_print_load_verbose(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(stderr, "vol %4.2f%c ",
 +                dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
 +    }
 +    fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
 +{
 +    MPI_Comm           c_row;
 +    int                dim, i, rank;
 +    ivec               loc_c;
 +    gmx_domdec_root_t *root;
 +    gmx_bool           bPartOfGroup = FALSE;
 +
 +    dim = dd->dim[dim_ind];
 +    copy_ivec(loc, loc_c);
 +    for (i = 0; i < dd->nc[dim]; i++)
 +    {
 +        loc_c[dim] = i;
 +        rank       = dd_index(dd->nc, loc_c);
 +        if (rank == dd->rank)
 +        {
 +            /* This process is part of the group */
 +            bPartOfGroup = TRUE;
 +        }
 +    }
 +    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
 +                   &c_row);
 +    if (bPartOfGroup)
 +    {
 +        dd->comm->mpi_comm_load[dim_ind] = c_row;
 +        if (dd->comm->eDLB != edlbNO)
 +        {
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* This is the root process of this row */
 +                snew(dd->comm->root[dim_ind], 1);
 +                root = dd->comm->root[dim_ind];
 +                snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
 +                snew(root->old_cell_f, dd->nc[dim]+1);
 +                snew(root->bCellMin, dd->nc[dim]);
 +                if (dim_ind > 0)
 +                {
 +                    snew(root->cell_f_max0, dd->nc[dim]);
 +                    snew(root->cell_f_min1, dd->nc[dim]);
 +                    snew(root->bound_min, dd->nc[dim]);
 +                    snew(root->bound_max, dd->nc[dim]);
 +                }
 +                snew(root->buf_ncd, dd->nc[dim]);
 +            }
 +            else
 +            {
 +                /* This is not a root process, we only need to receive cell_f */
 +                snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
 +            }
 +        }
 +        if (dd->ci[dim] == dd->master_ci[dim])
 +        {
 +            snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
 +        }
 +    }
 +}
 +#endif
 +
 +void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
 +                                   const gmx_hw_info_t gmx_unused *hwinfo,
 +                                   const gmx_hw_opt_t  gmx_unused *hw_opt)
 +{
 +#ifdef GMX_MPI
 +    int           physicalnode_id_hash;
 +    int           gpu_id;
 +    gmx_domdec_t *dd;
 +    MPI_Comm      mpi_comm_pp_physicalnode;
 +
 +    if (!(cr->duty & DUTY_PP) ||
 +        hw_opt->gpu_opt.ncuda_dev_use == 0)
 +    {
 +        /* Only PP nodes (currently) use GPUs.
 +         * If we don't have GPUs, there are no resources to share.
 +         */
 +        return;
 +    }
 +
 +    physicalnode_id_hash = gmx_physicalnode_id_hash();
 +
 +    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
 +
 +    dd = cr->dd;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
 +        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
 +                dd->rank, physicalnode_id_hash, gpu_id);
 +    }
 +    /* Split the PP communicator over the physical nodes */
 +    /* TODO: See if we should store this (before), as it's also used for
 +     * for the nodecomm summution.
 +     */
 +    MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
 +                   &mpi_comm_pp_physicalnode);
 +    MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
 +                   &dd->comm->mpi_comm_gpu_shared);
 +    MPI_Comm_free(&mpi_comm_pp_physicalnode);
 +    MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
 +    }
 +
 +    /* Note that some ranks could share a GPU, while others don't */
 +
 +    if (dd->comm->nrank_gpu_shared == 1)
 +    {
 +        MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
 +    }
 +#endif
 +}
 +
 +static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
 +{
 +#ifdef GMX_MPI
 +    int  dim0, dim1, i, j;
 +    ivec loc;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Making load communicators\n");
 +    }
 +
 +    snew(dd->comm->load, dd->ndim);
 +    snew(dd->comm->mpi_comm_load, dd->ndim);
 +
 +    clear_ivec(loc);
 +    make_load_communicator(dd, 0, loc);
 +    if (dd->ndim > 1)
 +    {
 +        dim0 = dd->dim[0];
 +        for (i = 0; i < dd->nc[dim0]; i++)
 +        {
 +            loc[dim0] = i;
 +            make_load_communicator(dd, 1, loc);
 +        }
 +    }
 +    if (dd->ndim > 2)
 +    {
 +        dim0 = dd->dim[0];
 +        for (i = 0; i < dd->nc[dim0]; i++)
 +        {
 +            loc[dim0] = i;
 +            dim1      = dd->dim[1];
 +            for (j = 0; j < dd->nc[dim1]; j++)
 +            {
 +                loc[dim1] = j;
 +                make_load_communicator(dd, 2, loc);
 +            }
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Finished making load communicators\n");
 +    }
 +#endif
 +}
 +
 +void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    gmx_bool                bZYX;
 +    int                     d, dim, i, j, m;
 +    ivec                    tmp, s;
 +    int                     nzone, nzonep;
 +    ivec                    dd_zp[DD_MAXIZONE];
 +    gmx_domdec_zones_t     *zones;
 +    gmx_domdec_ns_ranges_t *izone;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        copy_ivec(dd->ci, tmp);
 +        tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
 +        dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
 +        copy_ivec(dd->ci, tmp);
 +        tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
 +        dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
 +        if (debug)
 +        {
 +            fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
 +                    dd->rank, dim,
 +                    dd->neighbor[d][0],
 +                    dd->neighbor[d][1]);
 +        }
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
 +                dd->ndim,
 +                dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
 +                dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +    switch (dd->ndim)
 +    {
 +        case 3:
 +            nzone  = dd_z3n;
 +            nzonep = dd_zp3n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp3[i], dd_zp[i]);
 +            }
 +            break;
 +        case 2:
 +            nzone  = dd_z2n;
 +            nzonep = dd_zp2n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp2[i], dd_zp[i]);
 +            }
 +            break;
 +        case 1:
 +            nzone  = dd_z1n;
 +            nzonep = dd_zp1n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp1[i], dd_zp[i]);
 +            }
 +            break;
 +        default:
 +            gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
 +            nzone  = 0;
 +            nzonep = 0;
 +    }
 +
 +    zones = &dd->comm->zones;
 +
 +    for (i = 0; i < nzone; i++)
 +    {
 +        m = 0;
 +        clear_ivec(zones->shift[i]);
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
 +        }
 +    }
 +
 +    zones->n = nzone;
 +    for (i = 0; i < nzone; i++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            s[d] = dd->ci[d] - zones->shift[i][d];
 +            if (s[d] < 0)
 +            {
 +                s[d] += dd->nc[d];
 +            }
 +            else if (s[d] >= dd->nc[d])
 +            {
 +                s[d] -= dd->nc[d];
 +            }
 +        }
 +    }
 +    zones->nizone = nzonep;
 +    for (i = 0; i < zones->nizone; i++)
 +    {
 +        if (dd_zp[i][0] != i)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
 +        }
 +        izone     = &zones->izone[i];
 +        izone->j0 = dd_zp[i][1];
 +        izone->j1 = dd_zp[i][2];
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            if (dd->nc[dim] == 1)
 +            {
 +                /* All shifts should be allowed */
 +                izone->shift0[dim] = -1;
 +                izone->shift1[dim] = 1;
 +            }
 +            else
 +            {
 +                /*
 +                   izone->shift0[d] = 0;
 +                   izone->shift1[d] = 0;
 +                   for(j=izone->j0; j<izone->j1; j++) {
 +                   if (dd->shift[j][d] > dd->shift[i][d])
 +                   izone->shift0[d] = -1;
 +                   if (dd->shift[j][d] < dd->shift[i][d])
 +                   izone->shift1[d] = 1;
 +                   }
 +                 */
 +
 +                int shift_diff;
 +
 +                /* Assume the shift are not more than 1 cell */
 +                izone->shift0[dim] = 1;
 +                izone->shift1[dim] = -1;
 +                for (j = izone->j0; j < izone->j1; j++)
 +                {
 +                    shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
 +                    if (shift_diff < izone->shift0[dim])
 +                    {
 +                        izone->shift0[dim] = shift_diff;
 +                    }
 +                    if (shift_diff > izone->shift1[dim])
 +                    {
 +                        izone->shift1[dim] = shift_diff;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        snew(dd->comm->root, dd->ndim);
 +    }
 +
 +    if (dd->comm->bRecordLoad)
 +    {
 +        make_load_communicators(dd);
 +    }
 +}
 +
 +static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                i, rank, *buf;
 +    ivec               periods;
 +#ifdef GMX_MPI
 +    MPI_Comm           comm_cart;
 +#endif
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP)
 +    {
 +        /* Set up cartesian communication for the particle-particle part */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
 +                    dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
 +        }
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
 +                        &comm_cart);
 +        /* We overwrite the old communicator with the new cartesian one */
 +        cr->mpi_comm_mygroup = comm_cart;
 +    }
 +
 +    dd->mpi_comm_all = cr->mpi_comm_mygroup;
 +    MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
 +
 +    if (comm->bCartesianPP_PME)
 +    {
 +        /* Since we want to use the original cartesian setup for sim,
 +         * and not the one after split, we need to make an index.
 +         */
 +        snew(comm->ddindex2ddnodeid, dd->nnodes);
 +        comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
 +        gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
 +        /* Get the rank of the DD master,
 +         * above we made sure that the master node is a PP node.
 +         */
 +        if (MASTER(cr))
 +        {
 +            rank = dd->rank;
 +        }
 +        else
 +        {
 +            rank = 0;
 +        }
 +        MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (cr->npmenodes == 0)
 +        {
 +            /* The PP communicator is also
 +             * the communicator for this simulation
 +             */
 +            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
 +        }
 +        cr->nodeid = dd->rank;
 +
 +        MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
 +
 +        /* We need to make an index to go from the coordinates
 +         * to the nodeid of this simulation.
 +         */
 +        snew(comm->ddindex2simnodeid, dd->nnodes);
 +        snew(buf, dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
 +        }
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        sfree(buf);
 +
 +        /* Determine the master coordinates and rank.
 +         * The DD master should be the same node as the master of this sim.
 +         */
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            if (comm->ddindex2simnodeid[i] == 0)
 +            {
 +                ddindex2xyz(dd->nc, i, dd->master_ci);
 +                MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "The master rank is %d\n", dd->masterrank);
 +        }
 +    }
 +    else
 +    {
 +        /* No Cartesian communicators */
 +        /* We use the rank in dd->comm->all as DD index */
 +        ddindex2xyz(dd->nc, dd->rank, dd->ci);
 +        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
 +        dd->masterrank = 0;
 +        clear_ivec(dd->master_ci);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +}
 +
 +static void receive_ddindex2simnodeid(t_commrec *cr)
 +{
 +    gmx_domdec_t      *dd;
 +
 +    gmx_domdec_comm_t *comm;
 +    int               *buf;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
 +    {
 +        snew(comm->ddindex2simnodeid, dd->nnodes);
 +        snew(buf, dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
 +        }
 +#ifdef GMX_MPI
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +#endif
 +        sfree(buf);
 +    }
 +#endif
 +}
 +
 +static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
 +                                                     int ncg, int natoms)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  i;
 +
 +    snew(ma, 1);
 +
 +    snew(ma->ncg, dd->nnodes);
 +    snew(ma->index, dd->nnodes+1);
 +    snew(ma->cg, ncg);
 +    snew(ma->nat, dd->nnodes);
 +    snew(ma->ibuf, dd->nnodes*2);
 +    snew(ma->cell_x, DIM);
 +    for (i = 0; i < DIM; i++)
 +    {
 +        snew(ma->cell_x[i], dd->nc[i]+1);
 +    }
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        ma->vbuf = NULL;
 +    }
 +    else
 +    {
 +        snew(ma->vbuf, natoms);
 +    }
 +
 +    return ma;
 +}
 +
 +static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
 +                               int gmx_unused reorder)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                i, rank;
 +    gmx_bool           bDiv[DIM];
 +    ivec               periods;
 +#ifdef GMX_MPI
 +    MPI_Comm           comm_cart;
 +#endif
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    if (comm->bCartesianPP)
 +    {
 +        for (i = 1; i < DIM; i++)
 +        {
 +            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
 +        }
 +        if (bDiv[YY] || bDiv[ZZ])
 +        {
 +            comm->bCartesianPP_PME = TRUE;
 +            /* If we have 2D PME decomposition, which is always in x+y,
 +             * we stack the PME only nodes in z.
 +             * Otherwise we choose the direction that provides the thinnest slab
 +             * of PME only nodes as this will have the least effect
 +             * on the PP communication.
 +             * But for the PME communication the opposite might be better.
 +             */
 +            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
 +                             !bDiv[YY] ||
 +                             dd->nc[YY] > dd->nc[ZZ]))
 +            {
 +                comm->cartpmedim = ZZ;
 +            }
 +            else
 +            {
 +                comm->cartpmedim = YY;
 +            }
 +            comm->ntot[comm->cartpmedim]
 +                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
 +        }
 +        else if (fplog)
 +        {
 +            fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
 +            fprintf(fplog,
 +                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
 +        }
 +    }
 +
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP_PME)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
 +        }
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
 +                        &comm_cart);
 +
 +        MPI_Comm_rank(comm_cart, &rank);
 +        if (MASTERNODE(cr) && rank != 0)
 +        {
 +            gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
 +        }
 +
 +        /* With this assigment we loose the link to the original communicator
 +         * which will usually be MPI_COMM_WORLD, unless have multisim.
 +         */
 +        cr->mpi_comm_mysim = comm_cart;
 +        cr->sim_nodeid     = rank;
 +
 +        MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
 +
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
 +                    cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +        }
 +
 +        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +        if (cr->npmenodes == 0 ||
 +            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +
 +        /* Split the sim communicator into PP and PME only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       dd_index(comm->ntot, dd->ci),
 +                       &cr->mpi_comm_mygroup);
 +    }
 +    else
 +    {
 +        switch (dd_node_order)
 +        {
 +            case ddnoPP_PME:
 +                if (fplog)
 +                {
 +                    fprintf(fplog, "Order of the nodes: PP first, PME last\n");
 +                }
 +                break;
 +            case ddnoINTERLEAVE:
 +                /* Interleave the PP-only and PME-only nodes,
 +                 * as on clusters with dual-core machines this will double
 +                 * the communication bandwidth of the PME processes
 +                 * and thus speed up the PP <-> PME and inter PME communication.
 +                 */
 +                if (fplog)
 +                {
 +                    fprintf(fplog, "Interleaving PP and PME nodes\n");
 +                }
 +                comm->pmenodes = dd_pmenodes(cr);
 +                break;
 +            case ddnoCARTESIAN:
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
 +        }
 +
 +        if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +        else
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +
 +        /* Split the sim communicator into PP and PME only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       cr->nodeid,
 +                       &cr->mpi_comm_mygroup);
 +        MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "This is a %s only node\n\n",
 +                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
 +    }
 +}
 +
 +void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                CartReorder;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    copy_ivec(dd->nc, comm->ntot);
 +
 +    comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
 +    comm->bCartesianPP_PME = FALSE;
 +
 +    /* Reorder the nodes by default. This might change the MPI ranks.
 +     * Real reordering is only supported on very few architectures,
 +     * Blue Gene is one of them.
 +     */
 +    CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
 +
 +    if (cr->npmenodes > 0)
 +    {
 +        /* Split the communicator into a PP and PME part */
 +        split_communicator(fplog, cr, dd_node_order, CartReorder);
 +        if (comm->bCartesianPP_PME)
 +        {
 +            /* We (possibly) reordered the nodes in split_communicator,
 +             * so it is no longer required in make_pp_communicator.
 +             */
 +            CartReorder = FALSE;
 +        }
 +    }
 +    else
 +    {
 +        /* All nodes do PP and PME */
 +#ifdef GMX_MPI
 +        /* We do not require separate communicators */
 +        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +#endif
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* Copy or make a new PP communicator */
 +        make_pp_communicator(fplog, cr, CartReorder);
 +    }
 +    else
 +    {
 +        receive_ddindex2simnodeid(cr);
 +    }
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Set up the commnuication to our PME node */
 +        dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +        dd->pme_receive_vir_ener = receive_vir_ener(cr);
 +        if (debug)
 +        {
 +            fprintf(debug, "My pme_nodeid %d receive ener %d\n",
 +                    dd->pme_nodeid, dd->pme_receive_vir_ener);
 +        }
 +    }
 +    else
 +    {
 +        dd->pme_nodeid = -1;
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        dd->ma = init_gmx_domdec_master_t(dd,
 +                                          comm->cgs_gl.nr,
 +                                          comm->cgs_gl.index[comm->cgs_gl.nr]);
 +    }
 +}
 +
 +static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
 +{
 +    real  *slb_frac, tot;
 +    int    i, n;
 +    double dbl;
 +
 +    slb_frac = NULL;
 +    if (nc > 1 && size_string != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Using static load balancing for the %s direction\n",
 +                    dir);
 +        }
 +        snew(slb_frac, nc);
 +        tot = 0;
 +        for (i = 0; i < nc; i++)
 +        {
 +            dbl = 0;
 +            sscanf(size_string, "%lf%n", &dbl, &n);
 +            if (dbl == 0)
 +            {
 +                gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
 +            }
 +            slb_frac[i]  = dbl;
 +            size_string += n;
 +            tot         += slb_frac[i];
 +        }
 +        /* Normalize */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Relative cell sizes:");
 +        }
 +        for (i = 0; i < nc; i++)
 +        {
 +            slb_frac[i] /= tot;
 +            if (fplog)
 +            {
 +                fprintf(fplog, " %5.3f", slb_frac[i]);
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "\n");
 +        }
 +    }
 +
 +    return slb_frac;
 +}
 +
 +static int multi_body_bondeds_count(gmx_mtop_t *mtop)
 +{
 +    int                  n, nmol, ftype;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist             *il;
 +
 +    n     = 0;
 +    iloop = gmx_mtop_ilistloop_init(mtop);
 +    while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
 +    {
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            if ((interaction_function[ftype].flags & IF_BOND) &&
 +                NRAL(ftype) >  2)
 +            {
 +                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
 +            }
 +        }
 +    }
 +
 +    return n;
 +}
 +
 +static int dd_getenv(FILE *fplog, const char *env_var, int def)
 +{
 +    char *val;
 +    int   nst;
 +
 +    nst = def;
 +    val = getenv(env_var);
 +    if (val)
 +    {
 +        if (sscanf(val, "%d", &nst) <= 0)
 +        {
 +            nst = 1;
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
 +                    env_var, val, nst);
 +        }
 +    }
 +
 +    return nst;
 +}
 +
 +static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
 +{
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\n%s\n", warn_string);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog, "\n%s\n", warn_string);
 +    }
 +}
 +
 +static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
 +                                  t_inputrec *ir, FILE *fplog)
 +{
 +    if (ir->ePBC == epbcSCREW &&
 +        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
 +    {
 +        gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
 +    }
 +
 +    if (ir->ns_type == ensSIMPLE)
 +    {
 +        gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
 +    }
 +
 +    if (ir->nstlist == 0)
 +    {
 +        gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
 +    }
 +
 +    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
 +    {
 +        dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
 +    }
 +}
 +
 +static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int  di, d;
 +    real r;
 +
 +    r = ddbox->box_size[XX];
 +    for (di = 0; di < dd->ndim; di++)
 +    {
 +        d = dd->dim[di];
 +        /* Check using the initial average cell size */
 +        r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +    }
 +
 +    return r;
 +}
 +
 +static int check_dlb_support(FILE *fplog, t_commrec *cr,
 +                             const char *dlb_opt, gmx_bool bRecordLoad,
 +                             unsigned long Flags, t_inputrec *ir)
 +{
 +    gmx_domdec_t *dd;
 +    int           eDLB = -1;
 +    char          buf[STRLEN];
 +
 +    switch (dlb_opt[0])
 +    {
 +        case 'a': eDLB = edlbAUTO; break;
 +        case 'n': eDLB = edlbNO;   break;
 +        case 'y': eDLB = edlbYES;  break;
 +        default: gmx_incons("Unknown dlb_opt");
 +    }
 +
 +    if (Flags & MD_RERUN)
 +    {
 +        return edlbNO;
 +    }
 +
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        if (eDLB == edlbYES)
 +        {
 +            sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
 +            dd_warning(cr, fplog, buf);
 +        }
 +
 +        return edlbNO;
 +    }
 +
 +    if (!bRecordLoad)
 +    {
 +        dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
 +
 +        return edlbNO;
 +    }
 +
 +    if (Flags & MD_REPRODUCIBLE)
 +    {
 +        switch (eDLB)
 +        {
 +            case edlbNO:
 +                break;
 +            case edlbAUTO:
 +                dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
 +                eDLB = edlbNO;
 +                break;
 +            case edlbYES:
 +                dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
 +                break;
 +        }
 +    }
 +
 +    return eDLB;
 +}
 +
 +static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    int dim;
 +
 +    dd->ndim = 0;
 +    if (getenv("GMX_DD_ORDER_ZYX") != NULL)
 +    {
 +        /* Decomposition order z,y,x */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Using domain decomposition order z, y, x\n");
 +        }
 +        for (dim = DIM-1; dim >= 0; dim--)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* Decomposition order x,y,z */
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +}
 +
 +static gmx_domdec_comm_t *init_dd_comm()
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                i;
 +
 +    snew(comm, 1);
 +    snew(comm->cggl_flag, DIM*2);
 +    snew(comm->cgcm_state, DIM*2);
 +    for (i = 0; i < DIM*2; i++)
 +    {
 +        comm->cggl_flag_nalloc[i]  = 0;
 +        comm->cgcm_state_nalloc[i] = 0;
 +    }
 +
 +    comm->nalloc_int = 0;
 +    comm->buf_int    = NULL;
 +
 +    vec_rvec_init(&comm->vbuf);
 +
 +    comm->n_load_have    = 0;
 +    comm->n_load_collect = 0;
 +
 +    for (i = 0; i < ddnatNR-ddnatZONE; i++)
 +    {
 +        comm->sum_nat[i] = 0;
 +    }
 +    comm->ndecomp   = 0;
 +    comm->nload     = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf  = 0;
 +    comm->load_pme  = 0;
 +
 +    return comm;
 +}
 +
 +gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
 +                                        unsigned long Flags,
 +                                        ivec nc,
 +                                        real comm_distance_min, real rconstr,
 +                                        const char *dlb_opt, real dlb_scale,
 +                                        const char *sizex, const char *sizey, const char *sizez,
 +                                        gmx_mtop_t *mtop, t_inputrec *ir,
 +                                        matrix box, rvec *x,
 +                                        gmx_ddbox_t *ddbox,
 +                                        int *npme_x, int *npme_y)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                recload;
 +    int                d, i, j;
 +    real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
 +    gmx_bool           bC;
 +    char               buf[STRLEN];
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
 +    }
 +
 +    snew(dd, 1);
 +
 +    dd->comm = init_dd_comm();
 +    comm     = dd->comm;
 +    snew(comm->cggl_flag, DIM*2);
 +    snew(comm->cgcm_state, DIM*2);
 +
 +    dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
 +    dd->bScrewPBC = (ir->ePBC == epbcSCREW);
 +
 +    dd->bSendRecv2      = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
 +    comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
 +    comm->eFlop         = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
 +    recload             = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
 +    comm->nstSortCG     = dd_getenv(fplog, "GMX_DD_NST_SORT_CHARGE_GROUPS", 1);
 +    comm->nstDDDump     = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
 +    comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
 +    comm->DD_debug      = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
 +
 +    dd->pme_recv_f_alloc = 0;
 +    dd->pme_recv_f_buf   = NULL;
 +
 +    if (dd->bSendRecv2 && fplog)
 +    {
 +        fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
 +    }
 +    if (comm->eFlop)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will load balance based on FLOP count\n");
 +        }
 +        if (comm->eFlop > 1)
 +        {
 +            srand(1+cr->nodeid);
 +        }
 +        comm->bRecordLoad = TRUE;
 +    }
 +    else
 +    {
 +        comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
 +
 +    }
 +
 +    /* Initialize to GPU share count to 0, might change later */
 +    comm->nrank_gpu_shared = 0;
 +
 +    comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
 +
 +    comm->bDynLoadBal = (comm->eDLB == edlbYES);
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
 +    }
 +    dd->bGridJump              = comm->bDynLoadBal;
 +    comm->bPMELoadBalDLBLimits = FALSE;
 +
 +    if (comm->nstSortCG)
 +    {
 +        if (fplog)
 +        {
 +            if (comm->nstSortCG == 1)
 +            {
 +                fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
 +            }
 +            else
 +            {
 +                fprintf(fplog, "Will sort the charge groups every %d steps\n",
 +                        comm->nstSortCG);
 +            }
 +        }
 +        snew(comm->sort, 1);
 +    }
 +    else
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will not sort the charge groups\n");
 +        }
 +    }
 +
 +    comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
 +
 +    comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
 +    if (comm->bInterCGBondeds)
 +    {
 +        comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
 +    }
 +    else
 +    {
 +        comm->bInterCGMultiBody = FALSE;
 +    }
 +
 +    dd->bInterCGcons    = inter_charge_group_constraints(mtop);
 +    dd->bInterCGsettles = inter_charge_group_settles(mtop);
 +
 +    if (ir->rlistlong == 0)
 +    {
 +        /* Set the cut-off to some very large value,
 +         * so we don't need if statements everywhere in the code.
 +         * We use sqrt, since the cut-off is squared in some places.
 +         */
 +        comm->cutoff   = GMX_CUTOFF_INF;
 +    }
 +    else
 +    {
 +        comm->cutoff   = ir->rlistlong;
 +    }
 +    comm->cutoff_mbody = 0;
 +
 +    comm->cellsize_limit = 0;
 +    comm->bBondComm      = FALSE;
 +
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm_distance_min > 0)
 +        {
 +            comm->cutoff_mbody = comm_distance_min;
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
 +            }
 +            else
 +            {
 +                comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
 +            }
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else if (ir->bPeriodicMols)
 +        {
 +            /* Can not easily determine the required cut-off */
 +            dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
 +            comm->cutoff_mbody = comm->cutoff/2;
 +            r_bonded_limit     = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            if (MASTER(cr))
 +            {
 +                dd_bonded_cg_distance(fplog, mtop, ir, x, box,
 +                                      Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
 +            }
 +            gmx_bcast(sizeof(r_2b), &r_2b, cr);
 +            gmx_bcast(sizeof(r_mb), &r_mb, cr);
 +
 +            /* We use an initial margin of 10% for the minimum cell size,
 +             * except when we are just below the non-bonded cut-off.
 +             */
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                if (max(r_2b, r_mb) > comm->cutoff)
 +                {
 +                    r_bonded        = max(r_2b, r_mb);
 +                    r_bonded_limit  = 1.1*r_bonded;
 +                    comm->bBondComm = TRUE;
 +                }
 +                else
 +                {
 +                    r_bonded       = r_mb;
 +                    r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
 +                }
 +                /* We determine cutoff_mbody later */
 +            }
 +            else
 +            {
 +                /* No special bonded communication,
 +                 * simply increase the DD cut-off.
 +                 */
 +                r_bonded_limit     = 1.1*max(r_2b, r_mb);
 +                comm->cutoff_mbody = r_bonded_limit;
 +                comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
 +            }
 +        }
 +        comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Minimum cell size due to bonded interactions: %.3f nm\n",
 +                    comm->cellsize_limit);
 +        }
 +    }
 +
 +    if (dd->bInterCGcons && rconstr <= 0)
 +    {
 +        /* There is a cell size limit due to the constraints (P-LINCS) */
 +        rconstr = constr_r_max(fplog, mtop, ir);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Estimated maximum distance required for P-LINCS: %.3f nm\n",
 +                    rconstr);
 +            if (rconstr > comm->cellsize_limit)
 +            {
 +                fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
 +            }
 +        }
 +    }
 +    else if (rconstr > 0 && fplog)
 +    {
 +        /* Here we do not check for dd->bInterCGcons,
 +         * because one can also set a cell size limit for virtual sites only
 +         * and at this point we don't know yet if there are intercg v-sites.
 +         */
 +        fprintf(fplog,
 +                "User supplied maximum distance required for P-LINCS: %.3f nm\n",
 +                rconstr);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
 +
 +    comm->cgs_gl = gmx_mtop_global_cgs(mtop);
 +
 +    if (nc[XX] > 0)
 +    {
 +        copy_ivec(nc, dd->nc);
 +        set_dd_dim(fplog, dd);
 +        set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
 +
 +        if (cr->npmenodes == -1)
 +        {
 +            cr->npmenodes = 0;
 +        }
 +        acs = average_cellsize_min(dd, ddbox);
 +        if (acs < comm->cellsize_limit)
 +        {
 +            if (fplog)
 +            {
 +                fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
 +            }
 +            gmx_fatal_collective(FARGS, cr, NULL,
 +                                 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
 +                                 acs, comm->cellsize_limit);
 +        }
 +    }
 +    else
 +    {
 +        set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
 +
 +        /* We need to choose the optimal DD grid and possibly PME nodes */
 +        limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
 +                               comm->eDLB != edlbNO, dlb_scale,
 +                               comm->cellsize_limit, comm->cutoff,
 +                               comm->bInterCGBondeds);
 +
 +        if (dd->nc[XX] == 0)
 +        {
 +            bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
 +            sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
 +                    !bC ? "-rdd" : "-rcon",
 +                    comm->eDLB != edlbNO ? " or -dds" : "",
 +                    bC ? " or your LINCS settings" : "");
 +
 +            gmx_fatal_collective(FARGS, cr, NULL,
 +                                 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
 +                                 "%s\n"
 +                                 "Look in the log file for details on the domain decomposition",
 +                                 cr->nnodes-cr->npmenodes, limit, buf);
 +        }
 +        set_dd_dim(fplog, dd);
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
 +                dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
 +    }
 +
 +    dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
 +    if (cr->nnodes - dd->nnodes != cr->npmenodes)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL,
 +                             "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
 +                             dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
 +    }
 +    if (cr->npmenodes > dd->nnodes)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL,
 +                             "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
 +    }
 +    if (cr->npmenodes > 0)
 +    {
 +        comm->npmenodes = cr->npmenodes;
 +    }
 +    else
 +    {
 +        comm->npmenodes = dd->nnodes;
 +    }
 +
 +    if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
 +    {
 +        /* The following choices should match those
 +         * in comm_cost_est in domdec_setup.c.
 +         * Note that here the checks have to take into account
 +         * that the decomposition might occur in a different order than xyz
 +         * (for instance through the env.var. GMX_DD_ORDER_ZYX),
 +         * in which case they will not match those in comm_cost_est,
 +         * but since that is mainly for testing purposes that's fine.
 +         */
 +        if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
 +            comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
 +            getenv("GMX_PMEONEDD") == NULL)
 +        {
 +            comm->npmedecompdim = 2;
 +            comm->npmenodes_x   = dd->nc[XX];
 +            comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
 +        }
 +        else
 +        {
 +            /* In case nc is 1 in both x and y we could still choose to
 +             * decompose pme in y instead of x, but we use x for simplicity.
 +             */
 +            comm->npmedecompdim = 1;
 +            if (dd->dim[0] == YY)
 +            {
 +                comm->npmenodes_x = 1;
 +                comm->npmenodes_y = comm->npmenodes;
 +            }
 +            else
 +            {
 +                comm->npmenodes_x = comm->npmenodes;
 +                comm->npmenodes_y = 1;
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
 +                    comm->npmenodes_x, comm->npmenodes_y, 1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmedecompdim = 0;
 +        comm->npmenodes_x   = 0;
 +        comm->npmenodes_y   = 0;
 +    }
 +
 +    /* Technically we don't need both of these,
 +     * but it simplifies code not having to recalculate it.
 +     */
 +    *npme_x = comm->npmenodes_x;
 +    *npme_y = comm->npmenodes_y;
 +
 +    snew(comm->slb_frac, DIM);
 +    if (comm->eDLB == edlbNO)
 +    {
 +        comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
 +        comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
 +        comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
 +    }
 +
 +    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
 +    {
 +        if (comm->bBondComm || comm->eDLB != edlbNO)
 +        {
 +            /* Set the bonded communication distance to halfway
 +             * the minimum and the maximum,
 +             * since the extra communication cost is nearly zero.
 +             */
 +            acs                = average_cellsize_min(dd, ddbox);
 +            comm->cutoff_mbody = 0.5*(r_bonded + acs);
 +            if (comm->eDLB != edlbNO)
 +            {
 +                /* Check if this does not limit the scaling */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
 +            }
 +            if (!comm->bBondComm)
 +            {
 +                /* Without bBondComm do not go beyond the n.b. cut-off */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
 +                if (comm->cellsize_limit >= comm->cutoff)
 +                {
 +                    /* We don't loose a lot of efficieny
 +                     * when increasing it to the n.b. cut-off.
 +                     * It can even be slightly faster, because we need
 +                     * less checks for the communication setup.
 +                     */
 +                    comm->cutoff_mbody = comm->cutoff;
 +                }
 +            }
 +            /* Check if we did not end up below our original limit */
 +            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
 +
 +            if (comm->cutoff_mbody > comm->cellsize_limit)
 +            {
 +                comm->cellsize_limit = comm->cutoff_mbody;
 +            }
 +        }
 +        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
 +                "cellsize limit %f\n",
 +                comm->bBondComm, comm->cellsize_limit);
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        check_dd_restrictions(cr, dd, ir, fplog);
 +    }
 +
 +    comm->partition_step = INT_MIN;
 +    dd->ddp_count        = 0;
 +
 +    clear_dd_cycle_counts(dd);
 +
 +    return dd;
 +}
 +
 +static void set_dlb_limits(gmx_domdec_t *dd)
 +
 +{
 +    int d;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
 +        dd->comm->cellsize_min[dd->dim[d]] =
 +            dd->comm->cellsize_min_dlb[dd->dim[d]];
 +    }
 +}
 +
 +
 +static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    real               cellsize_min;
 +    int                d, nc, i;
 +    char               buf[STRLEN];
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
 +    }
 +
 +    cellsize_min = comm->cellsize_min[dd->dim[0]];
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
 +    }
 +
 +    if (cellsize_min < comm->cellsize_limit*1.05)
 +    {
 +        dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
 +
 +        /* Change DLB from "auto" to "no". */
 +        comm->eDLB = edlbNO;
 +
 +        return;
 +    }
 +
 +    dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
 +    comm->bDynLoadBal = TRUE;
 +    dd->bGridJump     = TRUE;
 +
 +    set_dlb_limits(dd);
 +
 +    /* We can set the required cell size info here,
 +     * so we do not need to communicate this.
 +     * The grid is completely uniform.
 +     */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        if (comm->root[d])
 +        {
 +            comm->load[d].sum_m = comm->load[d].sum;
 +
 +            nc = dd->nc[dd->dim[d]];
 +            for (i = 0; i < nc; i++)
 +            {
 +                comm->root[d]->cell_f[i]    = i/(real)nc;
 +                if (d > 0)
 +                {
 +                    comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
 +                    comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
 +                }
 +            }
 +            comm->root[d]->cell_f[nc] = 1.0;
 +        }
 +    }
 +}
 +
 +static char *init_bLocalCG(gmx_mtop_t *mtop)
 +{
 +    int   ncg, cg;
 +    char *bLocalCG;
 +
 +    ncg = ncg_mtop(mtop);
 +    snew(bLocalCG, ncg);
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        bLocalCG[cg] = FALSE;
 +    }
 +
 +    return bLocalCG;
 +}
 +
 +void dd_init_bondeds(FILE *fplog,
 +                     gmx_domdec_t *dd, gmx_mtop_t *mtop,
 +                     gmx_vsite_t *vsite,
 +                     t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bBondComm;
 +    int                d;
 +
 +    dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
 +
 +    comm = dd->comm;
 +
 +    if (comm->bBondComm)
 +    {
 +        /* Communicate atoms beyond the cut-off for bonded interactions */
 +        comm = dd->comm;
 +
 +        comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
 +
 +        comm->bLocalCG = init_bLocalCG(mtop);
 +    }
 +    else
 +    {
 +        /* Only communicate atoms based on cut-off */
 +        comm->cglink   = NULL;
 +        comm->bLocalCG = NULL;
 +    }
 +}
 +
 +static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
 +                              t_inputrec *ir,
 +                              gmx_bool bDynLoadBal, real dlb_scale,
 +                              gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d;
 +    ivec               np;
 +    real               limit, shrink;
 +    char               buf[64];
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    comm = dd->comm;
 +
 +    if (bDynLoadBal)
 +    {
 +        fprintf(fplog, "The maximum number of communication pulses is:");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
 +        fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
 +        fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                if (d >= ddbox->npbcdim && dd->nc[d] == 2)
 +                {
 +                    shrink = 0;
 +                }
 +                else
 +                {
 +                    shrink =
 +                        comm->cellsize_min_dlb[d]/
 +                        (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +                }
 +                fprintf(fplog, " %c %.2f", dim2char(d), shrink);
 +            }
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +    else
 +    {
++        set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
 +        fprintf(fplog, "The initial number of communication pulses is:");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(fplog, "The initial domain decomposition cell size is:");
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                fprintf(fplog, " %c %.2f nm",
 +                        dim2char(d), dd->comm->cellsize_min[d]);
 +            }
 +        }
 +        fprintf(fplog, "\n\n");
 +    }
 +
 +    if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
 +    {
 +        fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
 +        fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                "non-bonded interactions", "", comm->cutoff);
 +
 +        if (bDynLoadBal)
 +        {
 +            limit = dd->comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            if (dynamic_dd_box(ddbox, ir))
 +            {
 +                fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
 +            }
 +            limit = dd->comm->cellsize_min[XX];
 +            for (d = 1; d < DIM; d++)
 +            {
 +                limit = min(limit, dd->comm->cellsize_min[d]);
 +            }
 +        }
 +
 +        if (comm->bInterCGBondeds)
 +        {
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "two-body bonded interactions", "(-rdd)",
 +                    max(comm->cutoff, comm->cutoff_mbody));
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "multi-body bonded interactions", "(-rdd)",
 +                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
 +        }
 +        if (dd->vsite_comm)
 +        {
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "virtual site constructions", "(-rcon)", limit);
 +        }
 +        if (dd->constraint_comm)
 +        {
 +            sprintf(buf, "atoms separated by up to %d constraints",
 +                    1+ir->nProjOrder);
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    buf, "(-rcon)", limit);
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +
 +    fflush(fplog);
 +}
 +
 +static void set_cell_limits_dlb(gmx_domdec_t      *dd,
 +                                real               dlb_scale,
 +                                const t_inputrec  *ir,
 +                                const gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim, npulse, npulse_d_max, npulse_d;
 +    gmx_bool           bNoCutOff;
 +
 +    comm = dd->comm;
 +
 +    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
 +
 +    /* Determine the maximum number of comm. pulses in one dimension */
 +
 +    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
 +
 +    /* Determine the maximum required number of grid pulses */
 +    if (comm->cellsize_limit >= comm->cutoff)
 +    {
 +        /* Only a single pulse is required */
 +        npulse = 1;
 +    }
 +    else if (!bNoCutOff && comm->cellsize_limit > 0)
 +    {
 +        /* We round down slightly here to avoid overhead due to the latency
 +         * of extra communication calls when the cut-off
 +         * would be only slightly longer than the cell size.
 +         * Later cellsize_limit is redetermined,
 +         * so we can not miss interactions due to this rounding.
 +         */
 +        npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
 +    }
 +    else
 +    {
 +        /* There is no cell size limit */
 +        npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
 +    }
 +
 +    if (!bNoCutOff && npulse > 1)
 +    {
 +        /* See if we can do with less pulses, based on dlb_scale */
 +        npulse_d_max = 0;
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            dim      = dd->dim[d];
 +            npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
 +                             /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
 +            npulse_d_max = max(npulse_d_max, npulse_d);
 +        }
 +        npulse = min(npulse, npulse_d_max);
 +    }
 +
 +    /* This env var can override npulse */
 +    d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
 +    if (d > 0)
 +    {
 +        npulse = d;
 +    }
 +
 +    comm->maxpulse       = 1;
 +    comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
 +        comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
 +        snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
 +        comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
 +        if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
 +        {
 +            comm->bVacDLBNoLimit = FALSE;
 +        }
 +    }
 +
 +    /* cellsize_limit is set for LINCS in init_domain_decomposition */
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        comm->cellsize_limit = max(comm->cellsize_limit,
 +                                   comm->cutoff/comm->maxpulse);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
 +    /* Set the minimum cell size for each DD dimension */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        if (comm->bVacDLBNoLimit ||
 +            comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] =
 +                comm->cutoff/comm->cd[d].np_dlb;
 +        }
 +    }
 +    if (comm->cutoff_mbody <= 0)
 +    {
 +        comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
 +    }
 +    if (comm->bDynLoadBal)
 +    {
 +        set_dlb_limits(dd);
 +    }
 +}
 +
 +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
 +{
 +    /* If each molecule is a single charge group
 +     * or we use domain decomposition for each periodic dimension,
 +     * we do not need to take pbc into account for the bonded interactions.
 +     */
 +    return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
 +            !(dd->nc[XX] > 1 &&
 +              dd->nc[YY] > 1 &&
 +              (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
 +}
 +
 +void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
 +                       t_inputrec *ir, gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                natoms_tot;
 +    real               vol_frac;
 +
 +    comm = dd->comm;
 +
 +    /* Initialize the thread data.
 +     * This can not be done in init_domain_decomposition,
 +     * as the numbers of threads is determined later.
 +     */
 +    comm->nth = gmx_omp_nthreads_get(emntDomdec);
 +    if (comm->nth > 1)
 +    {
 +        snew(comm->dth, comm->nth);
 +    }
 +
 +    if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
 +    {
 +        init_ddpme(dd, &comm->ddpme[0], 0);
 +        if (comm->npmedecompdim >= 2)
 +        {
 +            init_ddpme(dd, &comm->ddpme[1], 1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmenodes = 0;
 +        if (dd->pme_nodeid >= 0)
 +        {
 +            gmx_fatal_collective(FARGS, NULL, dd,
 +                                 "Can not have separate PME nodes without PME electrostatics");
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
 +    }
 +    if (comm->eDLB != edlbNO)
 +    {
 +        set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
 +    }
 +
 +    print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
 +    if (comm->eDLB == edlbAUTO)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
 +        }
 +        print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
 +    }
 +
 +    if (ir->ePBC == epbcNONE)
 +    {
 +        vol_frac = 1 - 1/(double)dd->nnodes;
 +    }
 +    else
 +    {
 +        vol_frac =
 +            (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
 +    }
 +    natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
 +
 +    dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
 +}
 +
 +static gmx_bool test_dd_cutoff(t_commrec *cr,
 +                               t_state *state, t_inputrec *ir,
 +                               real cutoff_req)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_ddbox_t   ddbox;
 +    int           d, dim, np;
 +    real          inv_cell_size;
 +    int           LocallyLimited;
 +
 +    dd = cr->dd;
 +
 +    set_ddbox(dd, FALSE, cr, ir, state->box,
 +              TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
 +
 +    LocallyLimited = 0;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
 +        if (dynamic_dd_box(&ddbox, ir))
 +        {
 +            inv_cell_size *= DD_PRES_SCALE_MARGIN;
 +        }
 +
 +        np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
 +
 +        if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
 +            dd->comm->cd[d].np_dlb > 0)
 +        {
 +            if (np > dd->comm->cd[d].np_dlb)
 +            {
 +                return FALSE;
 +            }
 +
 +            /* If a current local cell size is smaller than the requested
 +             * cut-off, we could still fix it, but this gets very complicated.
 +             * Without fixing here, we might actually need more checks.
 +             */
 +            if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
 +            {
 +                LocallyLimited = 1;
 +            }
 +        }
 +    }
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        /* If DLB is not active yet, we don't need to check the grid jumps.
 +         * Actually we shouldn't, because then the grid jump data is not set.
 +         */
 +        if (dd->comm->bDynLoadBal &&
 +            check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
 +        {
 +            LocallyLimited = 1;
 +        }
 +
 +        gmx_sumi(1, &LocallyLimited, cr);
 +
 +        if (LocallyLimited > 0)
 +        {
 +            return FALSE;
 +        }
 +    }
 +
 +    return TRUE;
 +}
 +
 +gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
 +                          real cutoff_req)
 +{
 +    gmx_bool bCutoffAllowed;
 +
 +    bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
 +
 +    if (bCutoffAllowed)
 +    {
 +        cr->dd->comm->cutoff = cutoff_req;
 +    }
 +
 +    return bCutoffAllowed;
 +}
 +
 +void change_dd_dlb_cutoff_limit(t_commrec *cr)
 +{
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = cr->dd->comm;
 +
 +    /* Turn on the DLB limiting (might have been on already) */
 +    comm->bPMELoadBalDLBLimits = TRUE;
 +
 +    /* Change the cut-off limit */
 +    comm->PMELoadBal_max_cutoff = comm->cutoff;
 +}
 +
 +static void merge_cg_buffers(int ncell,
 +                             gmx_domdec_comm_dim_t *cd, int pulse,
 +                             int  *ncg_cell,
 +                             int  *index_gl, int  *recv_i,
 +                             rvec *cg_cm,    rvec *recv_vr,
 +                             int *cgindex,
 +                             cginfo_mb_t *cginfo_mb, int *cginfo)
 +{
 +    gmx_domdec_ind_t *ind, *ind_p;
 +    int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
 +    int               shift, shift_at;
 +
 +    ind = &cd->ind[pulse];
 +
 +    /* First correct the already stored data */
 +    shift = ind->nrecv[ncell];
 +    for (cell = ncell-1; cell >= 0; cell--)
 +    {
 +        shift -= ind->nrecv[cell];
 +        if (shift > 0)
 +        {
 +            /* Move the cg's present from previous grid pulses */
 +            cg0                = ncg_cell[ncell+cell];
 +            cg1                = ncg_cell[ncell+cell+1];
 +            cgindex[cg1+shift] = cgindex[cg1];
 +            for (cg = cg1-1; cg >= cg0; cg--)
 +            {
 +                index_gl[cg+shift] = index_gl[cg];
 +                copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
 +                cgindex[cg+shift] = cgindex[cg];
 +                cginfo[cg+shift]  = cginfo[cg];
 +            }
 +            /* Correct the already stored send indices for the shift */
 +            for (p = 1; p <= pulse; p++)
 +            {
 +                ind_p = &cd->ind[p];
 +                cg0   = 0;
 +                for (c = 0; c < cell; c++)
 +                {
 +                    cg0 += ind_p->nsend[c];
 +                }
 +                cg1 = cg0 + ind_p->nsend[cell];
 +                for (cg = cg0; cg < cg1; cg++)
 +                {
 +                    ind_p->index[cg] += shift;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Merge in the communicated buffers */
 +    shift    = 0;
 +    shift_at = 0;
 +    cg0      = 0;
 +    for (cell = 0; cell < ncell; cell++)
 +    {
 +        cg1 = ncg_cell[ncell+cell+1] + shift;
 +        if (shift_at > 0)
 +        {
 +            /* Correct the old cg indices */
 +            for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
 +            {
 +                cgindex[cg+1] += shift_at;
 +            }
 +        }
 +        for (cg = 0; cg < ind->nrecv[cell]; cg++)
 +        {
 +            /* Copy this charge group from the buffer */
 +            index_gl[cg1] = recv_i[cg0];
 +            copy_rvec(recv_vr[cg0], cg_cm[cg1]);
 +            /* Add it to the cgindex */
 +            cg_gl          = index_gl[cg1];
 +            cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
 +            nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
 +            cgindex[cg1+1] = cgindex[cg1] + nat;
 +            cg0++;
 +            cg1++;
 +            shift_at += nat;
 +        }
 +        shift                 += ind->nrecv[cell];
 +        ncg_cell[ncell+cell+1] = cg1;
 +    }
 +}
 +
 +static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
 +                               int nzone, int cg0, const int *cgindex)
 +{
 +    int cg, zone, p;
 +
 +    /* Store the atom block boundaries for easy copying of communication buffers
 +     */
 +    cg = cg0;
 +    for (zone = 0; zone < nzone; zone++)
 +    {
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            cd->ind[p].cell2at0[zone] = cgindex[cg];
 +            cg += cd->ind[p].nrecv[zone];
 +            cd->ind[p].cell2at1[zone] = cgindex[cg];
 +        }
 +    }
 +}
 +
 +static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
 +{
 +    int      i;
 +    gmx_bool bMiss;
 +
 +    bMiss = FALSE;
 +    for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
 +    {
 +        if (!bLocalCG[link->a[i]])
 +        {
 +            bMiss = TRUE;
 +        }
 +    }
 +
 +    return bMiss;
 +}
 +
 +/* Domain corners for communication, a maximum of 4 i-zones see a j domain */
 +typedef struct {
 +    real c[DIM][4]; /* the corners for the non-bonded communication */
 +    real cr0;       /* corner for rounding */
 +    real cr1[4];    /* corners for rounding */
 +    real bc[DIM];   /* corners for bounded communication */
 +    real bcr1;      /* corner for rounding for bonded communication */
 +} dd_corners_t;
 +
 +/* Determine the corners of the domain(s) we are communicating with */
 +static void
 +set_dd_corners(const gmx_domdec_t *dd,
 +               int dim0, int dim1, int dim2,
 +               gmx_bool bDistMB,
 +               dd_corners_t *c)
 +{
 +    const gmx_domdec_comm_t  *comm;
 +    const gmx_domdec_zones_t *zones;
 +    int i, j;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Keep the compiler happy */
 +    c->cr0  = 0;
 +    c->bcr1 = 0;
 +
 +    /* The first dimension is equal for all cells */
 +    c->c[0][0] = comm->cell_x0[dim0];
 +    if (bDistMB)
 +    {
 +        c->bc[0] = c->c[0][0];
 +    }
 +    if (dd->ndim >= 2)
 +    {
 +        dim1 = dd->dim[1];
 +        /* This cell row is only seen from the first row */
 +        c->c[1][0] = comm->cell_x0[dim1];
 +        /* All rows can see this row */
 +        c->c[1][1] = comm->cell_x0[dim1];
 +        if (dd->bGridJump)
 +        {
 +            c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
 +            if (bDistMB)
 +            {
 +                /* For the multi-body distance we need the maximum */
 +                c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
 +            }
 +        }
 +        /* Set the upper-right corner for rounding */
 +        c->cr0 = comm->cell_x1[dim0];
 +
 +        if (dd->ndim >= 3)
 +        {
 +            dim2 = dd->dim[2];
 +            for (j = 0; j < 4; j++)
 +            {
 +                c->c[2][j] = comm->cell_x0[dim2];
 +            }
 +            if (dd->bGridJump)
 +            {
 +                /* Use the maximum of the i-cells that see a j-cell */
 +                for (i = 0; i < zones->nizone; i++)
 +                {
 +                    for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
 +                    {
 +                        if (j >= 4)
 +                        {
 +                            c->c[2][j-4] =
 +                                max(c->c[2][j-4],
 +                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
 +                        }
 +                    }
 +                }
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bc[2] = comm->cell_x0[dim2];
 +                    for (i = 0; i < 2; i++)
 +                    {
 +                        for (j = 0; j < 2; j++)
 +                        {
 +                            c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
 +                        }
 +                    }
 +                }
 +            }
 +
 +            /* Set the upper-right corner for rounding */
 +            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
 +             * Only cell (0,0,0) can see cell 7 (1,1,1)
 +             */
 +            c->cr1[0] = comm->cell_x1[dim1];
 +            c->cr1[3] = comm->cell_x1[dim1];
 +            if (dd->bGridJump)
 +            {
 +                c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Determine which cg's we need to send in this pulse from this zone */
 +static void
 +get_zone_pulse_cgs(gmx_domdec_t *dd,
 +                   int zonei, int zone,
 +                   int cg0, int cg1,
 +                   const int *index_gl,
 +                   const int *cgindex,
 +                   int dim, int dim_ind,
 +                   int dim0, int dim1, int dim2,
 +                   real r_comm2, real r_bcomm2,
 +                   matrix box,
 +                   ivec tric_dist,
 +                   rvec *normal,
 +                   real skew_fac2_d, real skew_fac_01,
 +                   rvec *v_d, rvec *v_0, rvec *v_1,
 +                   const dd_corners_t *c,
 +                   rvec sf2_round,
 +                   gmx_bool bDistBonded,
 +                   gmx_bool bBondComm,
 +                   gmx_bool bDist2B,
 +                   gmx_bool bDistMB,
 +                   rvec *cg_cm,
 +                   int *cginfo,
 +                   gmx_domdec_ind_t *ind,
 +                   int **ibuf, int *ibuf_nalloc,
 +                   vec_rvec_t *vbuf,
 +                   int *nsend_ptr,
 +                   int *nat_ptr,
 +                   int *nsend_z_ptr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bScrew;
 +    gmx_bool           bDistMB_pulse;
 +    int                cg, i;
 +    real               r2, rb2, r, tric_sh;
 +    rvec               rn, rb;
 +    int                dimd;
 +    int                nsend_z, nsend, nat;
 +
 +    comm = dd->comm;
 +
 +    bScrew = (dd->bScrewPBC && dim == XX);
 +
 +    bDistMB_pulse = (bDistMB && bDistBonded);
 +
 +    nsend_z = 0;
 +    nsend   = *nsend_ptr;
 +    nat     = *nat_ptr;
 +
 +    for (cg = cg0; cg < cg1; cg++)
 +    {
 +        r2  = 0;
 +        rb2 = 0;
 +        if (tric_dist[dim_ind] == 0)
 +        {
 +            /* Rectangular direction, easy */
 +            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            if (r > 0)
 +            {
 +                r2 += r*r;
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                r = cg_cm[cg][dim] - c->bc[dim_ind];
 +                if (r > 0)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            /* Rounding gives at most a 16% reduction
 +             * in communicated atoms
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                r = cg_cm[cg][dim0] - c->cr0;
 +                /* This is the first dimension, so always r >= 0 */
 +                r2 += r*r;
 +                if (bDistMB_pulse)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                r = cg_cm[cg][dim1] - c->cr1[zone];
 +                if (r > 0)
 +                {
 +                    r2 += r*r;
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    r = cg_cm[cg][dim1] - c->bcr1;
 +                    if (r > 0)
 +                    {
 +                        rb2 += r*r;
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Triclinic direction, more complicated */
 +            clear_rvec(rn);
 +            clear_rvec(rb);
 +            /* Rounding, conservative as the skew_fac multiplication
 +             * will slightly underestimate the distance.
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
 +                for (i = dim0+1; i < DIM; i++)
 +                {
 +                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
 +                }
 +                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim0] = rn[dim0];
 +                    rb2      = r2;
 +                }
 +                /* Take care that the cell planes along dim0 might not
 +                 * be orthogonal to those along dim1 and dim2.
 +                 */
 +                for (i = 1; i <= dim_ind; i++)
 +                {
 +                    dimd = dd->dim[i];
 +                    if (normal[dim0][dimd] > 0)
 +                    {
 +                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
 +                        if (bDistMB_pulse)
 +                        {
 +                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
 +                        }
 +                    }
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
 +                tric_sh   = 0;
 +                for (i = dim1+1; i < DIM; i++)
 +                {
 +                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
 +                }
 +                rn[dim1] += tric_sh;
 +                if (rn[dim1] > 0)
 +                {
 +                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
 +                    /* Take care that the cell planes along dim1
 +                     * might not be orthogonal to that along dim2.
 +                     */
 +                    if (normal[dim1][dim2] > 0)
 +                    {
 +                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
 +                    }
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim1] +=
 +                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
 +                    if (rb[dim1] > 0)
 +                    {
 +                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
 +                        /* Take care of coupling of the distances
 +                         * to the planes along dim0 and dim1 through dim2.
 +                         */
 +                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
 +                        /* Take care that the cell planes along dim1
 +                         * might not be orthogonal to that along dim2.
 +                         */
 +                        if (normal[dim1][dim2] > 0)
 +                        {
 +                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
 +                        }
 +                    }
 +                }
 +            }
 +            /* The distance along the communication direction */
 +            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            tric_sh  = 0;
 +            for (i = dim+1; i < DIM; i++)
 +            {
 +                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
 +            }
 +            rn[dim] += tric_sh;
 +            if (rn[dim] > 0)
 +            {
 +                r2 += rn[dim]*rn[dim]*skew_fac2_d;
 +                /* Take care of coupling of the distances
 +                 * to the planes along dim0 and dim1 through dim2.
 +                 */
 +                if (dim_ind == 1 && zonei == 1)
 +                {
 +                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
 +                }
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                clear_rvec(rb);
 +                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
 +                if (rb[dim] > 0)
 +                {
 +                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    if (dim_ind == 1 && zonei == 1)
 +                    {
 +                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (r2 < r_comm2 ||
 +            (bDistBonded &&
 +             ((bDistMB && rb2 < r_bcomm2) ||
 +              (bDist2B && r2  < r_bcomm2)) &&
 +             (!bBondComm ||
 +              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
 +               missing_link(comm->cglink, index_gl[cg],
 +                            comm->bLocalCG)))))
 +        {
 +            /* Make an index to the local charge groups */
 +            if (nsend+1 > ind->nalloc)
 +            {
 +                ind->nalloc = over_alloc_large(nsend+1);
 +                srenew(ind->index, ind->nalloc);
 +            }
 +            if (nsend+1 > *ibuf_nalloc)
 +            {
 +                *ibuf_nalloc = over_alloc_large(nsend+1);
 +                srenew(*ibuf, *ibuf_nalloc);
 +            }
 +            ind->index[nsend] = cg;
 +            (*ibuf)[nsend]    = index_gl[cg];
 +            nsend_z++;
 +            vec_rvec_check_alloc(vbuf, nsend+1);
 +
 +            if (dd->ci[dim] == 0)
 +            {
 +                /* Correct cg_cm for pbc */
 +                rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
 +                if (bScrew)
 +                {
 +                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
 +                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
 +                }
 +            }
 +            else
 +            {
 +                copy_rvec(cg_cm[cg], vbuf->v[nsend]);
 +            }
 +            nsend++;
 +            nat += cgindex[cg+1] - cgindex[cg];
 +        }
 +    }
 +
 +    *nsend_ptr   = nsend;
 +    *nat_ptr     = nat;
 +    *nsend_z_ptr = nsend_z;
 +}
 +
 +static void setup_dd_communication(gmx_domdec_t *dd,
 +                                   matrix box, gmx_ddbox_t *ddbox,
 +                                   t_forcerec *fr, t_state *state, rvec **f)
 +{
 +    int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
 +    int                    nzone, nzone_send, zone, zonei, cg0, cg1;
 +    int                    c, i, j, cg, cg_gl, nrcg;
 +    int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_zones_t    *zones;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    cginfo_mb_t           *cginfo_mb;
 +    gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
 +    real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
 +    dd_corners_t           corners;
 +    ivec                   tric_dist;
 +    rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
 +    real                   skew_fac2_d, skew_fac_01;
 +    rvec                   sf2_round;
 +    int                    nsend, nat;
 +    int                    th;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Setting up DD communication\n");
 +    }
 +
 +    comm  = dd->comm;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            cg_cm = fr->cg_cm;
 +            break;
 +        case ecutsVERLET:
 +            cg_cm = state->x;
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            cg_cm = NULL;
 +    }
 +
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Check if we need to use triclinic distances */
 +        tric_dist[dim_ind] = 0;
 +        for (i = 0; i <= dim_ind; i++)
 +        {
 +            if (ddbox->tric_dir[dd->dim[i]])
 +            {
 +                tric_dist[dim_ind] = 1;
 +            }
 +        }
 +    }
 +
 +    bBondComm = comm->bBondComm;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    /* Do we need to determine extra distances for only two-body bondeds? */
 +    bDist2B = (bBondComm && !bDistMB);
 +
 +    r_comm2  = sqr(comm->cutoff);
 +    r_bcomm2 = sqr(comm->cutoff_mbody);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
 +    }
 +
 +    zones = &comm->zones;
 +
 +    dim0 = dd->dim[0];
 +    dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
 +    dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
 +
 +    set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
 +
 +    /* Triclinic stuff */
 +    normal      = ddbox->normal;
 +    skew_fac_01 = 0;
 +    if (dd->ndim >= 2)
 +    {
 +        v_0 = ddbox->v[dim0];
 +        if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
 +        {
 +            /* Determine the coupling coefficient for the distances
 +             * to the cell planes along dim0 and dim1 through dim2.
 +             * This is required for correct rounding.
 +             */
 +            skew_fac_01 =
 +                ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
 +            if (debug)
 +            {
 +                fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
 +            }
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        v_1 = ddbox->v[dim1];
 +    }
 +
 +    zone_cg_range = zones->cg_range;
 +    index_gl      = dd->index_gl;
 +    cgindex       = dd->cgindex;
 +    cginfo_mb     = fr->cginfo_mb;
 +
 +    zone_cg_range[0]   = 0;
 +    zone_cg_range[1]   = dd->ncg_home;
 +    comm->zone_ncg1[0] = dd->ncg_home;
 +    pos_cg             = dd->ncg_home;
 +
 +    nat_tot = dd->nat_home;
 +    nzone   = 1;
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        cd  = &comm->cd[dim_ind];
 +
 +        if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
 +        {
 +            /* No pbc in this dimension, the first node should not comm. */
 +            nzone_send = 0;
 +        }
 +        else
 +        {
 +            nzone_send = nzone;
 +        }
 +
 +        v_d         = ddbox->v[dim];
 +        skew_fac2_d = sqr(ddbox->skew_fac[dim]);
 +
 +        cd->bInPlace = TRUE;
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            /* Only atoms communicated in the first pulse are used
 +             * for multi-body bonded interactions or for bBondComm.
 +             */
 +            bDistBonded = ((bDistMB || bDist2B) && p == 0);
 +
 +            ind   = &cd->ind[p];
 +            nsend = 0;
 +            nat   = 0;
 +            for (zone = 0; zone < nzone_send; zone++)
 +            {
 +                if (tric_dist[dim_ind] && dim_ind > 0)
 +                {
 +                    /* Determine slightly more optimized skew_fac's
 +                     * for rounding.
 +                     * This reduces the number of communicated atoms
 +                     * by about 10% for 3D DD of rhombic dodecahedra.
 +                     */
 +                    for (dimd = 0; dimd < dim; dimd++)
 +                    {
 +                        sf2_round[dimd] = 1;
 +                        if (ddbox->tric_dir[dimd])
 +                        {
 +                            for (i = dd->dim[dimd]+1; i < DIM; i++)
 +                            {
 +                                /* If we are shifted in dimension i
 +                                 * and the cell plane is tilted forward
 +                                 * in dimension i, skip this coupling.
 +                                 */
 +                                if (!(zones->shift[nzone+zone][i] &&
 +                                      ddbox->v[dimd][i][dimd] >= 0))
 +                                {
 +                                    sf2_round[dimd] +=
 +                                        sqr(ddbox->v[dimd][i][dimd]);
 +                                }
 +                            }
 +                            sf2_round[dimd] = 1/sf2_round[dimd];
 +                        }
 +                    }
 +                }
 +
 +                zonei = zone_perm[dim_ind][zone];
 +                if (p == 0)
 +                {
 +                    /* Here we permutate the zones to obtain a convenient order
 +                     * for neighbor searching
 +                     */
 +                    cg0 = zone_cg_range[zonei];
 +                    cg1 = zone_cg_range[zonei+1];
 +                }
 +                else
 +                {
 +                    /* Look only at the cg's received in the previous grid pulse
 +                     */
 +                    cg1 = zone_cg_range[nzone+zone+1];
 +                    cg0 = cg1 - cd->ind[p-1].nrecv[zone];
 +                }
 +
 +#pragma omp parallel for num_threads(comm->nth) schedule(static)
 +                for (th = 0; th < comm->nth; th++)
 +                {
 +                    gmx_domdec_ind_t *ind_p;
 +                    int             **ibuf_p, *ibuf_nalloc_p;
 +                    vec_rvec_t       *vbuf_p;
 +                    int              *nsend_p, *nat_p;
 +                    int              *nsend_zone_p;
 +                    int               cg0_th, cg1_th;
 +
 +                    if (th == 0)
 +                    {
 +                        /* Thread 0 writes in the comm buffers */
 +                        ind_p         = ind;
 +                        ibuf_p        = &comm->buf_int;
 +                        ibuf_nalloc_p = &comm->nalloc_int;
 +                        vbuf_p        = &comm->vbuf;
 +                        nsend_p       = &nsend;
 +                        nat_p         = &nat;
 +                        nsend_zone_p  = &ind->nsend[zone];
 +                    }
 +                    else
 +                    {
 +                        /* Other threads write into temp buffers */
 +                        ind_p         = &comm->dth[th].ind;
 +                        ibuf_p        = &comm->dth[th].ibuf;
 +                        ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
 +                        vbuf_p        = &comm->dth[th].vbuf;
 +                        nsend_p       = &comm->dth[th].nsend;
 +                        nat_p         = &comm->dth[th].nat;
 +                        nsend_zone_p  = &comm->dth[th].nsend_zone;
 +
 +                        comm->dth[th].nsend      = 0;
 +                        comm->dth[th].nat        = 0;
 +                        comm->dth[th].nsend_zone = 0;
 +                    }
 +
 +                    if (comm->nth == 1)
 +                    {
 +                        cg0_th = cg0;
 +                        cg1_th = cg1;
 +                    }
 +                    else
 +                    {
 +                        cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
 +                        cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
 +                    }
 +
 +                    /* Get the cg's for this pulse in this zone */
 +                    get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
 +                                       index_gl, cgindex,
 +                                       dim, dim_ind, dim0, dim1, dim2,
 +                                       r_comm2, r_bcomm2,
 +                                       box, tric_dist,
 +                                       normal, skew_fac2_d, skew_fac_01,
 +                                       v_d, v_0, v_1, &corners, sf2_round,
 +                                       bDistBonded, bBondComm,
 +                                       bDist2B, bDistMB,
 +                                       cg_cm, fr->cginfo,
 +                                       ind_p,
 +                                       ibuf_p, ibuf_nalloc_p,
 +                                       vbuf_p,
 +                                       nsend_p, nat_p,
 +                                       nsend_zone_p);
 +                }
 +
 +                /* Append data of threads>=1 to the communication buffers */
 +                for (th = 1; th < comm->nth; th++)
 +                {
 +                    dd_comm_setup_work_t *dth;
 +                    int                   i, ns1;
 +
 +                    dth = &comm->dth[th];
 +
 +                    ns1 = nsend + dth->nsend_zone;
 +                    if (ns1 > ind->nalloc)
 +                    {
 +                        ind->nalloc = over_alloc_dd(ns1);
 +                        srenew(ind->index, ind->nalloc);
 +                    }
 +                    if (ns1 > comm->nalloc_int)
 +                    {
 +                        comm->nalloc_int = over_alloc_dd(ns1);
 +                        srenew(comm->buf_int, comm->nalloc_int);
 +                    }
 +                    if (ns1 > comm->vbuf.nalloc)
 +                    {
 +                        comm->vbuf.nalloc = over_alloc_dd(ns1);
 +                        srenew(comm->vbuf.v, comm->vbuf.nalloc);
 +                    }
 +
 +                    for (i = 0; i < dth->nsend_zone; i++)
 +                    {
 +                        ind->index[nsend]    = dth->ind.index[i];
 +                        comm->buf_int[nsend] = dth->ibuf[i];
 +                        copy_rvec(dth->vbuf.v[i],
 +                                  comm->vbuf.v[nsend]);
 +                        nsend++;
 +                    }
 +                    nat              += dth->nat;
 +                    ind->nsend[zone] += dth->nsend_zone;
 +                }
 +            }
 +            /* Clear the counts in case we do not have pbc */
 +            for (zone = nzone_send; zone < nzone; zone++)
 +            {
 +                ind->nsend[zone] = 0;
 +            }
 +            ind->nsend[nzone]   = nsend;
 +            ind->nsend[nzone+1] = nat;
 +            /* Communicate the number of cg's and atoms to receive */
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            ind->nsend, nzone+2,
 +                            ind->nrecv, nzone+2);
 +
 +            /* The rvec buffer is also required for atom buffers of size nsend
 +             * in dd_move_x and dd_move_f.
 +             */
 +            vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
 +
 +            if (p > 0)
 +            {
 +                /* We can receive in place if only the last zone is not empty */
 +                for (zone = 0; zone < nzone-1; zone++)
 +                {
 +                    if (ind->nrecv[zone] > 0)
 +                    {
 +                        cd->bInPlace = FALSE;
 +                    }
 +                }
 +                if (!cd->bInPlace)
 +                {
 +                    /* The int buffer is only required here for the cg indices */
 +                    if (ind->nrecv[nzone] > comm->nalloc_int2)
 +                    {
 +                        comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
 +                        srenew(comm->buf_int2, comm->nalloc_int2);
 +                    }
 +                    /* The rvec buffer is also required for atom buffers
 +                     * of size nrecv in dd_move_x and dd_move_f.
 +                     */
 +                    i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
 +                    vec_rvec_check_alloc(&comm->vbuf2, i);
 +                }
 +            }
 +
 +            /* Make space for the global cg indices */
 +            if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
 +                || dd->cg_nalloc == 0)
 +            {
 +                dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
 +                srenew(index_gl, dd->cg_nalloc);
 +                srenew(cgindex, dd->cg_nalloc+1);
 +            }
 +            /* Communicate the global cg indices */
 +            if (cd->bInPlace)
 +            {
 +                recv_i = index_gl + pos_cg;
 +            }
 +            else
 +            {
 +                recv_i = comm->buf_int2;
 +            }
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            comm->buf_int, nsend,
 +                            recv_i,        ind->nrecv[nzone]);
 +
 +            /* Make space for cg_cm */
 +            dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
 +            if (fr->cutoff_scheme == ecutsGROUP)
 +            {
 +                cg_cm = fr->cg_cm;
 +            }
 +            else
 +            {
 +                cg_cm = state->x;
 +            }
 +            /* Communicate cg_cm */
 +            if (cd->bInPlace)
 +            {
 +                recv_vr = cg_cm + pos_cg;
 +            }
 +            else
 +            {
 +                recv_vr = comm->vbuf2.v;
 +            }
 +            dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
 +                             comm->vbuf.v, nsend,
 +                             recv_vr,      ind->nrecv[nzone]);
 +
 +            /* Make the charge group index */
 +            if (cd->bInPlace)
 +            {
 +                zone = (p == 0 ? 0 : nzone - 1);
 +                while (zone < nzone)
 +                {
 +                    for (cg = 0; cg < ind->nrecv[zone]; cg++)
 +                    {
 +                        cg_gl              = index_gl[pos_cg];
 +                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
 +                        nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
 +                        cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
 +                        if (bBondComm)
 +                        {
 +                            /* Update the charge group presence,
 +                             * so we can use it in the next pass of the loop.
 +                             */
 +                            comm->bLocalCG[cg_gl] = TRUE;
 +                        }
 +                        pos_cg++;
 +                    }
 +                    if (p == 0)
 +                    {
 +                        comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
 +                    }
 +                    zone++;
 +                    zone_cg_range[nzone+zone] = pos_cg;
 +                }
 +            }
 +            else
 +            {
 +                /* This part of the code is never executed with bBondComm. */
 +                merge_cg_buffers(nzone, cd, p, zone_cg_range,
 +                                 index_gl, recv_i, cg_cm, recv_vr,
 +                                 cgindex, fr->cginfo_mb, fr->cginfo);
 +                pos_cg += ind->nrecv[nzone];
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        if (!cd->bInPlace)
 +        {
 +            /* Store the atom block for easy copying of communication buffers */
 +            make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
 +        }
 +        nzone += nzone;
 +    }
 +    dd->index_gl = index_gl;
 +    dd->cgindex  = cgindex;
 +
 +    dd->ncg_tot          = zone_cg_range[zones->n];
 +    dd->nat_tot          = nat_tot;
 +    comm->nat[ddnatHOME] = dd->nat_home;
 +    for (i = ddnatZONE; i < ddnatNR; i++)
 +    {
 +        comm->nat[i] = dd->nat_tot;
 +    }
 +
 +    if (!bBondComm)
 +    {
 +        /* We don't need to update cginfo, since that was alrady done above.
 +         * So we pass NULL for the forcerec.
 +         */
 +        dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
 +                      NULL, comm->bLocalCG);
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Finished setting up DD communication, zones:");
 +        for (c = 0; c < zones->n; c++)
 +        {
 +            fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static void set_cg_boundaries(gmx_domdec_zones_t *zones)
 +{
 +    int c;
 +
 +    for (c = 0; c < zones->nizone; c++)
 +    {
 +        zones->izone[c].cg1  = zones->cg_range[c+1];
 +        zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
 +        zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
 +    }
 +}
 +
 +static void set_zones_size(gmx_domdec_t *dd,
 +                           matrix box, const gmx_ddbox_t *ddbox,
 +                           int zone_start, int zone_end)
 +{
 +    gmx_domdec_comm_t  *comm;
 +    gmx_domdec_zones_t *zones;
 +    gmx_bool            bDistMB;
 +    int                 z, zi, zj0, zj1, d, dim;
 +    real                rcs, rcmbs;
 +    int                 i, j;
 +    real                size_j, add_tric;
 +    real                vol;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    for (z = zone_start; z < zone_end; z++)
 +    {
 +        /* Copy cell limits to zone limits.
 +         * Valid for non-DD dims and non-shifted dims.
 +         */
 +        copy_rvec(comm->cell_x0, zones->size[z].x0);
 +        copy_rvec(comm->cell_x1, zones->size[z].x1);
 +    }
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        for (z = 0; z < zones->n; z++)
 +        {
 +            /* With a staggered grid we have different sizes
 +             * for non-shifted dimensions.
 +             */
 +            if (dd->bGridJump && zones->shift[z][dim] == 0)
 +            {
 +                if (d == 1)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +                else if (d == 2)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +            }
 +        }
 +
 +        rcs   = comm->cutoff;
 +        rcmbs = comm->cutoff_mbody;
 +        if (ddbox->tric_dir[dim])
 +        {
 +            rcs   /= ddbox->skew_fac[dim];
 +            rcmbs /= ddbox->skew_fac[dim];
 +        }
 +
 +        /* Set the lower limit for the shifted zone dimensions */
 +        for (z = zone_start; z < zone_end; z++)
 +        {
 +            if (zones->shift[z][dim] > 0)
 +            {
 +                dim = dd->dim[d];
 +                if (!dd->bGridJump || d == 0)
 +                {
 +                    zones->size[z].x0[dim] = comm->cell_x1[dim];
 +                    zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
 +                }
 +                else
 +                {
 +                    /* Here we take the lower limit of the zone from
 +                     * the lowest domain of the zone below.
 +                     */
 +                    if (z < 4)
 +                    {
 +                        zones->size[z].x0[dim] =
 +                            comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
 +                    }
 +                    else
 +                    {
 +                        if (d == 1)
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                zones->size[zone_perm[2][z-4]].x0[dim];
 +                        }
 +                        else
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
 +                        }
 +                    }
 +                    /* A temporary limit, is updated below */
 +                    zones->size[z].x1[dim] = zones->size[z].x0[dim];
 +
 +                    if (bDistMB)
 +                    {
 +                        for (zi = 0; zi < zones->nizone; zi++)
 +                        {
 +                            if (zones->shift[zi][dim] == 0)
 +                            {
 +                                /* This takes the whole zone into account.
 +                                 * With multiple pulses this will lead
 +                                 * to a larger zone then strictly necessary.
 +                                 */
 +                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                             zones->size[zi].x1[dim]+rcmbs);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        /* Loop over the i-zones to set the upper limit of each
 +         * j-zone they see.
 +         */
 +        for (zi = 0; zi < zones->nizone; zi++)
 +        {
 +            if (zones->shift[zi][dim] == 0)
 +            {
 +                for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
 +                {
 +                    if (zones->shift[z][dim] > 0)
 +                    {
 +                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                     zones->size[zi].x1[dim]+rcs);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    for (z = zone_start; z < zone_end; z++)
 +    {
 +        /* Initialization only required to keep the compiler happy */
 +        rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
 +        int  nc, c;
 +
 +        /* To determine the bounding box for a zone we need to find
 +         * the extreme corners of 4, 2 or 1 corners.
 +         */
 +        nc = 1 << (ddbox->npbcdim - 1);
 +
 +        for (c = 0; c < nc; c++)
 +        {
 +            /* Set up a zone corner at x=0, ignoring trilinic couplings */
 +            corner[XX] = 0;
 +            if ((c & 1) == 0)
 +            {
 +                corner[YY] = zones->size[z].x0[YY];
 +            }
 +            else
 +            {
 +                corner[YY] = zones->size[z].x1[YY];
 +            }
 +            if ((c & 2) == 0)
 +            {
 +                corner[ZZ] = zones->size[z].x0[ZZ];
 +            }
 +            else
 +            {
 +                corner[ZZ] = zones->size[z].x1[ZZ];
 +            }
 +            if (dd->ndim == 1 && box[ZZ][YY] != 0)
 +            {
 +                /* With 1D domain decomposition the cg's are not in
 +                 * the triclinic box, but triclinic x-y and rectangular y-z.
 +                 * Shift y back, so it will later end up at 0.
 +                 */
 +                corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
 +            }
 +            /* Apply the triclinic couplings */
 +            assert(ddbox->npbcdim <= DIM);
 +            for (i = YY; i < ddbox->npbcdim; i++)
 +            {
 +                for (j = XX; j < i; j++)
 +                {
 +                    corner[j] += corner[i]*box[i][j]/box[i][i];
 +                }
 +            }
 +            if (c == 0)
 +            {
 +                copy_rvec(corner, corner_min);
 +                copy_rvec(corner, corner_max);
 +            }
 +            else
 +            {
 +                for (i = 0; i < DIM; i++)
 +                {
 +                    corner_min[i] = min(corner_min[i], corner[i]);
 +                    corner_max[i] = max(corner_max[i], corner[i]);
 +                }
 +            }
 +        }
 +        /* Copy the extreme cornes without offset along x */
 +        for (i = 0; i < DIM; i++)
 +        {
 +            zones->size[z].bb_x0[i] = corner_min[i];
 +            zones->size[z].bb_x1[i] = corner_max[i];
 +        }
 +        /* Add the offset along x */
 +        zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
 +        zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
 +    }
 +
 +    if (zone_start == 0)
 +    {
 +        vol = 1;
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
 +        }
 +        zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
 +    }
 +
 +    if (debug)
 +    {
 +        for (z = zone_start; z < zone_end; z++)
 +        {
 +            fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].x0[XX], zones->size[z].x1[XX],
 +                    zones->size[z].x0[YY], zones->size[z].x1[YY],
 +                    zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
 +            fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
 +                    zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
 +                    zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
 +        }
 +    }
 +}
 +
 +static int comp_cgsort(const void *a, const void *b)
 +{
 +    int           comp;
 +
 +    gmx_cgsort_t *cga, *cgb;
 +    cga = (gmx_cgsort_t *)a;
 +    cgb = (gmx_cgsort_t *)b;
 +
 +    comp = cga->nsc - cgb->nsc;
 +    if (comp == 0)
 +    {
 +        comp = cga->ind_gl - cgb->ind_gl;
 +    }
 +
 +    return comp;
 +}
 +
 +static void order_int_cg(int n, const gmx_cgsort_t *sort,
 +                         int *a, int *buf)
 +{
 +    int i;
 +
 +    /* Order the data */
 +    for (i = 0; i < n; i++)
 +    {
 +        buf[i] = a[sort[i].ind];
 +    }
 +
 +    /* Copy back to the original array */
 +    for (i = 0; i < n; i++)
 +    {
 +        a[i] = buf[i];
 +    }
 +}
 +
 +static void order_vec_cg(int n, const gmx_cgsort_t *sort,
 +                         rvec *v, rvec *buf)
 +{
 +    int i;
 +
 +    /* Order the data */
 +    for (i = 0; i < n; i++)
 +    {
 +        copy_rvec(v[sort[i].ind], buf[i]);
 +    }
 +
 +    /* Copy back to the original array */
 +    for (i = 0; i < n; i++)
 +    {
 +        copy_rvec(buf[i], v[i]);
 +    }
 +}
 +
 +static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
 +                           rvec *v, rvec *buf)
 +{
 +    int a, atot, cg, cg0, cg1, i;
 +
 +    if (cgindex == NULL)
 +    {
 +        /* Avoid the useless loop of the atoms within a cg */
 +        order_vec_cg(ncg, sort, v, buf);
 +
 +        return;
 +    }
 +
 +    /* Order the data */
 +    a = 0;
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        cg0 = cgindex[sort[cg].ind];
 +        cg1 = cgindex[sort[cg].ind+1];
 +        for (i = cg0; i < cg1; i++)
 +        {
 +            copy_rvec(v[i], buf[a]);
 +            a++;
 +        }
 +    }
 +    atot = a;
 +
 +    /* Copy back to the original array */
 +    for (a = 0; a < atot; a++)
 +    {
 +        copy_rvec(buf[a], v[a]);
 +    }
 +}
 +
 +static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
 +                         int nsort_new, gmx_cgsort_t *sort_new,
 +                         gmx_cgsort_t *sort1)
 +{
 +    int i1, i2, i_new;
 +
 +    /* The new indices are not very ordered, so we qsort them */
 +    gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
 +
 +    /* sort2 is already ordered, so now we can merge the two arrays */
 +    i1    = 0;
 +    i2    = 0;
 +    i_new = 0;
 +    while (i2 < nsort2 || i_new < nsort_new)
 +    {
 +        if (i2 == nsort2)
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +        else if (i_new == nsort_new)
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
 +                 (sort2[i2].nsc == sort_new[i_new].nsc &&
 +                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +    }
 +}
 +
 +static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t      *cgsort, *sort_i;
 +    int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
 +    int                sort_last, sort_skip;
 +
 +    sort = dd->comm->sort;
 +
 +    a = fr->ns.grid->cell_index;
 +
 +    moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
 +
 +    if (ncg_home_old >= 0)
 +    {
 +        /* The charge groups that remained in the same ns grid cell
 +         * are completely ordered. So we can sort efficiently by sorting
 +         * the charge groups that did move into the stationary list.
 +         */
 +        ncg_new   = 0;
 +        nsort2    = 0;
 +        nsort_new = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            /* Check if this cg did not move to another node */
 +            if (a[i] < moved)
 +            {
 +                if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
 +                {
 +                    /* This cg is new on this node or moved ns grid cell */
 +                    if (nsort_new >= sort->sort_new_nalloc)
 +                    {
 +                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
 +                        srenew(sort->sort_new, sort->sort_new_nalloc);
 +                    }
 +                    sort_i = &(sort->sort_new[nsort_new++]);
 +                }
 +                else
 +                {
 +                    /* This cg did not move */
 +                    sort_i = &(sort->sort2[nsort2++]);
 +                }
 +                /* Sort on the ns grid cell indices
 +                 * and the global topology index.
 +                 * index_gl is irrelevant with cell ns,
 +                 * but we set it here anyhow to avoid a conditional.
 +                 */
 +                sort_i->nsc    = a[i];
 +                sort_i->ind_gl = dd->index_gl[i];
 +                sort_i->ind    = i;
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
 +                    nsort2, nsort_new);
 +        }
 +        /* Sort efficiently */
 +        ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
 +                     sort->sort);
 +    }
 +    else
 +    {
 +        cgsort  = sort->sort;
 +        ncg_new = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            /* Sort on the ns grid cell indices
 +             * and the global topology index
 +             */
 +            cgsort[i].nsc    = a[i];
 +            cgsort[i].ind_gl = dd->index_gl[i];
 +            cgsort[i].ind    = i;
 +            if (cgsort[i].nsc < moved)
 +            {
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
 +        }
 +        /* Determine the order of the charge groups using qsort */
 +        gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
 +    }
 +
 +    return ncg_new;
 +}
 +
 +static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
 +{
 +    gmx_cgsort_t *sort;
 +    int           ncg_new, i, *a, na;
 +
 +    sort = dd->comm->sort->sort;
 +
 +    nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
 +
 +    ncg_new = 0;
 +    for (i = 0; i < na; i++)
 +    {
 +        if (a[i] >= 0)
 +        {
 +            sort[ncg_new].ind = a[i];
 +            ncg_new++;
 +        }
 +    }
 +
 +    return ncg_new;
 +}
 +
 +static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
 +                          int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t      *cgsort, *sort_i;
 +    int               *cgindex;
 +    int                ncg_new, i, *ibuf, cgsize;
 +    rvec              *vbuf;
 +
 +    sort = dd->comm->sort;
 +
 +    if (dd->ncg_home > sort->sort_nalloc)
 +    {
 +        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(sort->sort, sort->sort_nalloc);
 +        srenew(sort->sort2, sort->sort_nalloc);
 +    }
 +    cgsort = sort->sort;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            ncg_new = dd_sort_order(dd, fr, ncg_home_old);
 +            break;
 +        case ecutsVERLET:
 +            ncg_new = dd_sort_order_nbnxn(dd, fr);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            ncg_new = 0;
 +    }
 +
 +    /* We alloc with the old size, since cgindex is still old */
 +    vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
 +    vbuf = dd->comm->vbuf.v;
 +
 +    if (dd->comm->bCGs)
 +    {
 +        cgindex = dd->cgindex;
 +    }
 +    else
 +    {
 +        cgindex = NULL;
 +    }
 +
 +    /* Remove the charge groups which are no longer at home here */
 +    dd->ncg_home = ncg_new;
 +    if (debug)
 +    {
 +        fprintf(debug, "Set the new home charge group count to %d\n",
 +                dd->ncg_home);
 +    }
 +
 +    /* Reorder the state */
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +                case estX:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
 +                    break;
 +                case estV:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
 +                    break;
 +                case estSDX:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
 +                    break;
 +                case estCGP:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
 +                    break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No ordering required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_sort_state");
 +                    break;
 +            }
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* Reorder cgcm */
 +        order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
 +    }
 +
 +    if (dd->ncg_home+1 > sort->ibuf_nalloc)
 +    {
 +        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
 +        srenew(sort->ibuf, sort->ibuf_nalloc);
 +    }
 +    ibuf = sort->ibuf;
 +    /* Reorder the global cg index */
 +    order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
 +    /* Reorder the cginfo */
 +    order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
 +    /* Rebuild the local cg index */
 +    if (dd->comm->bCGs)
 +    {
 +        ibuf[0] = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
 +            ibuf[i+1] = ibuf[i] + cgsize;
 +        }
 +        for (i = 0; i < dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = ibuf[i];
 +        }
 +    }
 +    else
 +    {
 +        for (i = 0; i < dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = i;
 +        }
 +    }
 +    /* Set the home atom number */
 +    dd->nat_home = dd->cgindex[dd->ncg_home];
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        /* The atoms are now exactly in grid order, update the grid order */
 +        nbnxn_set_atomorder(fr->nbv->nbs);
 +    }
 +    else
 +    {
 +        /* Copy the sorted ns cell indices back to the ns grid struct */
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            fr->ns.grid->cell_index[i] = cgsort[i].nsc;
 +        }
 +        fr->ns.grid->nr = dd->ncg_home;
 +    }
 +}
 +
 +static void add_dd_statistics(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +
 +    comm = dd->comm;
 +
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] +=
 +            comm->nat[ddnat] - comm->nat[ddnat-1];
 +    }
 +    comm->ndecomp++;
 +}
 +
 +void reset_dd_statistics_counters(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +
 +    comm = dd->comm;
 +
 +    /* Reset all the statistics and counters for total run counting */
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] = 0;
 +    }
 +    comm->ndecomp   = 0;
 +    comm->nload     = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf = 0;
 +    comm->load_pme = 0;
 +}
 +
 +void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +    double             av;
 +
 +    comm = cr->dd->comm;
 +
 +    gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
 +
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
 +        switch (ddnat)
 +        {
 +            case ddnatZONE:
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for force:  %d x %.1f\n",
 +                        2, av);
 +                break;
 +            case ddnatVSITE:
 +                if (cr->dd->vsite_comm)
 +                {
 +                    fprintf(fplog,
 +                            " av. #atoms communicated per step for vsites: %d x %.1f\n",
 +                            (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
 +                            av);
 +                }
 +                break;
 +            case ddnatCON:
 +                if (cr->dd->constraint_comm)
 +                {
 +                    fprintf(fplog,
 +                            " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
 +                            1 + ir->nLincsIter, av);
 +                }
 +                break;
 +            default:
 +                gmx_incons(" Unknown type for DD statistics");
 +        }
 +    }
 +    fprintf(fplog, "\n");
 +
 +    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
 +    {
 +        print_dd_load_av(fplog, cr->dd);
 +    }
 +}
 +
 +void dd_partition_system(FILE                *fplog,
 +                         gmx_int64_t          step,
 +                         t_commrec           *cr,
 +                         gmx_bool             bMasterState,
 +                         int                  nstglobalcomm,
 +                         t_state             *state_global,
 +                         gmx_mtop_t          *top_global,
 +                         t_inputrec          *ir,
 +                         t_state             *state_local,
 +                         rvec               **f,
 +                         t_mdatoms           *mdatoms,
 +                         gmx_localtop_t      *top_local,
 +                         t_forcerec          *fr,
 +                         gmx_vsite_t         *vsite,
 +                         gmx_shellfc_t        shellfc,
 +                         gmx_constr_t         constr,
 +                         t_nrnb              *nrnb,
 +                         gmx_wallcycle_t      wcycle,
 +                         gmx_bool             bVerbose)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    gmx_ddbox_t        ddbox = {0};
 +    t_block           *cgs_gl;
 +    gmx_int64_t        step_pcoupl;
 +    rvec               cell_ns_x0, cell_ns_x1;
 +    int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
 +    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
 +    gmx_bool           bRedist, bSortCG, bResortAll;
 +    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
 +    real               grid_density;
 +    char               sbuf[22];
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    bBoxChanged = (bMasterState || DEFORM(*ir));
 +    if (ir->epc != epcNO)
 +    {
 +        /* With nstpcouple > 1 pressure coupling happens.
 +         * one step after calculating the pressure.
 +         * Box scaling happens at the end of the MD step,
 +         * after the DD partitioning.
 +         * We therefore have to do DLB in the first partitioning
 +         * after an MD step where P-coupling occured.
 +         * We need to determine the last step in which p-coupling occurred.
 +         * MRS -- need to validate this for vv?
 +         */
 +        n = ir->nstpcouple;
 +        if (n == 1)
 +        {
 +            step_pcoupl = step - 1;
 +        }
 +        else
 +        {
 +            step_pcoupl = ((step - 1)/n)*n + 1;
 +        }
 +        if (step_pcoupl >= comm->partition_step)
 +        {
 +            bBoxChanged = TRUE;
 +        }
 +    }
 +
 +    bNStGlobalComm = (step % nstglobalcomm == 0);
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        bDoDLB = FALSE;
 +    }
 +    else
 +    {
 +        /* Should we do dynamic load balacing this step?
 +         * Since it requires (possibly expensive) global communication,
 +         * we might want to do DLB less frequently.
 +         */
 +        if (bBoxChanged || ir->epc != epcNO)
 +        {
 +            bDoDLB = bBoxChanged;
 +        }
 +        else
 +        {
 +            bDoDLB = bNStGlobalComm;
 +        }
 +    }
 +
 +    /* Check if we have recorded loads on the nodes */
 +    if (comm->bRecordLoad && dd_load_count(comm))
 +    {
 +        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
 +        {
 +            /* Check if we should use DLB at the second partitioning
 +             * and every 100 partitionings,
 +             * so the extra communication cost is negligible.
 +             */
 +            n         = max(100, nstglobalcomm);
 +            bCheckDLB = (comm->n_load_collect == 0 ||
 +                         comm->n_load_have % n == n-1);
 +        }
 +        else
 +        {
 +            bCheckDLB = FALSE;
 +        }
 +
 +        /* Print load every nstlog, first and last step to the log file */
 +        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
 +                    comm->n_load_collect == 0 ||
 +                    (ir->nsteps >= 0 &&
 +                     (step + ir->nstlist > ir->init_step + ir->nsteps)));
 +
 +        /* Avoid extra communication due to verbose screen output
 +         * when nstglobalcomm is set.
 +         */
 +        if (bDoDLB || bLogLoad || bCheckDLB ||
 +            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
 +        {
 +            get_load_distribution(dd, wcycle);
 +            if (DDMASTER(dd))
 +            {
 +                if (bLogLoad)
 +                {
 +                    dd_print_load(fplog, dd, step-1);
 +                }
 +                if (bVerbose)
 +                {
 +                    dd_print_load_verbose(dd);
 +                }
 +            }
 +            comm->n_load_collect++;
 +
 +            if (bCheckDLB)
 +            {
 +                /* Since the timings are node dependent, the master decides */
 +                if (DDMASTER(dd))
 +                {
 +                    bTurnOnDLB =
 +                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
 +                    if (debug)
 +                    {
 +                        fprintf(debug, "step %s, imb loss %f\n",
 +                                gmx_step_str(step, sbuf),
 +                                dd_force_imb_perf_loss(dd));
 +                    }
 +                }
 +                dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
 +                if (bTurnOnDLB)
 +                {
 +                    turn_on_dlb(fplog, cr, step);
 +                    bDoDLB = TRUE;
 +                }
 +            }
 +        }
 +        comm->n_load_have++;
 +    }
 +
 +    cgs_gl = &comm->cgs_gl;
 +
 +    bRedist = FALSE;
 +    if (bMasterState)
 +    {
 +        /* Clear the old state */
 +        clear_dd_indices(dd, 0, 0);
 +        ncgindex_set = 0;
 +
 +        set_ddbox(dd, bMasterState, cr, ir, state_global->box,
 +                  TRUE, cgs_gl, state_global->x, &ddbox);
 +
 +        get_cg_distribution(fplog, step, dd, cgs_gl,
 +                            state_global->box, &ddbox, state_global->x);
 +
 +        dd_distribute_state(dd, cgs_gl,
 +                            state_global, state_local, f);
 +
 +        dd_make_local_cgs(dd, &top_local->cgs);
 +
 +        /* Ensure that we have space for the new distribution */
 +        dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            calc_cgcm(fplog, 0, dd->ncg_home,
 +                      &top_local->cgs, state_local->x, fr->cg_cm);
 +        }
 +
 +        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
 +    }
 +    else if (state_local->ddp_count != dd->ddp_count)
 +    {
 +        if (state_local->ddp_count > dd->ddp_count)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
 +        }
 +
 +        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
 +        }
 +
 +        /* Clear the old state */
 +        clear_dd_indices(dd, 0, 0);
 +
 +        /* Build the new indices */
 +        rebuild_cgindex(dd, cgs_gl->index, state_local);
 +        make_dd_indices(dd, cgs_gl->index, 0);
 +        ncgindex_set = dd->ncg_home;
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            /* Redetermine the cg COMs */
 +            calc_cgcm(fplog, 0, dd->ncg_home,
 +                      &top_local->cgs, state_local->x, fr->cg_cm);
 +        }
 +
 +        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
 +
 +        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
 +                  TRUE, &top_local->cgs, state_local->x, &ddbox);
 +
 +        bRedist = comm->bDynLoadBal;
 +    }
 +    else
 +    {
 +        /* We have the full state, only redistribute the cgs */
 +
 +        /* Clear the non-home indices */
 +        clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
 +        ncgindex_set = 0;
 +
 +        /* Avoid global communication for dim's without pbc and -gcom */
 +        if (!bNStGlobalComm)
 +        {
 +            copy_rvec(comm->box0, ddbox.box0    );
 +            copy_rvec(comm->box_size, ddbox.box_size);
 +        }
 +        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
 +                  bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
 +
 +        bBoxChanged = TRUE;
 +        bRedist     = TRUE;
 +    }
 +    /* For dim's without pbc and -gcom */
 +    copy_rvec(ddbox.box0, comm->box0    );
 +    copy_rvec(ddbox.box_size, comm->box_size);
 +
 +    set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
 +                      step, wcycle);
 +
 +    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
 +    {
 +        write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
 +    }
 +
 +    /* Check if we should sort the charge groups */
 +    if (comm->nstSortCG > 0)
 +    {
 +        bSortCG = (bMasterState ||
 +                   (bRedist && (step % comm->nstSortCG == 0)));
 +    }
 +    else
 +    {
 +        bSortCG = FALSE;
 +    }
 +
 +    ncg_home_old = dd->ncg_home;
 +
 +    ncg_moved = 0;
 +    if (bRedist)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsDD_REDIST);
 +
 +        dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
 +                           state_local, f, fr,
 +                           !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
 +
 +        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
 +    }
 +
 +    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
 +                          dd, &ddbox,
 +                          &comm->cell_x0, &comm->cell_x1,
 +                          dd->ncg_home, fr->cg_cm,
 +                          cell_ns_x0, cell_ns_x1, &grid_density);
 +
 +    if (bBoxChanged)
 +    {
 +        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
 +    }
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            copy_ivec(fr->ns.grid->n, ncells_old);
 +            grid_first(fplog, fr->ns.grid, dd, &ddbox,
 +                       state_local->box, cell_ns_x0, cell_ns_x1,
 +                       fr->rlistlong, grid_density);
 +            break;
 +        case ecutsVERLET:
 +            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +    }
 +    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
 +    copy_ivec(ddbox.tric_dir, comm->tric_dir);
 +
 +    if (bSortCG)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsDD_GRID);
 +
 +        /* Sort the state on charge group position.
 +         * This enables exact restarts from this step.
 +         * It also improves performance by about 15% with larger numbers
 +         * of atoms per node.
 +         */
 +
 +        /* Fill the ns grid with the home cell,
 +         * so we can sort with the indices.
 +         */
 +        set_zones_ncg_home(dd);
 +
 +        switch (fr->cutoff_scheme)
 +        {
 +            case ecutsVERLET:
 +                set_zones_size(dd, state_local->box, &ddbox, 0, 1);
 +
 +                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
 +                                  0,
 +                                  comm->zones.size[0].bb_x0,
 +                                  comm->zones.size[0].bb_x1,
 +                                  0, dd->ncg_home,
 +                                  comm->zones.dens_zone0,
 +                                  fr->cginfo,
 +                                  state_local->x,
 +                                  ncg_moved, bRedist ? comm->moved : NULL,
 +                                  fr->nbv->grp[eintLocal].kernel_type,
 +                                  fr->nbv->grp[eintLocal].nbat);
 +
 +                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
 +                break;
 +            case ecutsGROUP:
 +                fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
 +                          0, dd->ncg_home, fr->cg_cm);
 +
 +                copy_ivec(fr->ns.grid->n, ncells_new);
 +                break;
 +            default:
 +                gmx_incons("unimplemented");
 +        }
 +
 +        bResortAll = bMasterState;
 +
 +        /* Check if we can user the old order and ns grid cell indices
 +         * of the charge groups to sort the charge groups efficiently.
 +         */
 +        if (ncells_new[XX] != ncells_old[XX] ||
 +            ncells_new[YY] != ncells_old[YY] ||
 +            ncells_new[ZZ] != ncells_old[ZZ])
 +        {
 +            bResortAll = TRUE;
 +        }
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
 +                    gmx_step_str(step, sbuf), dd->ncg_home);
 +        }
 +        dd_sort_state(dd, fr->cg_cm, fr, state_local,
 +                      bResortAll ? -1 : ncg_home_old);
 +        /* Rebuild all the indices */
 +        ga2la_clear(dd->ga2la);
 +        ncgindex_set = 0;
 +
 +        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
 +    }
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
 +
 +    /* Setup up the communication and communicate the coordinates */
 +    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
 +
 +    /* Set the indices */
 +    make_dd_indices(dd, cgs_gl->index, ncgindex_set);
 +
 +    /* Set the charge group boundaries for neighbor searching */
 +    set_cg_boundaries(&comm->zones);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        set_zones_size(dd, state_local->box, &ddbox,
 +                       bSortCG ? 1 : 0, comm->zones.n);
 +    }
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
 +
 +    /*
 +       write_dd_pdb("dd_home",step,"dump",top_global,cr,
 +                 -1,state_local->x,state_local->box);
 +     */
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
 +
 +    /* Extract a local topology from the global topology */
 +    for (i = 0; i < dd->ndim; i++)
 +    {
 +        np[dd->dim[i]] = comm->cd[i].np;
 +    }
 +    dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
 +                      comm->cellsize_min, np,
 +                      fr,
 +                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
 +                      vsite, top_global, top_local);
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
 +
 +    /* Set up the special atom communication */
 +    n = comm->nat[ddnatZONE];
 +    for (i = ddnatZONE+1; i < ddnatNR; i++)
 +    {
 +        switch (i)
 +        {
 +            case ddnatVSITE:
 +                if (vsite && vsite->n_intercg_vsite)
 +                {
 +                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
 +                }
 +                break;
 +            case ddnatCON:
 +                if (dd->bInterCGcons || dd->bInterCGsettles)
 +                {
 +                    /* Only for inter-cg constraints we need special code */
 +                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
 +                                                  constr, ir->nProjOrder,
 +                                                  top_local->idef.il);
 +                }
 +                break;
 +            default:
 +                gmx_incons("Unknown special atom type setup");
 +        }
 +        comm->nat[i] = n;
 +    }
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
 +
 +    /* Make space for the extra coordinates for virtual site
 +     * or constraint communication.
 +     */
 +    state_local->natoms = comm->nat[ddnatNR-1];
 +    if (state_local->natoms > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local, f, state_local->natoms);
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        if (vsite && vsite->n_intercg_vsite)
 +        {
 +            nat_f_novirsum = comm->nat[ddnatVSITE];
 +        }
 +        else
 +        {
 +            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
 +            {
 +                nat_f_novirsum = dd->nat_tot;
 +            }
 +            else
 +            {
 +                nat_f_novirsum = dd->nat_home;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nat_f_novirsum = 0;
 +    }
 +
 +    /* Set the number of atoms required for the force calculation.
 +     * Forces need to be constrained when using a twin-range setup
 +     * or with energy minimization. For simple simulations we could
 +     * avoid some allocation, zeroing and copying, but this is
 +     * probably not worth the complications ande checking.
 +     */
 +    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
 +                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
 +
 +    /* We make the all mdatoms up to nat_tot_con.
 +     * We could save some work by only setting invmass
 +     * between nat_tot and nat_tot_con.
 +     */
 +    /* This call also sets the new number of home particles to dd->nat_home */
 +    atoms2md(top_global, ir,
 +             comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);
 +
 +    /* Now we have the charges we can sort the FE interactions */
 +    dd_sort_local_top(dd, mdatoms, top_local);
 +
 +    if (vsite != NULL)
 +    {
 +        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
 +        split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
 +    }
 +
 +    if (shellfc)
 +    {
 +        /* Make the local shell stuff, currently no communication is done */
 +        make_local_shells(cr, mdatoms, shellfc);
 +    }
 +
 +    if (ir->implicit_solvent)
 +    {
 +        make_local_gb(cr, fr->born, ir->gb_algorithm);
 +    }
 +
 +    setup_bonded_threading(fr, &top_local->idef);
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Send the charges and/or c6/sigmas to our PME only node */
 +        gmx_pme_send_parameters(cr, mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
 +                                mdatoms->chargeA, mdatoms->chargeB,
 +                                mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
 +                                mdatoms->sigmaA, mdatoms->sigmaB,
 +                                dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
 +    }
 +
 +    if (constr)
 +    {
 +        set_constraints(constr, top_local, ir, mdatoms, cr);
 +    }
 +
 +    if (ir->ePull != epullNO)
 +    {
 +        /* Update the local pull groups */
 +        dd_make_local_pull_groups(dd, ir->pull, mdatoms);
 +    }
 +
 +    if (ir->bRot)
 +    {
 +        /* Update the local rotation groups */
 +        dd_make_local_rotation_groups(dd, ir->rot);
 +    }
 +
 +    if (ir->eSwapCoords != eswapNO)
 +    {
 +        /* Update the local groups needed for ion swapping */
 +        dd_make_local_swap_groups(dd, ir->swap);
 +    }
 +
 +    /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
 +    dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
 +
 +    add_dd_statistics(dd);
 +
 +    /* Make sure we only count the cycles for this DD partitioning */
 +    clear_dd_cycle_counts(dd);
 +
 +    /* Because the order of the atoms might have changed since
 +     * the last vsite construction, we need to communicate the constructing
 +     * atom coordinates again (for spreading the forces this MD step).
 +     */
 +    dd_move_x_vsites(dd, state_local->box, state_local->x);
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
 +
 +    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
 +    {
 +        dd_move_x(dd, state_local->box, state_local->x);
 +        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
 +                     -1, state_local->x, state_local->box);
 +    }
 +
 +    /* Store the partitioning step */
 +    comm->partition_step = step;
 +
 +    /* Increase the DD partitioning counter */
 +    dd->ddp_count++;
 +    /* The state currently matches this DD partitioning count, store it */
 +    state_local->ddp_count = dd->ddp_count;
 +    if (bMasterState)
 +    {
 +        /* The DD master node knows the complete cg distribution,
 +         * store the count so we can possibly skip the cg info communication.
 +         */
 +        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
 +    }
 +
 +    if (comm->DD_debug > 0)
 +    {
 +        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
 +        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
 +                                "after partitioning");
 +    }
 +}
index d8001ea9dabaaa52c91f565e5daecc8c73a5628e,0000000000000000000000000000000000000000..5da2d3a58ee71a39beec0880a9a561e06af0348a
mode 100644,000000..100644
--- /dev/null
@@@ -1,784 -1,0 +1,789 @@@
-         /* Without PBC there are no cell size limits with 2 cells */
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 2008,2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <assert.h>
 +#include "domdec.h"
 +#include "types/commrec.h"
 +#include "network.h"
 +#include "perf_est.h"
 +#include "physics.h"
 +#include "gromacs/utility/smalloc.h"
 +#include "typedefs.h"
 +#include "vec.h"
 +#include "names.h"
 +
 +/* Margin for setting up the DD grid */
 +#define DD_GRID_MARGIN_PRES_SCALE 1.05
 +
 +static int factorize(int n, int **fac, int **mfac)
 +{
 +    int d, ndiv;
 +
 +    if (n <= 0)
 +    {
 +        gmx_fatal(FARGS, "Can only factorize positive integers.");
 +    }
 +
 +    /* Decompose n in factors */
 +    snew(*fac, n/2);
 +    snew(*mfac, n/2);
 +    d    = 2;
 +    ndiv = 0;
 +    while (n > 1)
 +    {
 +        while (n % d == 0)
 +        {
 +            if (ndiv == 0 || (*fac)[ndiv-1] != d)
 +            {
 +                ndiv++;
 +                (*fac)[ndiv-1] = d;
 +            }
 +            (*mfac)[ndiv-1]++;
 +            n /= d;
 +        }
 +        d++;
 +    }
 +
 +    return ndiv;
 +}
 +
 +static gmx_bool largest_divisor(int n)
 +{
 +    int ndiv, *div, *mdiv, ldiv;
 +
 +    ndiv = factorize(n, &div, &mdiv);
 +    ldiv = div[ndiv-1];
 +    sfree(div);
 +    sfree(mdiv);
 +
 +    return ldiv;
 +}
 +
 +static int lcd(int n1, int n2)
 +{
 +    int d, i;
 +
 +    d = 1;
 +    for (i = 2; (i <= n1 && i <= n2); i++)
 +    {
 +        if (n1 % i == 0 && n2 % i == 0)
 +        {
 +            d = i;
 +        }
 +    }
 +
 +    return d;
 +}
 +
 +static gmx_bool fits_pme_ratio(int nnodes, int npme, float ratio)
 +{
 +    return ((double)npme/(double)nnodes > 0.95*ratio);
 +}
 +
 +static gmx_bool fits_pp_pme_perf(int nnodes, int npme, float ratio)
 +{
 +    int ndiv, *div, *mdiv, ldiv;
 +    int npp_root3, npme_root2;
 +
 +    ndiv = factorize(nnodes-npme, &div, &mdiv);
 +    ldiv = div[ndiv-1];
 +    sfree(div);
 +    sfree(mdiv);
 +
 +    npp_root3  = (int)(pow(nnodes-npme, 1.0/3.0) + 0.5);
 +    npme_root2 = (int)(sqrt(npme) + 0.5);
 +
 +    /* The check below gives a reasonable division:
 +     * factor 5 allowed at 5 or more PP nodes,
 +     * factor 7 allowed at 49 or more PP nodes.
 +     */
 +    if (ldiv > 3 + npp_root3)
 +    {
 +        return FALSE;
 +    }
 +
 +    /* Check if the number of PP and PME nodes have a reasonable sized
 +     * denominator in common, such that we can use 2D PME decomposition
 +     * when required (which requires nx_pp == nx_pme).
 +     * The factor of 2 allows for a maximum ratio of 2^2=4
 +     * between nx_pme and ny_pme.
 +     */
 +    if (lcd(nnodes-npme, npme)*2 < npme_root2)
 +    {
 +        return FALSE;
 +    }
 +
 +    /* Does this division gives a reasonable PME load? */
 +    return fits_pme_ratio(nnodes, npme, ratio);
 +}
 +
 +static int guess_npme(FILE *fplog, gmx_mtop_t *mtop, t_inputrec *ir, matrix box,
 +                      int nnodes)
 +{
 +    float      ratio;
 +    int        npme, nkx, nky;
 +    t_inputrec ir_try;
 +
 +    ratio = pme_load_estimate(mtop, ir, box);
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Guess for relative PME load: %.2f\n", ratio);
 +    }
 +
 +    /* We assume the optimal node ratio is close to the load ratio.
 +     * The communication load is neglected,
 +     * but (hopefully) this will balance out between PP and PME.
 +     */
 +
 +    if (!fits_pme_ratio(nnodes, nnodes/2, ratio))
 +    {
 +        /* We would need more than nnodes/2 PME only nodes,
 +         * which is not possible. Since the PME load is very high,
 +         * we will not loose much performance when all nodes do PME.
 +         */
 +
 +        return 0;
 +    }
 +
 +    /* First try to find npme as a factor of nnodes up to nnodes/3.
 +     * We start with a minimum PME node fraction of 1/16
 +     * and avoid ratios which lead to large prime factors in nnodes-npme.
 +     */
 +    npme = (nnodes + 15)/16;
 +    while (npme <= nnodes/3)
 +    {
 +        if (nnodes % npme == 0)
 +        {
 +            /* Note that fits_perf might change the PME grid,
 +             * in the current implementation it does not.
 +             */
 +            if (fits_pp_pme_perf(nnodes, npme, ratio))
 +            {
 +                break;
 +            }
 +        }
 +        npme++;
 +    }
 +    if (npme > nnodes/3)
 +    {
 +        /* Try any possible number for npme */
 +        npme = 1;
 +        while (npme <= nnodes/2)
 +        {
 +            /* Note that fits_perf may change the PME grid */
 +            if (fits_pp_pme_perf(nnodes, npme, ratio))
 +            {
 +                break;
 +            }
 +            npme++;
 +        }
 +    }
 +    if (npme > nnodes/2)
 +    {
 +        gmx_fatal(FARGS, "Could not find an appropriate number of separate PME nodes. i.e. >= %5f*#nodes (%d) and <= #nodes/2 (%d) and reasonable performance wise (grid_x=%d, grid_y=%d).\n"
 +                  "Use the -npme option of mdrun or change the number of processors or the PME grid dimensions, see the manual for details.",
 +                  ratio, (int)(0.95*ratio*nnodes+0.5), nnodes/2, ir->nkx, ir->nky);
 +        /* Keep the compiler happy */
 +        npme = 0;
 +    }
 +    else
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Will use %d particle-particle and %d PME only nodes\n"
 +                    "This is a guess, check the performance at the end of the log file\n",
 +                    nnodes-npme, npme);
 +        }
 +        fprintf(stderr, "\n"
 +                "Will use %d particle-particle and %d PME only nodes\n"
 +                "This is a guess, check the performance at the end of the log file\n",
 +                nnodes-npme, npme);
 +    }
 +
 +    return npme;
 +}
 +
 +static int div_up(int n, int f)
 +{
 +    return (n + f - 1)/f;
 +}
 +
 +real comm_box_frac(ivec dd_nc, real cutoff, gmx_ddbox_t *ddbox)
 +{
 +    int  i, j, k, npp;
 +    rvec bt, nw;
 +    real comm_vol;
 +
 +    for (i = 0; i < DIM; i++)
 +    {
 +        bt[i] = ddbox->box_size[i]*ddbox->skew_fac[i];
 +        nw[i] = dd_nc[i]*cutoff/bt[i];
 +    }
 +
 +    npp      = 1;
 +    comm_vol = 0;
 +    for (i = 0; i < DIM; i++)
 +    {
 +        if (dd_nc[i] > 1)
 +        {
 +            npp      *= dd_nc[i];
 +            comm_vol += nw[i];
 +            for (j = i+1; j < DIM; j++)
 +            {
 +                if (dd_nc[j] > 1)
 +                {
 +                    comm_vol += nw[i]*nw[j]*M_PI/4;
 +                    for (k = j+1; k < DIM; k++)
 +                    {
 +                        if (dd_nc[k] > 1)
 +                        {
 +                            comm_vol += nw[i]*nw[j]*nw[k]*M_PI/6;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    return comm_vol;
 +}
 +
 +static gmx_bool inhomogeneous_z(const t_inputrec *ir)
 +{
 +    return ((EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) &&
 +            ir->ePBC == epbcXYZ && ir->ewald_geometry == eewg3DC);
 +}
 +
 +/* Avoid integer overflows */
 +static float comm_pme_cost_vol(int npme, int a, int b, int c)
 +{
 +    float comm_vol;
 +
 +    comm_vol  = npme - 1;
 +    comm_vol *= npme;
 +    comm_vol *= div_up(a, npme);
 +    comm_vol *= div_up(b, npme);
 +    comm_vol *= c;
 +    return comm_vol;
 +}
 +
 +static float comm_cost_est(real limit, real cutoff,
 +                           matrix box, gmx_ddbox_t *ddbox,
 +                           int natoms, t_inputrec *ir,
 +                           float pbcdxr,
 +                           int npme_tot, ivec nc)
 +{
 +    ivec  npme = {1, 1, 1};
 +    int   i, j, k, nk, overlap;
 +    rvec  bt;
 +    float comm_vol, comm_vol_xf, comm_pme, cost_pbcdx;
 +    /* This is the cost of a pbc_dx call relative to the cost
 +     * of communicating the coordinate and force of an atom.
 +     * This will be machine dependent.
 +     * These factors are for x86 with SMP or Infiniband.
 +     */
 +    float pbcdx_rect_fac = 0.1;
 +    float pbcdx_tric_fac = 0.2;
 +    float temp;
 +
 +    /* Check the DD algorithm restrictions */
 +    if ((ir->ePBC == epbcXY && ir->nwall < 2 && nc[ZZ] > 1) ||
 +        (ir->ePBC == epbcSCREW && (nc[XX] == 1 || nc[YY] > 1 || nc[ZZ] > 1)))
 +    {
 +        return -1;
 +    }
 +
 +    if (inhomogeneous_z(ir) && nc[ZZ] > 1)
 +    {
 +        return -1;
 +    }
 +
 +    assert(ddbox->npbcdim <= DIM);
 +
 +    /* Check if the triclinic requirements are met */
 +    for (i = 0; i < DIM; i++)
 +    {
 +        for (j = i+1; j < ddbox->npbcdim; j++)
 +        {
 +            if (box[j][i] != 0 || ir->deform[j][i] != 0 ||
 +                (ir->epc != epcNO && ir->compress[j][i] != 0))
 +            {
 +                if (nc[j] > 1 && nc[i] == 1)
 +                {
 +                    return -1;
 +                }
 +            }
 +        }
 +    }
 +
 +    for (i = 0; i < DIM; i++)
 +    {
 +        bt[i] = ddbox->box_size[i]*ddbox->skew_fac[i];
 +
++        /* Without PBC and with 2 cells, there are no lower limits on the cell size */
 +        if (!(i >= ddbox->npbcdim && nc[i] <= 2) && bt[i] < nc[i]*limit)
 +        {
 +            return -1;
 +        }
++        /* With PBC, check if the cut-off fits in nc[i]-1 cells */
++        if (i < ddbox->npbcdim && nc[i] > 1 && (nc[i] - 1)*bt[i] < nc[i]*cutoff)
++        {
++            return -1;
++        }
 +    }
 +
 +    if (npme_tot > 1)
 +    {
 +        /* The following choices should match those
 +         * in init_domain_decomposition in domdec.c.
 +         */
 +        if (nc[XX] == 1 && nc[YY] > 1)
 +        {
 +            npme[XX] = 1;
 +            npme[YY] = npme_tot;
 +        }
 +        else if (nc[YY] == 1)
 +        {
 +            npme[XX] = npme_tot;
 +            npme[YY] = 1;
 +        }
 +        else
 +        {
 +            /* Will we use 1D or 2D PME decomposition? */
 +            npme[XX] = (npme_tot % nc[XX] == 0) ? nc[XX] : npme_tot;
 +            npme[YY] = npme_tot/npme[XX];
 +        }
 +    }
 +
 +    /* When two dimensions are (nearly) equal, use more cells
 +     * for the smallest index, so the decomposition does not
 +     * depend sensitively on the rounding of the box elements.
 +     */
 +    for (i = 0; i < DIM; i++)
 +    {
 +        for (j = i+1; j < DIM; j++)
 +        {
 +            /* Check if the box size is nearly identical,
 +             * in that case we prefer nx > ny  and ny > nz.
 +             */
 +            if (fabs(bt[j] - bt[i]) < 0.01*bt[i] && nc[j] > nc[i])
 +            {
 +                /* The XX/YY check is a bit compact. If nc[YY]==npme[YY]
 +                 * this means the swapped nc has nc[XX]==npme[XX],
 +                 * and we can also swap X and Y for PME.
 +                 */
 +                /* Check if dimension i and j are equivalent for PME.
 +                 * For x/y: if nc[YY]!=npme[YY], we can not swap x/y
 +                 * For y/z: we can not have PME decomposition in z
 +                 */
 +                if (npme_tot <= 1 ||
 +                    !((i == XX && j == YY && nc[YY] != npme[YY]) ||
 +                      (i == YY && j == ZZ && npme[YY] > 1)))
 +                {
 +                    return -1;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* This function determines only half of the communication cost.
 +     * All PP, PME and PP-PME communication is symmetric
 +     * and the "back"-communication cost is identical to the forward cost.
 +     */
 +
 +    comm_vol = comm_box_frac(nc, cutoff, ddbox);
 +
 +    comm_pme = 0;
 +    for (i = 0; i < 2; i++)
 +    {
 +        /* Determine the largest volume for PME x/f redistribution */
 +        if (nc[i] % npme[i] != 0)
 +        {
 +            if (nc[i] > npme[i])
 +            {
 +                comm_vol_xf = (npme[i] == 2 ? 1.0/3.0 : 0.5);
 +            }
 +            else
 +            {
 +                comm_vol_xf = 1.0 - lcd(nc[i], npme[i])/(double)npme[i];
 +            }
 +            comm_pme += 3*natoms*comm_vol_xf;
 +        }
 +
 +        /* Grid overlap communication */
 +        if (npme[i] > 1)
 +        {
 +            nk        = (i == 0 ? ir->nkx : ir->nky);
 +            overlap   = (nk % npme[i] == 0 ? ir->pme_order-1 : ir->pme_order);
 +            temp      = npme[i];
 +            temp     *= overlap;
 +            temp     *= ir->nkx;
 +            temp     *= ir->nky;
 +            temp     *= ir->nkz;
 +            temp     /= nk;
 +            comm_pme += temp;
 +/* Old line comm_pme += npme[i]*overlap*ir->nkx*ir->nky*ir->nkz/nk; */
 +        }
 +    }
 +
 +    /* PME FFT communication volume.
 +     * This only takes the communication into account and not imbalance
 +     * in the calculation. But the imbalance in communication and calculation
 +     * are similar and therefore these formulas also prefer load balance
 +     * in the FFT and pme_solve calculation.
 +     */
 +    comm_pme += comm_pme_cost_vol(npme[YY], ir->nky, ir->nkz, ir->nkx);
 +    comm_pme += comm_pme_cost_vol(npme[XX], ir->nkx, ir->nky, ir->nkz);
 +
 +    /* Add cost of pbc_dx for bondeds */
 +    cost_pbcdx = 0;
 +    if ((nc[XX] == 1 || nc[YY] == 1) || (nc[ZZ] == 1 && ir->ePBC != epbcXY))
 +    {
 +        if ((ddbox->tric_dir[XX] && nc[XX] == 1) ||
 +            (ddbox->tric_dir[YY] && nc[YY] == 1))
 +        {
 +            cost_pbcdx = pbcdxr*pbcdx_tric_fac;
 +        }
 +        else
 +        {
 +            cost_pbcdx = pbcdxr*pbcdx_rect_fac;
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "nc %2d %2d %2d %2d %2d vol pp %6.4f pbcdx %6.4f pme %9.3e tot %9.3e\n",
 +                nc[XX], nc[YY], nc[ZZ], npme[XX], npme[YY],
 +                comm_vol, cost_pbcdx, comm_pme,
 +                3*natoms*(comm_vol + cost_pbcdx) + comm_pme);
 +    }
 +
 +    return 3*natoms*(comm_vol + cost_pbcdx) + comm_pme;
 +}
 +
 +static void assign_factors(gmx_domdec_t *dd,
 +                           real limit, real cutoff,
 +                           matrix box, gmx_ddbox_t *ddbox,
 +                           int natoms, t_inputrec *ir,
 +                           float pbcdxr, int npme,
 +                           int ndiv, int *div, int *mdiv, ivec ir_try, ivec opt)
 +{
 +    int   x, y, z, i;
 +    float ce;
 +
 +    if (ndiv == 0)
 +    {
 +        ce = comm_cost_est(limit, cutoff, box, ddbox,
 +                           natoms, ir, pbcdxr, npme, ir_try);
 +        if (ce >= 0 && (opt[XX] == 0 ||
 +                        ce < comm_cost_est(limit, cutoff, box, ddbox,
 +                                           natoms, ir, pbcdxr,
 +                                           npme, opt)))
 +        {
 +            copy_ivec(ir_try, opt);
 +        }
 +
 +        return;
 +    }
 +
 +    for (x = mdiv[0]; x >= 0; x--)
 +    {
 +        for (i = 0; i < x; i++)
 +        {
 +            ir_try[XX] *= div[0];
 +        }
 +        for (y = mdiv[0]-x; y >= 0; y--)
 +        {
 +            for (i = 0; i < y; i++)
 +            {
 +                ir_try[YY] *= div[0];
 +            }
 +            for (i = 0; i < mdiv[0]-x-y; i++)
 +            {
 +                ir_try[ZZ] *= div[0];
 +            }
 +
 +            /* recurse */
 +            assign_factors(dd, limit, cutoff, box, ddbox, natoms, ir, pbcdxr, npme,
 +                           ndiv-1, div+1, mdiv+1, ir_try, opt);
 +
 +            for (i = 0; i < mdiv[0]-x-y; i++)
 +            {
 +                ir_try[ZZ] /= div[0];
 +            }
 +            for (i = 0; i < y; i++)
 +            {
 +                ir_try[YY] /= div[0];
 +            }
 +        }
 +        for (i = 0; i < x; i++)
 +        {
 +            ir_try[XX] /= div[0];
 +        }
 +    }
 +}
 +
 +static real optimize_ncells(FILE *fplog,
 +                            int nnodes_tot, int npme_only,
 +                            gmx_bool bDynLoadBal, real dlb_scale,
 +                            gmx_mtop_t *mtop, matrix box, gmx_ddbox_t *ddbox,
 +                            t_inputrec *ir,
 +                            gmx_domdec_t *dd,
 +                            real cellsize_limit, real cutoff,
 +                            gmx_bool bInterCGBondeds,
 +                            ivec nc)
 +{
 +    int      npp, npme, ndiv, *div, *mdiv, d, nmax;
 +    gmx_bool bExcl_pbcdx;
 +    float    pbcdxr;
 +    real     limit;
 +    ivec     itry;
 +
 +    limit  = cellsize_limit;
 +
 +    dd->nc[XX] = 1;
 +    dd->nc[YY] = 1;
 +    dd->nc[ZZ] = 1;
 +
 +    npp = nnodes_tot - npme_only;
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        npme = (npme_only > 0 ? npme_only : npp);
 +    }
 +    else
 +    {
 +        npme = 0;
 +    }
 +
 +    if (bInterCGBondeds)
 +    {
 +        /* For Ewald exclusions pbc_dx is not called */
 +        bExcl_pbcdx =
 +            (IR_EXCL_FORCES(*ir) && !EEL_FULL(ir->coulombtype));
 +        pbcdxr = (double)n_bonded_dx(mtop, bExcl_pbcdx)/(double)mtop->natoms;
 +    }
 +    else
 +    {
 +        /* Every molecule is a single charge group: no pbc required */
 +        pbcdxr = 0;
 +    }
 +    /* Add a margin for DLB and/or pressure scaling */
 +    if (bDynLoadBal)
 +    {
 +        if (dlb_scale >= 1.0)
 +        {
 +            gmx_fatal(FARGS, "The value for option -dds should be smaller than 1");
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Scaling the initial minimum size with 1/%g (option -dds) = %g\n", dlb_scale, 1/dlb_scale);
 +        }
 +        limit /= dlb_scale;
 +    }
 +    else if (ir->epc != epcNO)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "To account for pressure scaling, scaling the initial minimum size with %g\n", DD_GRID_MARGIN_PRES_SCALE);
 +            limit *= DD_GRID_MARGIN_PRES_SCALE;
 +        }
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Optimizing the DD grid for %d cells with a minimum initial size of %.3f nm\n", npp, limit);
 +
 +        if (inhomogeneous_z(ir))
 +        {
 +            fprintf(fplog, "Ewald_geometry=%s: assuming inhomogeneous particle distribution in z, will not decompose in z.\n", eewg_names[ir->ewald_geometry]);
 +        }
 +
 +        if (limit > 0)
 +        {
 +            fprintf(fplog, "The maximum allowed number of cells is:");
 +            for (d = 0; d < DIM; d++)
 +            {
 +                nmax = (int)(ddbox->box_size[d]*ddbox->skew_fac[d]/limit);
 +                if (d >= ddbox->npbcdim && nmax < 2)
 +                {
 +                    nmax = 2;
 +                }
 +                if (d == ZZ && inhomogeneous_z(ir))
 +                {
 +                    nmax = 1;
 +                }
 +                fprintf(fplog, " %c %d", 'X' + d, nmax);
 +            }
 +            fprintf(fplog, "\n");
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Average nr of pbc_dx calls per atom %.2f\n", pbcdxr);
 +    }
 +
 +    /* Decompose npp in factors */
 +    ndiv = factorize(npp, &div, &mdiv);
 +
 +    itry[XX] = 1;
 +    itry[YY] = 1;
 +    itry[ZZ] = 1;
 +    clear_ivec(nc);
 +    assign_factors(dd, limit, cutoff, box, ddbox, mtop->natoms, ir, pbcdxr,
 +                   npme, ndiv, div, mdiv, itry, nc);
 +
 +    sfree(div);
 +    sfree(mdiv);
 +
 +    return limit;
 +}
 +
 +real dd_choose_grid(FILE *fplog,
 +                    t_commrec *cr, gmx_domdec_t *dd, t_inputrec *ir,
 +                    gmx_mtop_t *mtop, matrix box, gmx_ddbox_t *ddbox,
 +                    gmx_bool bDynLoadBal, real dlb_scale,
 +                    real cellsize_limit, real cutoff_dd,
 +                    gmx_bool bInterCGBondeds)
 +{
 +    gmx_int64_t     nnodes_div, ldiv;
 +    real            limit;
 +
 +    if (MASTER(cr))
 +    {
 +        nnodes_div = cr->nnodes;
 +        if (EEL_PME(ir->coulombtype))
 +        {
 +            if (cr->npmenodes > 0)
 +            {
 +                if (cr->nnodes <= 2)
 +                {
 +                    gmx_fatal(FARGS,
 +                              "Can not have separate PME nodes with 2 or less nodes");
 +                }
 +                if (cr->npmenodes >= cr->nnodes)
 +                {
 +                    gmx_fatal(FARGS,
 +                              "Can not have %d separate PME nodes with just %d total nodes",
 +                              cr->npmenodes, cr->nnodes);
 +                }
 +
 +                /* If the user purposely selected the number of PME nodes,
 +                 * only check for large primes in the PP node count.
 +                 */
 +                nnodes_div -= cr->npmenodes;
 +            }
 +        }
 +        else
 +        {
 +            cr->npmenodes = 0;
 +        }
 +
 +        if (cr->nnodes > 12)
 +        {
 +            ldiv = largest_divisor(nnodes_div);
 +            /* Check if the largest divisor is more than nnodes^2/3 */
 +            if (ldiv*ldiv*ldiv > nnodes_div*nnodes_div)
 +            {
 +                gmx_fatal(FARGS, "The number of nodes you selected (%d) contains a large prime factor %d. In most cases this will lead to bad performance. Choose a number with smaller prime factors or set the decomposition (option -dd) manually.",
 +                          nnodes_div, ldiv);
 +            }
 +        }
 +
 +        if (EEL_PME(ir->coulombtype))
 +        {
 +            if (cr->npmenodes < 0)
 +            {
 +                /* Use PME nodes when the number of nodes is more than 16 */
 +                if (cr->nnodes <= 18)
 +                {
 +                    cr->npmenodes = 0;
 +                    if (fplog)
 +                    {
 +                        fprintf(fplog, "Using %d separate PME nodes, as there are too few total\n nodes for efficient splitting\n", cr->npmenodes);
 +                    }
 +                }
 +                else
 +                {
 +                    cr->npmenodes = guess_npme(fplog, mtop, ir, box, cr->nnodes);
 +                    if (fplog)
 +                    {
 +                        fprintf(fplog, "Using %d separate PME nodes, as guessed by mdrun\n", cr->npmenodes);
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                if (fplog)
 +                {
 +                    fprintf(fplog, "Using %d separate PME nodes, per user request\n", cr->npmenodes);
 +                }
 +            }
 +        }
 +
 +        limit = optimize_ncells(fplog, cr->nnodes, cr->npmenodes,
 +                                bDynLoadBal, dlb_scale,
 +                                mtop, box, ddbox, ir, dd,
 +                                cellsize_limit, cutoff_dd,
 +                                bInterCGBondeds,
 +                                dd->nc);
 +    }
 +    else
 +    {
 +        limit = 0;
 +    }
 +    /* Communicate the information set by the master to all nodes */
 +    gmx_bcast(sizeof(dd->nc), dd->nc, cr);
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        gmx_bcast(sizeof(ir->nkx), &ir->nkx, cr);
 +        gmx_bcast(sizeof(ir->nky), &ir->nky, cr);
 +        gmx_bcast(sizeof(cr->npmenodes), &cr->npmenodes, cr);
 +    }
 +    else
 +    {
 +        cr->npmenodes = 0;
 +    }
 +
 +    return limit;
 +}
index 6558e2be9cfe51e3347fb4c7b8f180169c34144f,0000000000000000000000000000000000000000..5208983deccab7cc9b8e44c25ab7ba293a59f4cb
mode 100644,000000..100644
--- /dev/null
@@@ -1,2975 -1,0 +1,2978 @@@
-             fprintf(log, "Note: The available nonbonded kernels do not support water optimization - disabling.\n");
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team.
 + * Copyright (c) 2013,2014, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "gromacs/utility/smalloc.h"
 +#include "macros.h"
 +#include "gromacs/math/utilities.h"
 +#include "vec.h"
 +#include "types/commrec.h"
 +#include "network.h"
 +#include "nsgrid.h"
 +#include "force.h"
 +#include "nonbonded.h"
 +#include "ns.h"
 +#include "pbc.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "nrnb.h"
 +#include "txtdump.h"
 +#include "mtop_util.h"
 +
 +#include "domdec.h"
 +#include "adress.h"
 +
 +
 +/*
 + *    E X C L U S I O N   H A N D L I N G
 + */
 +
 +#ifdef DEBUG
 +static void SETEXCL_(t_excl e[], atom_id i, atom_id j)
 +{
 +    e[j] = e[j] | (1<<i);
 +}
 +static void RMEXCL_(t_excl e[], atom_id i, atom_id j)
 +{
 +    e[j] = e[j] & ~(1<<i);
 +}
 +static gmx_bool ISEXCL_(t_excl e[], atom_id i, atom_id j)
 +{
 +    return (gmx_bool)(e[j] & (1<<i));
 +}
 +static gmx_bool NOTEXCL_(t_excl e[], atom_id i, atom_id j)
 +{
 +    return !(ISEXCL(e, i, j));
 +}
 +#else
 +#define SETEXCL(e, i, j) (e)[((atom_id) (j))] |= (1<<((atom_id) (i)))
 +#define RMEXCL(e, i, j)  (e)[((atom_id) (j))] &= (~(1<<((atom_id) (i))))
 +#define ISEXCL(e, i, j)  (gmx_bool) ((e)[((atom_id) (j))] & (1<<((atom_id) (i))))
 +#define NOTEXCL(e, i, j) !(ISEXCL(e, i, j))
 +#endif
 +
 +static int
 +round_up_to_simd_width(int length, int simd_width)
 +{
 +    int offset, newlength;
 +
 +    offset = (simd_width > 0) ? length % simd_width : 0;
 +
 +    return (offset == 0) ? length : length-offset+simd_width;
 +}
 +/************************************************
 + *
 + *  U T I L I T I E S    F O R    N S
 + *
 + ************************************************/
 +
 +void reallocate_nblist(t_nblist *nl)
 +{
 +    if (gmx_debug_at)
 +    {
 +        fprintf(debug, "reallocating neigborlist (ielec=%d, ivdw=%d, igeometry=%d, type=%d), maxnri=%d\n",
 +                nl->ielec, nl->ivdw, nl->igeometry, nl->type, nl->maxnri);
 +    }
 +    srenew(nl->iinr,   nl->maxnri);
 +    if (nl->igeometry == GMX_NBLIST_GEOMETRY_CG_CG)
 +    {
 +        srenew(nl->iinr_end, nl->maxnri);
 +    }
 +    srenew(nl->gid,    nl->maxnri);
 +    srenew(nl->shift,  nl->maxnri);
 +    srenew(nl->jindex, nl->maxnri+1);
 +}
 +
 +
 +static void init_nblist(FILE *log, t_nblist *nl_sr, t_nblist *nl_lr,
 +                        int maxsr, int maxlr,
 +                        int ivdw, int ivdwmod,
 +                        int ielec, int ielecmod,
 +                        int igeometry, int type)
 +{
 +    t_nblist *nl;
 +    int       homenr;
 +    int       i, nn;
 +
 +    for (i = 0; (i < 2); i++)
 +    {
 +        nl     = (i == 0) ? nl_sr : nl_lr;
 +        homenr = (i == 0) ? maxsr : maxlr;
 +
 +        if (nl == NULL)
 +        {
 +            continue;
 +        }
 +
 +
 +        /* Set coul/vdw in neighborlist, and for the normal loops we determine
 +         * an index of which one to call.
 +         */
 +        nl->ivdw        = ivdw;
 +        nl->ivdwmod     = ivdwmod;
 +        nl->ielec       = ielec;
 +        nl->ielecmod    = ielecmod;
 +        nl->type        = type;
 +        nl->igeometry   = igeometry;
 +
 +        if (nl->type == GMX_NBLIST_INTERACTION_FREE_ENERGY)
 +        {
 +            nl->igeometry  = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
 +        }
 +
 +        /* This will also set the simd_padding_width field */
 +        gmx_nonbonded_set_kernel_pointers( (i == 0) ? log : NULL, nl);
 +
 +        /* maxnri is influenced by the number of shifts (maximum is 8)
 +         * and the number of energy groups.
 +         * If it is not enough, nl memory will be reallocated during the run.
 +         * 4 seems to be a reasonable factor, which only causes reallocation
 +         * during runs with tiny and many energygroups.
 +         */
 +        nl->maxnri      = homenr*4;
 +        nl->maxnrj      = 0;
 +        nl->nri         = -1;
 +        nl->nrj         = 0;
 +        nl->iinr        = NULL;
 +        nl->gid         = NULL;
 +        nl->shift       = NULL;
 +        nl->jindex      = NULL;
 +        nl->jjnr        = NULL;
 +        nl->excl_fep    = NULL;
 +        reallocate_nblist(nl);
 +        nl->jindex[0] = 0;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Initiating neighbourlist (ielec=%d, ivdw=%d, type=%d) for %s interactions,\nwith %d SR, %d LR atoms.\n",
 +                    nl->ielec, nl->ivdw, nl->type, gmx_nblist_geometry_names[nl->igeometry], maxsr, maxlr);
 +        }
 +    }
 +}
 +
 +void init_neighbor_list(FILE *log, t_forcerec *fr, int homenr)
 +{
 +    /* Make maxlr tunable! (does not seem to be a big difference though)
 +     * This parameter determines the number of i particles in a long range
 +     * neighbourlist. Too few means many function calls, too many means
 +     * cache trashing.
 +     */
 +    int        maxsr, maxsr_wat, maxlr, maxlr_wat;
 +    int        ielec, ielecf, ivdw, ielecmod, ielecmodf, ivdwmod, type;
 +    int        solvent;
 +    int        igeometry_def, igeometry_w, igeometry_ww;
 +    int        i;
 +    t_nblists *nbl;
 +
 +    /* maxsr     = homenr-fr->nWatMol*3; */
 +    maxsr     = homenr;
 +
 +    if (maxsr < 0)
 +    {
 +        gmx_fatal(FARGS, "%s, %d: Negative number of short range atoms.\n"
 +                  "Call your Gromacs dealer for assistance.", __FILE__, __LINE__);
 +    }
 +    /* This is just for initial allocation, so we do not reallocate
 +     * all the nlist arrays many times in a row.
 +     * The numbers seem very accurate, but they are uncritical.
 +     */
 +    maxsr_wat = min(fr->nWatMol, (homenr+2)/3);
 +    if (fr->bTwinRange)
 +    {
 +        maxlr     = 50;
 +        maxlr_wat = min(maxsr_wat, maxlr);
 +    }
 +    else
 +    {
 +        maxlr = maxlr_wat = 0;
 +    }
 +
 +    /* Determine the values for ielec/ivdw. */
 +    ielec    = fr->nbkernel_elec_interaction;
 +    ivdw     = fr->nbkernel_vdw_interaction;
 +    ielecmod = fr->nbkernel_elec_modifier;
 +    ivdwmod  = fr->nbkernel_vdw_modifier;
 +    type     = GMX_NBLIST_INTERACTION_STANDARD;
 +
 +    fr->ns.bCGlist = (getenv("GMX_NBLISTCG") != 0);
 +    if (!fr->ns.bCGlist)
 +    {
 +        igeometry_def = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
 +    }
 +    else
 +    {
 +        igeometry_def = GMX_NBLIST_GEOMETRY_CG_CG;
 +        if (log != NULL)
 +        {
 +            fprintf(log, "\nUsing charge-group - charge-group neighbor lists and kernels\n\n");
 +        }
 +    }
 +
 +    if (fr->solvent_opt == esolTIP4P)
 +    {
 +        igeometry_w  = GMX_NBLIST_GEOMETRY_WATER4_PARTICLE;
 +        igeometry_ww = GMX_NBLIST_GEOMETRY_WATER4_WATER4;
 +    }
 +    else
 +    {
 +        igeometry_w  = GMX_NBLIST_GEOMETRY_WATER3_PARTICLE;
 +        igeometry_ww = GMX_NBLIST_GEOMETRY_WATER3_WATER3;
 +    }
 +
 +    for (i = 0; i < fr->nnblists; i++)
 +    {
 +        nbl = &(fr->nblists[i]);
 +
 +        if ((fr->adress_type != eAdressOff) && (i >= fr->nnblists/2))
 +        {
 +            type = GMX_NBLIST_INTERACTION_ADRESS;
 +        }
 +        init_nblist(log, &nbl->nlist_sr[eNL_VDWQQ], &nbl->nlist_lr[eNL_VDWQQ],
 +                    maxsr, maxlr, ivdw, ivdwmod, ielec, ielecmod, igeometry_def, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_VDW], &nbl->nlist_lr[eNL_VDW],
 +                    maxsr, maxlr, ivdw, ivdwmod, GMX_NBKERNEL_ELEC_NONE, eintmodNONE, igeometry_def, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_QQ], &nbl->nlist_lr[eNL_QQ],
 +                    maxsr, maxlr, GMX_NBKERNEL_VDW_NONE, eintmodNONE, ielec, ielecmod, igeometry_def, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_VDWQQ_WATER], &nbl->nlist_lr[eNL_VDWQQ_WATER],
 +                    maxsr_wat, maxlr_wat, ivdw, ivdwmod, ielec, ielecmod, igeometry_w, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_QQ_WATER], &nbl->nlist_lr[eNL_QQ_WATER],
 +                    maxsr_wat, maxlr_wat, GMX_NBKERNEL_VDW_NONE, eintmodNONE, ielec, ielecmod, igeometry_w, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_VDWQQ_WATERWATER], &nbl->nlist_lr[eNL_VDWQQ_WATERWATER],
 +                    maxsr_wat, maxlr_wat, ivdw, ivdwmod, ielec, ielecmod, igeometry_ww, type);
 +        init_nblist(log, &nbl->nlist_sr[eNL_QQ_WATERWATER], &nbl->nlist_lr[eNL_QQ_WATERWATER],
 +                    maxsr_wat, maxlr_wat, GMX_NBKERNEL_VDW_NONE, eintmodNONE, ielec, ielecmod, igeometry_ww, type);
 +
 +        /* Did we get the solvent loops so we can use optimized water kernels? */
 +        if (nbl->nlist_sr[eNL_VDWQQ_WATER].kernelptr_vf == NULL
 +            || nbl->nlist_sr[eNL_QQ_WATER].kernelptr_vf == NULL
 +#ifndef DISABLE_WATERWATER_NLIST
 +            || nbl->nlist_sr[eNL_VDWQQ_WATERWATER].kernelptr_vf == NULL
 +            || nbl->nlist_sr[eNL_QQ_WATERWATER].kernelptr_vf == NULL
 +#endif
 +            )
 +        {
 +            fr->solvent_opt = esolNO;
++            if (log != NULL)
++            {
++                fprintf(log, "Note: The available nonbonded kernels do not support water optimization - disabling.\n");
++            }
 +        }
 +
 +        if (fr->efep != efepNO)
 +        {
 +            if ((fr->bEwald) && (fr->sc_alphacoul > 0)) /* need to handle long range differently if using softcore */
 +            {
 +                ielecf    = GMX_NBKERNEL_ELEC_EWALD;
 +                ielecmodf = eintmodNONE;
 +            }
 +            else
 +            {
 +                ielecf    = ielec;
 +                ielecmodf = ielecmod;
 +            }
 +
 +            init_nblist(log, &nbl->nlist_sr[eNL_VDWQQ_FREE], &nbl->nlist_lr[eNL_VDWQQ_FREE],
 +                        maxsr, maxlr, ivdw, ivdwmod, ielecf, ielecmod, GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_FREE_ENERGY);
 +            init_nblist(log, &nbl->nlist_sr[eNL_VDW_FREE], &nbl->nlist_lr[eNL_VDW_FREE],
 +                        maxsr, maxlr, ivdw, ivdwmod, GMX_NBKERNEL_ELEC_NONE, eintmodNONE, GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_FREE_ENERGY);
 +            init_nblist(log, &nbl->nlist_sr[eNL_QQ_FREE], &nbl->nlist_lr[eNL_QQ_FREE],
 +                        maxsr, maxlr, GMX_NBKERNEL_VDW_NONE, eintmodNONE, ielecf, ielecmod, GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_FREE_ENERGY);
 +        }
 +    }
 +    /* QMMM MM list */
 +    if (fr->bQMMM && fr->qr->QMMMscheme != eQMMMschemeoniom)
 +    {
 +        init_nblist(log, &fr->QMMMlist, NULL,
 +                    maxsr, maxlr, 0, 0, ielec, ielecmod, GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_STANDARD);
 +    }
 +
 +    if (log != NULL)
 +    {
 +        fprintf(log, "\n");
 +    }
 +
 +    fr->ns.nblist_initialized = TRUE;
 +}
 +
 +static void reset_nblist(t_nblist *nl)
 +{
 +    nl->nri       = -1;
 +    nl->nrj       = 0;
 +    if (nl->jindex)
 +    {
 +        nl->jindex[0] = 0;
 +    }
 +}
 +
 +static void reset_neighbor_lists(t_forcerec *fr, gmx_bool bResetSR, gmx_bool bResetLR)
 +{
 +    int n, i;
 +
 +    if (fr->bQMMM)
 +    {
 +        /* only reset the short-range nblist */
 +        reset_nblist(&(fr->QMMMlist));
 +    }
 +
 +    for (n = 0; n < fr->nnblists; n++)
 +    {
 +        for (i = 0; i < eNL_NR; i++)
 +        {
 +            if (bResetSR)
 +            {
 +                reset_nblist( &(fr->nblists[n].nlist_sr[i]) );
 +            }
 +            if (bResetLR)
 +            {
 +                reset_nblist( &(fr->nblists[n].nlist_lr[i]) );
 +            }
 +        }
 +    }
 +}
 +
 +
 +
 +
 +static gmx_inline void new_i_nblist(t_nblist *nlist, atom_id i_atom, int shift, int gid)
 +{
 +    int    i, k, nri, nshift;
 +
 +    nri = nlist->nri;
 +
 +    /* Check whether we have to increase the i counter */
 +    if ((nri == -1) ||
 +        (nlist->iinr[nri]  != i_atom) ||
 +        (nlist->shift[nri] != shift) ||
 +        (nlist->gid[nri]   != gid))
 +    {
 +        /* This is something else. Now see if any entries have
 +         * been added in the list of the previous atom.
 +         */
 +        if ((nri == -1) ||
 +            ((nlist->jindex[nri+1] > nlist->jindex[nri]) &&
 +             (nlist->gid[nri] != -1)))
 +        {
 +            /* If so increase the counter */
 +            nlist->nri++;
 +            nri++;
 +            if (nlist->nri >= nlist->maxnri)
 +            {
 +                nlist->maxnri += over_alloc_large(nlist->nri);
 +                reallocate_nblist(nlist);
 +            }
 +        }
 +        /* Set the number of neighbours and the atom number */
 +        nlist->jindex[nri+1] = nlist->jindex[nri];
 +        nlist->iinr[nri]     = i_atom;
 +        nlist->gid[nri]      = gid;
 +        nlist->shift[nri]    = shift;
 +    }
 +    else
 +    {
 +        /* Adding to previous list. First remove possible previous padding */
 +        if (nlist->simd_padding_width > 1)
 +        {
 +            while (nlist->nrj > 0 && nlist->jjnr[nlist->nrj-1] < 0)
 +            {
 +                nlist->nrj--;
 +            }
 +        }
 +    }
 +}
 +
 +static gmx_inline void close_i_nblist(t_nblist *nlist)
 +{
 +    int nri = nlist->nri;
 +    int len;
 +
 +    if (nri >= 0)
 +    {
 +        /* Add elements up to padding. Since we allocate memory in units
 +         * of the simd_padding width, we do not have to check for possible
 +         * list reallocation here.
 +         */
 +        while ((nlist->nrj % nlist->simd_padding_width) != 0)
 +        {
 +            /* Use -4 here, so we can write forces for 4 atoms before real data */
 +            nlist->jjnr[nlist->nrj++] = -4;
 +        }
 +        nlist->jindex[nri+1] = nlist->nrj;
 +
 +        len = nlist->nrj -  nlist->jindex[nri];
 +    }
 +}
 +
 +static gmx_inline void close_nblist(t_nblist *nlist)
 +{
 +    /* Only close this nblist when it has been initialized.
 +     * Avoid the creation of i-lists with no j-particles.
 +     */
 +    if (nlist->nrj == 0)
 +    {
 +        /* Some assembly kernels do not support empty lists,
 +         * make sure here that we don't generate any empty lists.
 +         * With the current ns code this branch is taken in two cases:
 +         * No i-particles at all: nri=-1 here
 +         * There are i-particles, but no j-particles; nri=0 here
 +         */
 +        nlist->nri = 0;
 +    }
 +    else
 +    {
 +        /* Close list number nri by incrementing the count */
 +        nlist->nri++;
 +    }
 +}
 +
 +static gmx_inline void close_neighbor_lists(t_forcerec *fr, gmx_bool bMakeQMMMnblist)
 +{
 +    int n, i;
 +
 +    if (bMakeQMMMnblist)
 +    {
 +        close_nblist(&(fr->QMMMlist));
 +    }
 +
 +    for (n = 0; n < fr->nnblists; n++)
 +    {
 +        for (i = 0; (i < eNL_NR); i++)
 +        {
 +            close_nblist(&(fr->nblists[n].nlist_sr[i]));
 +            close_nblist(&(fr->nblists[n].nlist_lr[i]));
 +        }
 +    }
 +}
 +
 +
 +static gmx_inline void add_j_to_nblist(t_nblist *nlist, atom_id j_atom, gmx_bool bLR)
 +{
 +    int nrj = nlist->nrj;
 +
 +    if (nlist->nrj >= nlist->maxnrj)
 +    {
 +        nlist->maxnrj = round_up_to_simd_width(over_alloc_small(nlist->nrj + 1), nlist->simd_padding_width);
 +
 +        if (gmx_debug_at)
 +        {
 +            fprintf(debug, "Increasing %s nblist (ielec=%d,ivdw=%d,type=%d,igeometry=%d) j size to %d\n",
 +                    bLR ? "LR" : "SR", nlist->ielec, nlist->ivdw, nlist->type, nlist->igeometry, nlist->maxnrj);
 +        }
 +
 +        srenew(nlist->jjnr, nlist->maxnrj);
 +    }
 +
 +    nlist->jjnr[nrj] = j_atom;
 +    nlist->nrj++;
 +}
 +
 +static gmx_inline void add_j_to_nblist_cg(t_nblist *nlist,
 +                                          atom_id j_start, int j_end,
 +                                          t_excl *bexcl, gmx_bool i_is_j,
 +                                          gmx_bool bLR)
 +{
 +    int nrj = nlist->nrj;
 +    int j;
 +
 +    if (nlist->nrj >= nlist->maxnrj)
 +    {
 +        nlist->maxnrj = over_alloc_small(nlist->nrj + 1);
 +        if (gmx_debug_at)
 +        {
 +            fprintf(debug, "Increasing %s nblist (ielec=%d,ivdw=%d,type=%d,igeometry=%d) j size to %d\n",
 +                    bLR ? "LR" : "SR", nlist->ielec, nlist->ivdw, nlist->type, nlist->igeometry, nlist->maxnrj);
 +        }
 +
 +        srenew(nlist->jjnr, nlist->maxnrj);
 +        srenew(nlist->jjnr_end, nlist->maxnrj);
 +        srenew(nlist->excl, nlist->maxnrj*MAX_CGCGSIZE);
 +    }
 +
 +    nlist->jjnr[nrj]     = j_start;
 +    nlist->jjnr_end[nrj] = j_end;
 +
 +    if (j_end - j_start > MAX_CGCGSIZE)
 +    {
 +        gmx_fatal(FARGS, "The charge-group - charge-group neighborlist do not support charge groups larger than %d, found a charge group of size %d", MAX_CGCGSIZE, j_end-j_start);
 +    }
 +
 +    /* Set the exclusions */
 +    for (j = j_start; j < j_end; j++)
 +    {
 +        nlist->excl[nrj*MAX_CGCGSIZE + j - j_start] = bexcl[j];
 +    }
 +    if (i_is_j)
 +    {
 +        /* Avoid double counting of intra-cg interactions */
 +        for (j = 1; j < j_end-j_start; j++)
 +        {
 +            nlist->excl[nrj*MAX_CGCGSIZE + j] |= (1<<j) - 1;
 +        }
 +    }
 +
 +    nlist->nrj++;
 +}
 +
 +typedef void
 +    put_in_list_t (gmx_bool              bHaveVdW[],
 +                   int                   ngid,
 +                   t_mdatoms     *       md,
 +                   int                   icg,
 +                   int                   jgid,
 +                   int                   nj,
 +                   atom_id               jjcg[],
 +                   atom_id               index[],
 +                   t_excl                bExcl[],
 +                   int                   shift,
 +                   t_forcerec     *      fr,
 +                   gmx_bool              bLR,
 +                   gmx_bool              bDoVdW,
 +                   gmx_bool              bDoCoul,
 +                   int                   solvent_opt);
 +
 +static void
 +put_in_list_at(gmx_bool              bHaveVdW[],
 +               int                   ngid,
 +               t_mdatoms     *       md,
 +               int                   icg,
 +               int                   jgid,
 +               int                   nj,
 +               atom_id               jjcg[],
 +               atom_id               index[],
 +               t_excl                bExcl[],
 +               int                   shift,
 +               t_forcerec     *      fr,
 +               gmx_bool              bLR,
 +               gmx_bool              bDoVdW,
 +               gmx_bool              bDoCoul,
 +               int                   solvent_opt)
 +{
 +    /* The a[] index has been removed,
 +     * to put it back in i_atom should be a[i0] and jj should be a[jj].
 +     */
 +    t_nblist  *   vdwc;
 +    t_nblist  *   vdw;
 +    t_nblist  *   coul;
 +    t_nblist  *   vdwc_free  = NULL;
 +    t_nblist  *   vdw_free   = NULL;
 +    t_nblist  *   coul_free  = NULL;
 +    t_nblist  *   vdwc_ww    = NULL;
 +    t_nblist  *   coul_ww    = NULL;
 +
 +    int           i, j, jcg, igid, gid, nbl_ind, ind_ij;
 +    atom_id       jj, jj0, jj1, i_atom;
 +    int           i0, nicg, len;
 +
 +    int          *cginfo;
 +    int          *type, *typeB;
 +    real         *charge, *chargeB;
 +    real          qi, qiB, qq, rlj;
 +    gmx_bool      bFreeEnergy, bFree, bFreeJ, bNotEx, *bPert;
 +    gmx_bool      bDoVdW_i, bDoCoul_i, bDoCoul_i_sol;
 +    int           iwater, jwater;
 +    t_nblist     *nlist;
 +
 +    /* Copy some pointers */
 +    cginfo  = fr->cginfo;
 +    charge  = md->chargeA;
 +    chargeB = md->chargeB;
 +    type    = md->typeA;
 +    typeB   = md->typeB;
 +    bPert   = md->bPerturbed;
 +
 +    /* Get atom range */
 +    i0     = index[icg];
 +    nicg   = index[icg+1]-i0;
 +
 +    /* Get the i charge group info */
 +    igid   = GET_CGINFO_GID(cginfo[icg]);
 +
 +    iwater = (solvent_opt != esolNO) ? GET_CGINFO_SOLOPT(cginfo[icg]) : esolNO;
 +
 +    bFreeEnergy = FALSE;
 +    if (md->nPerturbed)
 +    {
 +        /* Check if any of the particles involved are perturbed.
 +         * If not we can do the cheaper normal put_in_list
 +         * and use more solvent optimization.
 +         */
 +        for (i = 0; i < nicg; i++)
 +        {
 +            bFreeEnergy |= bPert[i0+i];
 +        }
 +        /* Loop over the j charge groups */
 +        for (j = 0; (j < nj && !bFreeEnergy); j++)
 +        {
 +            jcg = jjcg[j];
 +            jj0 = index[jcg];
 +            jj1 = index[jcg+1];
 +            /* Finally loop over the atoms in the j-charge group */
 +            for (jj = jj0; jj < jj1; jj++)
 +            {
 +                bFreeEnergy |= bPert[jj];
 +            }
 +        }
 +    }
 +
 +    /* Unpack pointers to neighbourlist structs */
 +    if (fr->nnblists == 1)
 +    {
 +        nbl_ind = 0;
 +    }
 +    else
 +    {
 +        nbl_ind = fr->gid2nblists[GID(igid, jgid, ngid)];
 +    }
 +    if (bLR)
 +    {
 +        nlist = fr->nblists[nbl_ind].nlist_lr;
 +    }
 +    else
 +    {
 +        nlist = fr->nblists[nbl_ind].nlist_sr;
 +    }
 +
 +    if (iwater != esolNO)
 +    {
 +        vdwc = &nlist[eNL_VDWQQ_WATER];
 +        vdw  = &nlist[eNL_VDW];
 +        coul = &nlist[eNL_QQ_WATER];
 +#ifndef DISABLE_WATERWATER_NLIST
 +        vdwc_ww = &nlist[eNL_VDWQQ_WATERWATER];
 +        coul_ww = &nlist[eNL_QQ_WATERWATER];
 +#endif
 +    }
 +    else
 +    {
 +        vdwc = &nlist[eNL_VDWQQ];
 +        vdw  = &nlist[eNL_VDW];
 +        coul = &nlist[eNL_QQ];
 +    }
 +
 +    if (!bFreeEnergy)
 +    {
 +        if (iwater != esolNO)
 +        {
 +            /* Loop over the atoms in the i charge group */
 +            i_atom  = i0;
 +            gid     = GID(igid, jgid, ngid);
 +            /* Create new i_atom for each energy group */
 +            if (bDoCoul && bDoVdW)
 +            {
 +                new_i_nblist(vdwc, i_atom, shift, gid);
 +#ifndef DISABLE_WATERWATER_NLIST
 +                new_i_nblist(vdwc_ww, i_atom, shift, gid);
 +#endif
 +            }
 +            if (bDoVdW)
 +            {
 +                new_i_nblist(vdw, i_atom, shift, gid);
 +            }
 +            if (bDoCoul)
 +            {
 +                new_i_nblist(coul, i_atom, shift, gid);
 +#ifndef DISABLE_WATERWATER_NLIST
 +                new_i_nblist(coul_ww, i_atom, shift, gid);
 +#endif
 +            }
 +            /* Loop over the j charge groups */
 +            for (j = 0; (j < nj); j++)
 +            {
 +                jcg = jjcg[j];
 +
 +                if (jcg == icg)
 +                {
 +                    continue;
 +                }
 +
 +                jj0    = index[jcg];
 +                jwater = GET_CGINFO_SOLOPT(cginfo[jcg]);
 +
 +                if (iwater == esolSPC && jwater == esolSPC)
 +                {
 +                    /* Interaction between two SPC molecules */
 +                    if (!bDoCoul)
 +                    {
 +                        /* VdW only - only first atoms in each water interact */
 +                        add_j_to_nblist(vdw, jj0, bLR);
 +                    }
 +                    else
 +                    {
 +#ifdef DISABLE_WATERWATER_NLIST
 +                        /* Add entries for the three atoms - only do VdW if we need to */
 +                        if (!bDoVdW)
 +                        {
 +                            add_j_to_nblist(coul, jj0, bLR);
 +                        }
 +                        else
 +                        {
 +                            add_j_to_nblist(vdwc, jj0, bLR);
 +                        }
 +                        add_j_to_nblist(coul, jj0+1, bLR);
 +                        add_j_to_nblist(coul, jj0+2, bLR);
 +#else
 +                        /* One entry for the entire water-water interaction */
 +                        if (!bDoVdW)
 +                        {
 +                            add_j_to_nblist(coul_ww, jj0, bLR);
 +                        }
 +                        else
 +                        {
 +                            add_j_to_nblist(vdwc_ww, jj0, bLR);
 +                        }
 +#endif
 +                    }
 +                }
 +                else if (iwater == esolTIP4P && jwater == esolTIP4P)
 +                {
 +                    /* Interaction between two TIP4p molecules */
 +                    if (!bDoCoul)
 +                    {
 +                        /* VdW only - only first atoms in each water interact */
 +                        add_j_to_nblist(vdw, jj0, bLR);
 +                    }
 +                    else
 +                    {
 +#ifdef DISABLE_WATERWATER_NLIST
 +                        /* Add entries for the four atoms - only do VdW if we need to */
 +                        if (bDoVdW)
 +                        {
 +                            add_j_to_nblist(vdw, jj0, bLR);
 +                        }
 +                        add_j_to_nblist(coul, jj0+1, bLR);
 +                        add_j_to_nblist(coul, jj0+2, bLR);
 +                        add_j_to_nblist(coul, jj0+3, bLR);
 +#else
 +                        /* One entry for the entire water-water interaction */
 +                        if (!bDoVdW)
 +                        {
 +                            add_j_to_nblist(coul_ww, jj0, bLR);
 +                        }
 +                        else
 +                        {
 +                            add_j_to_nblist(vdwc_ww, jj0, bLR);
 +                        }
 +#endif
 +                    }
 +                }
 +                else
 +                {
 +                    /* j charge group is not water, but i is.
 +                     * Add entries to the water-other_atom lists; the geometry of the water
 +                     * molecule doesn't matter - that is taken care of in the nonbonded kernel,
 +                     * so we don't care if it is SPC or TIP4P...
 +                     */
 +
 +                    jj1 = index[jcg+1];
 +
 +                    if (!bDoVdW)
 +                    {
 +                        for (jj = jj0; (jj < jj1); jj++)
 +                        {
 +                            if (charge[jj] != 0)
 +                            {
 +                                add_j_to_nblist(coul, jj, bLR);
 +                            }
 +                        }
 +                    }
 +                    else if (!bDoCoul)
 +                    {
 +                        for (jj = jj0; (jj < jj1); jj++)
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                add_j_to_nblist(vdw, jj, bLR);
 +                            }
 +                        }
 +                    }
 +                    else
 +                    {
 +                        /* _charge_ _groups_ interact with both coulomb and LJ */
 +                        /* Check which atoms we should add to the lists!       */
 +                        for (jj = jj0; (jj < jj1); jj++)
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                if (charge[jj] != 0)
 +                                {
 +                                    add_j_to_nblist(vdwc, jj, bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(vdw, jj, bLR);
 +                                }
 +                            }
 +                            else if (charge[jj] != 0)
 +                            {
 +                                add_j_to_nblist(coul, jj, bLR);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            close_i_nblist(vdw);
 +            close_i_nblist(coul);
 +            close_i_nblist(vdwc);
 +#ifndef DISABLE_WATERWATER_NLIST
 +            close_i_nblist(coul_ww);
 +            close_i_nblist(vdwc_ww);
 +#endif
 +        }
 +        else
 +        {
 +            /* no solvent as i charge group */
 +            /* Loop over the atoms in the i charge group */
 +            for (i = 0; i < nicg; i++)
 +            {
 +                i_atom  = i0+i;
 +                gid     = GID(igid, jgid, ngid);
 +                qi      = charge[i_atom];
 +
 +                /* Create new i_atom for each energy group */
 +                if (bDoVdW && bDoCoul)
 +                {
 +                    new_i_nblist(vdwc, i_atom, shift, gid);
 +                }
 +                if (bDoVdW)
 +                {
 +                    new_i_nblist(vdw, i_atom, shift, gid);
 +                }
 +                if (bDoCoul)
 +                {
 +                    new_i_nblist(coul, i_atom, shift, gid);
 +                }
 +                bDoVdW_i  = (bDoVdW  && bHaveVdW[type[i_atom]]);
 +                bDoCoul_i = (bDoCoul && qi != 0);
 +
 +                if (bDoVdW_i || bDoCoul_i)
 +                {
 +                    /* Loop over the j charge groups */
 +                    for (j = 0; (j < nj); j++)
 +                    {
 +                        jcg = jjcg[j];
 +
 +                        /* Check for large charge groups */
 +                        if (jcg == icg)
 +                        {
 +                            jj0 = i0 + i + 1;
 +                        }
 +                        else
 +                        {
 +                            jj0 = index[jcg];
 +                        }
 +
 +                        jj1 = index[jcg+1];
 +                        /* Finally loop over the atoms in the j-charge group */
 +                        for (jj = jj0; jj < jj1; jj++)
 +                        {
 +                            bNotEx = NOTEXCL(bExcl, i, jj);
 +
 +                            if (bNotEx)
 +                            {
 +                                if (!bDoVdW_i)
 +                                {
 +                                    if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul, jj, bLR);
 +                                    }
 +                                }
 +                                else if (!bDoCoul_i)
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        add_j_to_nblist(vdw, jj, bLR);
 +                                    }
 +                                }
 +                                else
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        if (charge[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(vdwc, jj, bLR);
 +                                        }
 +                                        else
 +                                        {
 +                                            add_j_to_nblist(vdw, jj, bLR);
 +                                        }
 +                                    }
 +                                    else if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul, jj, bLR);
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +                close_i_nblist(vdw);
 +                close_i_nblist(coul);
 +                close_i_nblist(vdwc);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* we are doing free energy */
 +        vdwc_free = &nlist[eNL_VDWQQ_FREE];
 +        vdw_free  = &nlist[eNL_VDW_FREE];
 +        coul_free = &nlist[eNL_QQ_FREE];
 +        /* Loop over the atoms in the i charge group */
 +        for (i = 0; i < nicg; i++)
 +        {
 +            i_atom  = i0+i;
 +            gid     = GID(igid, jgid, ngid);
 +            qi      = charge[i_atom];
 +            qiB     = chargeB[i_atom];
 +
 +            /* Create new i_atom for each energy group */
 +            if (bDoVdW && bDoCoul)
 +            {
 +                new_i_nblist(vdwc, i_atom, shift, gid);
 +            }
 +            if (bDoVdW)
 +            {
 +                new_i_nblist(vdw, i_atom, shift, gid);
 +            }
 +            if (bDoCoul)
 +            {
 +                new_i_nblist(coul, i_atom, shift, gid);
 +            }
 +
 +            new_i_nblist(vdw_free, i_atom, shift, gid);
 +            new_i_nblist(coul_free, i_atom, shift, gid);
 +            new_i_nblist(vdwc_free, i_atom, shift, gid);
 +
 +            bDoVdW_i  = (bDoVdW  &&
 +                         (bHaveVdW[type[i_atom]] || bHaveVdW[typeB[i_atom]]));
 +            bDoCoul_i = (bDoCoul && (qi != 0 || qiB != 0));
 +            /* For TIP4P the first atom does not have a charge,
 +             * but the last three do. So we should still put an atom
 +             * without LJ but with charge in the water-atom neighborlist
 +             * for a TIP4p i charge group.
 +             * For SPC type water the first atom has LJ and charge,
 +             * so there is no such problem.
 +             */
 +            if (iwater == esolNO)
 +            {
 +                bDoCoul_i_sol = bDoCoul_i;
 +            }
 +            else
 +            {
 +                bDoCoul_i_sol = bDoCoul;
 +            }
 +
 +            if (bDoVdW_i || bDoCoul_i_sol)
 +            {
 +                /* Loop over the j charge groups */
 +                for (j = 0; (j < nj); j++)
 +                {
 +                    jcg = jjcg[j];
 +
 +                    /* Check for large charge groups */
 +                    if (jcg == icg)
 +                    {
 +                        jj0 = i0 + i + 1;
 +                    }
 +                    else
 +                    {
 +                        jj0 = index[jcg];
 +                    }
 +
 +                    jj1 = index[jcg+1];
 +                    /* Finally loop over the atoms in the j-charge group */
 +                    bFree = bPert[i_atom];
 +                    for (jj = jj0; (jj < jj1); jj++)
 +                    {
 +                        bFreeJ = bFree || bPert[jj];
 +                        /* Complicated if, because the water H's should also
 +                         * see perturbed j-particles
 +                         */
 +                        if (iwater == esolNO || i == 0 || bFreeJ)
 +                        {
 +                            bNotEx = NOTEXCL(bExcl, i, jj);
 +
 +                            if (bNotEx)
 +                            {
 +                                if (bFreeJ)
 +                                {
 +                                    if (!bDoVdW_i)
 +                                    {
 +                                        if (charge[jj] != 0 || chargeB[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(coul_free, jj, bLR);
 +                                        }
 +                                    }
 +                                    else if (!bDoCoul_i)
 +                                    {
 +                                        if (bHaveVdW[type[jj]] || bHaveVdW[typeB[jj]])
 +                                        {
 +                                            add_j_to_nblist(vdw_free, jj, bLR);
 +                                        }
 +                                    }
 +                                    else
 +                                    {
 +                                        if (bHaveVdW[type[jj]] || bHaveVdW[typeB[jj]])
 +                                        {
 +                                            if (charge[jj] != 0 || chargeB[jj] != 0)
 +                                            {
 +                                                add_j_to_nblist(vdwc_free, jj, bLR);
 +                                            }
 +                                            else
 +                                            {
 +                                                add_j_to_nblist(vdw_free, jj, bLR);
 +                                            }
 +                                        }
 +                                        else if (charge[jj] != 0 || chargeB[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(coul_free, jj, bLR);
 +                                        }
 +                                    }
 +                                }
 +                                else if (!bDoVdW_i)
 +                                {
 +                                    /* This is done whether or not bWater is set */
 +                                    if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul, jj, bLR);
 +                                    }
 +                                }
 +                                else if (!bDoCoul_i_sol)
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        add_j_to_nblist(vdw, jj, bLR);
 +                                    }
 +                                }
 +                                else
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        if (charge[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(vdwc, jj, bLR);
 +                                        }
 +                                        else
 +                                        {
 +                                            add_j_to_nblist(vdw, jj, bLR);
 +                                        }
 +                                    }
 +                                    else if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul, jj, bLR);
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            close_i_nblist(vdw);
 +            close_i_nblist(coul);
 +            close_i_nblist(vdwc);
 +            close_i_nblist(vdw_free);
 +            close_i_nblist(coul_free);
 +            close_i_nblist(vdwc_free);
 +        }
 +    }
 +}
 +
 +static void
 +put_in_list_adress(gmx_bool              bHaveVdW[],
 +                   int                   ngid,
 +                   t_mdatoms     *       md,
 +                   int                   icg,
 +                   int                   jgid,
 +                   int                   nj,
 +                   atom_id               jjcg[],
 +                   atom_id               index[],
 +                   t_excl                bExcl[],
 +                   int                   shift,
 +                   t_forcerec     *      fr,
 +                   gmx_bool              bLR,
 +                   gmx_bool              bDoVdW,
 +                   gmx_bool              bDoCoul,
 +                   int                   solvent_opt)
 +{
 +    /* The a[] index has been removed,
 +     * to put it back in i_atom should be a[i0] and jj should be a[jj].
 +     */
 +    t_nblist  *   vdwc;
 +    t_nblist  *   vdw;
 +    t_nblist  *   coul;
 +    t_nblist  *   vdwc_adress  = NULL;
 +    t_nblist  *   vdw_adress   = NULL;
 +    t_nblist  *   coul_adress  = NULL;
 +    t_nblist  *   vdwc_ww      = NULL;
 +    t_nblist  *   coul_ww      = NULL;
 +
 +    int           i, j, jcg, igid, gid, nbl_ind, nbl_ind_adress;
 +    atom_id       jj, jj0, jj1, i_atom;
 +    int           i0, nicg, len;
 +
 +    int          *cginfo;
 +    int          *type, *typeB;
 +    real         *charge, *chargeB;
 +    real         *wf;
 +    real          qi, qiB, qq, rlj;
 +    gmx_bool      bFreeEnergy, bFree, bFreeJ, bNotEx, *bPert;
 +    gmx_bool      bDoVdW_i, bDoCoul_i, bDoCoul_i_sol;
 +    gmx_bool      b_hybrid;
 +    gmx_bool      j_all_atom;
 +    int           iwater, jwater;
 +    t_nblist     *nlist, *nlist_adress;
 +    gmx_bool      bEnergyGroupCG;
 +
 +    /* Copy some pointers */
 +    cginfo  = fr->cginfo;
 +    charge  = md->chargeA;
 +    chargeB = md->chargeB;
 +    type    = md->typeA;
 +    typeB   = md->typeB;
 +    bPert   = md->bPerturbed;
 +    wf      = md->wf;
 +
 +    /* Get atom range */
 +    i0     = index[icg];
 +    nicg   = index[icg+1]-i0;
 +
 +    /* Get the i charge group info */
 +    igid   = GET_CGINFO_GID(cginfo[icg]);
 +
 +    iwater = (solvent_opt != esolNO) ? GET_CGINFO_SOLOPT(cginfo[icg]) : esolNO;
 +
 +    if (md->nPerturbed)
 +    {
 +        gmx_fatal(FARGS, "AdResS does not support free energy pertubation\n");
 +    }
 +
 +    /* Unpack pointers to neighbourlist structs */
 +    if (fr->nnblists == 2)
 +    {
 +        nbl_ind        = 0;
 +        nbl_ind_adress = 1;
 +    }
 +    else
 +    {
 +        nbl_ind        = fr->gid2nblists[GID(igid, jgid, ngid)];
 +        nbl_ind_adress = nbl_ind+fr->nnblists/2;
 +    }
 +    if (bLR)
 +    {
 +        nlist        = fr->nblists[nbl_ind].nlist_lr;
 +        nlist_adress = fr->nblists[nbl_ind_adress].nlist_lr;
 +    }
 +    else
 +    {
 +        nlist        = fr->nblists[nbl_ind].nlist_sr;
 +        nlist_adress = fr->nblists[nbl_ind_adress].nlist_sr;
 +    }
 +
 +
 +    vdwc = &nlist[eNL_VDWQQ];
 +    vdw  = &nlist[eNL_VDW];
 +    coul = &nlist[eNL_QQ];
 +
 +    vdwc_adress = &nlist_adress[eNL_VDWQQ];
 +    vdw_adress  = &nlist_adress[eNL_VDW];
 +    coul_adress = &nlist_adress[eNL_QQ];
 +
 +    /* We do not support solvent optimization with AdResS for now.
 +       For this we would need hybrid solvent-other kernels */
 +
 +    /* no solvent as i charge group */
 +    /* Loop over the atoms in the i charge group */
 +    for (i = 0; i < nicg; i++)
 +    {
 +        i_atom  = i0+i;
 +        gid     = GID(igid, jgid, ngid);
 +        qi      = charge[i_atom];
 +
 +        /* Create new i_atom for each energy group */
 +        if (bDoVdW && bDoCoul)
 +        {
 +            new_i_nblist(vdwc, i_atom, shift, gid);
 +            new_i_nblist(vdwc_adress, i_atom, shift, gid);
 +
 +        }
 +        if (bDoVdW)
 +        {
 +            new_i_nblist(vdw, i_atom, shift, gid);
 +            new_i_nblist(vdw_adress, i_atom, shift, gid);
 +
 +        }
 +        if (bDoCoul)
 +        {
 +            new_i_nblist(coul, i_atom, shift, gid);
 +            new_i_nblist(coul_adress, i_atom, shift, gid);
 +        }
 +        bDoVdW_i  = (bDoVdW  && bHaveVdW[type[i_atom]]);
 +        bDoCoul_i = (bDoCoul && qi != 0);
 +
 +        /* Here we find out whether the energy groups interaction belong to a
 +         * coarse-grained (vsite) or atomistic interaction. Note that, beacuse
 +         * interactions between coarse-grained and other (atomistic) energygroups
 +         * are excluded automatically by grompp, it is sufficient to check for
 +         * the group id of atom i (igid) */
 +        bEnergyGroupCG = !egp_explicit(fr, igid);
 +
 +        if (bDoVdW_i || bDoCoul_i)
 +        {
 +            /* Loop over the j charge groups */
 +            for (j = 0; (j < nj); j++)
 +            {
 +                jcg = jjcg[j];
 +
 +                /* Check for large charge groups */
 +                if (jcg == icg)
 +                {
 +                    jj0 = i0 + i + 1;
 +                }
 +                else
 +                {
 +                    jj0 = index[jcg];
 +                }
 +
 +                jj1 = index[jcg+1];
 +                /* Finally loop over the atoms in the j-charge group */
 +                for (jj = jj0; jj < jj1; jj++)
 +                {
 +                    bNotEx = NOTEXCL(bExcl, i, jj);
 +
 +                    /* Now we have to exclude interactions which will be zero
 +                     * anyway due to the AdResS weights (in previous implementations
 +                     * this was done in the force kernel). This is necessary as
 +                     * pure interactions (those with b_hybrid=false, i.e. w_i*w_j==1 or 0)
 +                     * are put into neighbour lists which will be passed to the
 +                     * standard (optimized) kernels for speed. The interactions with
 +                     * b_hybrid=true are placed into the _adress neighbour lists and
 +                     * processed by the generic AdResS kernel.
 +                     */
 +                    if ( (bEnergyGroupCG &&
 +                          wf[i_atom] >= 1-GMX_REAL_EPS && wf[jj] >= 1-GMX_REAL_EPS ) ||
 +                         ( !bEnergyGroupCG && wf[jj] <= GMX_REAL_EPS ) )
 +                    {
 +                        continue;
 +                    }
 +
 +                    b_hybrid = !((wf[i_atom] >= 1-GMX_REAL_EPS && wf[jj] >= 1-GMX_REAL_EPS) ||
 +                                 (wf[i_atom] <= GMX_REAL_EPS && wf[jj] <= GMX_REAL_EPS));
 +
 +                    if (bNotEx)
 +                    {
 +                        if (!bDoVdW_i)
 +                        {
 +                            if (charge[jj] != 0)
 +                            {
 +                                if (!b_hybrid)
 +                                {
 +                                    add_j_to_nblist(coul, jj, bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(coul_adress, jj, bLR);
 +                                }
 +                            }
 +                        }
 +                        else if (!bDoCoul_i)
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                if (!b_hybrid)
 +                                {
 +                                    add_j_to_nblist(vdw, jj, bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(vdw_adress, jj, bLR);
 +                                }
 +                            }
 +                        }
 +                        else
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                if (charge[jj] != 0)
 +                                {
 +                                    if (!b_hybrid)
 +                                    {
 +                                        add_j_to_nblist(vdwc, jj, bLR);
 +                                    }
 +                                    else
 +                                    {
 +                                        add_j_to_nblist(vdwc_adress, jj, bLR);
 +                                    }
 +                                }
 +                                else
 +                                {
 +                                    if (!b_hybrid)
 +                                    {
 +                                        add_j_to_nblist(vdw, jj, bLR);
 +                                    }
 +                                    else
 +                                    {
 +                                        add_j_to_nblist(vdw_adress, jj, bLR);
 +                                    }
 +
 +                                }
 +                            }
 +                            else if (charge[jj] != 0)
 +                            {
 +                                if (!b_hybrid)
 +                                {
 +                                    add_j_to_nblist(coul, jj, bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(coul_adress, jj, bLR);
 +                                }
 +
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +
 +            close_i_nblist(vdw);
 +            close_i_nblist(coul);
 +            close_i_nblist(vdwc);
 +            close_i_nblist(vdw_adress);
 +            close_i_nblist(coul_adress);
 +            close_i_nblist(vdwc_adress);
 +        }
 +    }
 +}
 +
 +static void
 +put_in_list_qmmm(gmx_bool gmx_unused              bHaveVdW[],
 +                 int                              ngid,
 +                 t_mdatoms gmx_unused     *       md,
 +                 int                              icg,
 +                 int                              jgid,
 +                 int                              nj,
 +                 atom_id                          jjcg[],
 +                 atom_id                          index[],
 +                 t_excl                           bExcl[],
 +                 int                              shift,
 +                 t_forcerec                *      fr,
 +                 gmx_bool                         bLR,
 +                 gmx_bool  gmx_unused             bDoVdW,
 +                 gmx_bool  gmx_unused             bDoCoul,
 +                 int       gmx_unused             solvent_opt)
 +{
 +    t_nblist  *   coul;
 +    int           i, j, jcg, igid, gid;
 +    atom_id       jj, jj0, jj1, i_atom;
 +    int           i0, nicg;
 +    gmx_bool      bNotEx;
 +
 +    /* Get atom range */
 +    i0     = index[icg];
 +    nicg   = index[icg+1]-i0;
 +
 +    /* Get the i charge group info */
 +    igid   = GET_CGINFO_GID(fr->cginfo[icg]);
 +
 +    coul = &fr->QMMMlist;
 +
 +    /* Loop over atoms in the ith charge group */
 +    for (i = 0; i < nicg; i++)
 +    {
 +        i_atom = i0+i;
 +        gid    = GID(igid, jgid, ngid);
 +        /* Create new i_atom for each energy group */
 +        new_i_nblist(coul, i_atom, shift, gid);
 +
 +        /* Loop over the j charge groups */
 +        for (j = 0; j < nj; j++)
 +        {
 +            jcg = jjcg[j];
 +
 +            /* Charge groups cannot have QM and MM atoms simultaneously */
 +            if (jcg != icg)
 +            {
 +                jj0 = index[jcg];
 +                jj1 = index[jcg+1];
 +                /* Finally loop over the atoms in the j-charge group */
 +                for (jj = jj0; jj < jj1; jj++)
 +                {
 +                    bNotEx = NOTEXCL(bExcl, i, jj);
 +                    if (bNotEx)
 +                    {
 +                        add_j_to_nblist(coul, jj, bLR);
 +                    }
 +                }
 +            }
 +        }
 +        close_i_nblist(coul);
 +    }
 +}
 +
 +static void
 +put_in_list_cg(gmx_bool  gmx_unused             bHaveVdW[],
 +               int                              ngid,
 +               t_mdatoms  gmx_unused    *       md,
 +               int                              icg,
 +               int                              jgid,
 +               int                              nj,
 +               atom_id                          jjcg[],
 +               atom_id                          index[],
 +               t_excl                           bExcl[],
 +               int                              shift,
 +               t_forcerec                *      fr,
 +               gmx_bool                         bLR,
 +               gmx_bool   gmx_unused            bDoVdW,
 +               gmx_bool   gmx_unused            bDoCoul,
 +               int        gmx_unused            solvent_opt)
 +{
 +    int          cginfo;
 +    int          igid, gid, nbl_ind;
 +    t_nblist *   vdwc;
 +    int          j, jcg;
 +
 +    cginfo = fr->cginfo[icg];
 +
 +    igid = GET_CGINFO_GID(cginfo);
 +    gid  = GID(igid, jgid, ngid);
 +
 +    /* Unpack pointers to neighbourlist structs */
 +    if (fr->nnblists == 1)
 +    {
 +        nbl_ind = 0;
 +    }
 +    else
 +    {
 +        nbl_ind = fr->gid2nblists[gid];
 +    }
 +    if (bLR)
 +    {
 +        vdwc = &fr->nblists[nbl_ind].nlist_lr[eNL_VDWQQ];
 +    }
 +    else
 +    {
 +        vdwc = &fr->nblists[nbl_ind].nlist_sr[eNL_VDWQQ];
 +    }
 +
 +    /* Make a new neighbor list for charge group icg.
 +     * Currently simply one neighbor list is made with LJ and Coulomb.
 +     * If required, zero interactions could be removed here
 +     * or in the force loop.
 +     */
 +    new_i_nblist(vdwc, index[icg], shift, gid);
 +    vdwc->iinr_end[vdwc->nri] = index[icg+1];
 +
 +    for (j = 0; (j < nj); j++)
 +    {
 +        jcg = jjcg[j];
 +        /* Skip the icg-icg pairs if all self interactions are excluded */
 +        if (!(jcg == icg && GET_CGINFO_EXCL_INTRA(cginfo)))
 +        {
 +            /* Here we add the j charge group jcg to the list,
 +             * exclusions are also added to the list.
 +             */
 +            add_j_to_nblist_cg(vdwc, index[jcg], index[jcg+1], bExcl, icg == jcg, bLR);
 +        }
 +    }
 +
 +    close_i_nblist(vdwc);
 +}
 +
 +static void setexcl(atom_id start, atom_id end, t_blocka *excl, gmx_bool b,
 +                    t_excl bexcl[])
 +{
 +    atom_id i, k;
 +
 +    if (b)
 +    {
 +        for (i = start; i < end; i++)
 +        {
 +            for (k = excl->index[i]; k < excl->index[i+1]; k++)
 +            {
 +                SETEXCL(bexcl, i-start, excl->a[k]);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for (i = start; i < end; i++)
 +        {
 +            for (k = excl->index[i]; k < excl->index[i+1]; k++)
 +            {
 +                RMEXCL(bexcl, i-start, excl->a[k]);
 +            }
 +        }
 +    }
 +}
 +
 +int calc_naaj(int icg, int cgtot)
 +{
 +    int naaj;
 +
 +    if ((cgtot % 2) == 1)
 +    {
 +        /* Odd number of charge groups, easy */
 +        naaj = 1 + (cgtot/2);
 +    }
 +    else if ((cgtot % 4) == 0)
 +    {
 +        /* Multiple of four is hard */
 +        if (icg < cgtot/2)
 +        {
 +            if ((icg % 2) == 0)
 +            {
 +                naaj = 1+(cgtot/2);
 +            }
 +            else
 +            {
 +                naaj = cgtot/2;
 +            }
 +        }
 +        else
 +        {
 +            if ((icg % 2) == 1)
 +            {
 +                naaj = 1+(cgtot/2);
 +            }
 +            else
 +            {
 +                naaj = cgtot/2;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* cgtot/2 = odd */
 +        if ((icg % 2) == 0)
 +        {
 +            naaj = 1+(cgtot/2);
 +        }
 +        else
 +        {
 +            naaj = cgtot/2;
 +        }
 +    }
 +#ifdef DEBUG
 +    fprintf(log, "naaj=%d\n", naaj);
 +#endif
 +
 +    return naaj;
 +}
 +
 +/************************************************
 + *
 + *  S I M P L E      C O R E     S T U F F
 + *
 + ************************************************/
 +
 +static real calc_image_tric(rvec xi, rvec xj, matrix box,
 +                            rvec b_inv, int *shift)
 +{
 +    /* This code assumes that the cut-off is smaller than
 +     * a half times the smallest diagonal element of the box.
 +     */
 +    const real h25 = 2.5;
 +    real       dx, dy, dz;
 +    real       r2;
 +    int        tx, ty, tz;
 +
 +    /* Compute diff vector */
 +    dz = xj[ZZ] - xi[ZZ];
 +    dy = xj[YY] - xi[YY];
 +    dx = xj[XX] - xi[XX];
 +
 +    /* Perform NINT operation, using trunc operation, therefore
 +     * we first add 2.5 then subtract 2 again
 +     */
 +    tz  = dz*b_inv[ZZ] + h25;
 +    tz -= 2;
 +    dz -= tz*box[ZZ][ZZ];
 +    dy -= tz*box[ZZ][YY];
 +    dx -= tz*box[ZZ][XX];
 +
 +    ty  = dy*b_inv[YY] + h25;
 +    ty -= 2;
 +    dy -= ty*box[YY][YY];
 +    dx -= ty*box[YY][XX];
 +
 +    tx  = dx*b_inv[XX]+h25;
 +    tx -= 2;
 +    dx -= tx*box[XX][XX];
 +
 +    /* Distance squared */
 +    r2 = (dx*dx) + (dy*dy) + (dz*dz);
 +
 +    *shift = XYZ2IS(tx, ty, tz);
 +
 +    return r2;
 +}
 +
 +static real calc_image_rect(rvec xi, rvec xj, rvec box_size,
 +                            rvec b_inv, int *shift)
 +{
 +    const real h15 = 1.5;
 +    real       ddx, ddy, ddz;
 +    real       dx, dy, dz;
 +    real       r2;
 +    int        tx, ty, tz;
 +
 +    /* Compute diff vector */
 +    dx = xj[XX] - xi[XX];
 +    dy = xj[YY] - xi[YY];
 +    dz = xj[ZZ] - xi[ZZ];
 +
 +    /* Perform NINT operation, using trunc operation, therefore
 +     * we first add 1.5 then subtract 1 again
 +     */
 +    tx = dx*b_inv[XX] + h15;
 +    ty = dy*b_inv[YY] + h15;
 +    tz = dz*b_inv[ZZ] + h15;
 +    tx--;
 +    ty--;
 +    tz--;
 +
 +    /* Correct diff vector for translation */
 +    ddx = tx*box_size[XX] - dx;
 +    ddy = ty*box_size[YY] - dy;
 +    ddz = tz*box_size[ZZ] - dz;
 +
 +    /* Distance squared */
 +    r2 = (ddx*ddx) + (ddy*ddy) + (ddz*ddz);
 +
 +    *shift = XYZ2IS(tx, ty, tz);
 +
 +    return r2;
 +}
 +
 +static void add_simple(t_ns_buf *nsbuf, int nrj, atom_id cg_j,
 +                       gmx_bool bHaveVdW[], int ngid, t_mdatoms *md,
 +                       int icg, int jgid, t_block *cgs, t_excl bexcl[],
 +                       int shift, t_forcerec *fr, put_in_list_t *put_in_list)
 +{
 +    if (nsbuf->nj + nrj > MAX_CG)
 +    {
 +        put_in_list(bHaveVdW, ngid, md, icg, jgid, nsbuf->ncg, nsbuf->jcg,
 +                    cgs->index, bexcl, shift, fr, FALSE, TRUE, TRUE, fr->solvent_opt);
 +        /* Reset buffer contents */
 +        nsbuf->ncg = nsbuf->nj = 0;
 +    }
 +    nsbuf->jcg[nsbuf->ncg++] = cg_j;
 +    nsbuf->nj               += nrj;
 +}
 +
 +static void ns_inner_tric(rvec x[], int icg, int *i_egp_flags,
 +                          int njcg, atom_id jcg[],
 +                          matrix box, rvec b_inv, real rcut2,
 +                          t_block *cgs, t_ns_buf **ns_buf,
 +                          gmx_bool bHaveVdW[], int ngid, t_mdatoms *md,
 +                          t_excl bexcl[], t_forcerec *fr,
 +                          put_in_list_t *put_in_list)
 +{
 +    int       shift;
 +    int       j, nrj, jgid;
 +    int      *cginfo = fr->cginfo;
 +    atom_id   cg_j, *cgindex;
 +    t_ns_buf *nsbuf;
 +
 +    cgindex = cgs->index;
 +    shift   = CENTRAL;
 +    for (j = 0; (j < njcg); j++)
 +    {
 +        cg_j   = jcg[j];
 +        nrj    = cgindex[cg_j+1]-cgindex[cg_j];
 +        if (calc_image_tric(x[icg], x[cg_j], box, b_inv, &shift) < rcut2)
 +        {
 +            jgid  = GET_CGINFO_GID(cginfo[cg_j]);
 +            if (!(i_egp_flags[jgid] & EGP_EXCL))
 +            {
 +                add_simple(&ns_buf[jgid][shift], nrj, cg_j,
 +                           bHaveVdW, ngid, md, icg, jgid, cgs, bexcl, shift, fr,
 +                           put_in_list);
 +            }
 +        }
 +    }
 +}
 +
 +static void ns_inner_rect(rvec x[], int icg, int *i_egp_flags,
 +                          int njcg, atom_id jcg[],
 +                          gmx_bool bBox, rvec box_size, rvec b_inv, real rcut2,
 +                          t_block *cgs, t_ns_buf **ns_buf,
 +                          gmx_bool bHaveVdW[], int ngid, t_mdatoms *md,
 +                          t_excl bexcl[], t_forcerec *fr,
 +                          put_in_list_t *put_in_list)
 +{
 +    int       shift;
 +    int       j, nrj, jgid;
 +    int      *cginfo = fr->cginfo;
 +    atom_id   cg_j, *cgindex;
 +    t_ns_buf *nsbuf;
 +
 +    cgindex = cgs->index;
 +    if (bBox)
 +    {
 +        shift = CENTRAL;
 +        for (j = 0; (j < njcg); j++)
 +        {
 +            cg_j   = jcg[j];
 +            nrj    = cgindex[cg_j+1]-cgindex[cg_j];
 +            if (calc_image_rect(x[icg], x[cg_j], box_size, b_inv, &shift) < rcut2)
 +            {
 +                jgid  = GET_CGINFO_GID(cginfo[cg_j]);
 +                if (!(i_egp_flags[jgid] & EGP_EXCL))
 +                {
 +                    add_simple(&ns_buf[jgid][shift], nrj, cg_j,
 +                               bHaveVdW, ngid, md, icg, jgid, cgs, bexcl, shift, fr,
 +                               put_in_list);
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for (j = 0; (j < njcg); j++)
 +        {
 +            cg_j   = jcg[j];
 +            nrj    = cgindex[cg_j+1]-cgindex[cg_j];
 +            if ((rcut2 == 0) || (distance2(x[icg], x[cg_j]) < rcut2))
 +            {
 +                jgid  = GET_CGINFO_GID(cginfo[cg_j]);
 +                if (!(i_egp_flags[jgid] & EGP_EXCL))
 +                {
 +                    add_simple(&ns_buf[jgid][CENTRAL], nrj, cg_j,
 +                               bHaveVdW, ngid, md, icg, jgid, cgs, bexcl, CENTRAL, fr,
 +                               put_in_list);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* ns_simple_core needs to be adapted for QMMM still 2005 */
 +
 +static int ns_simple_core(t_forcerec *fr,
 +                          gmx_localtop_t *top,
 +                          t_mdatoms *md,
 +                          matrix box, rvec box_size,
 +                          t_excl bexcl[], atom_id *aaj,
 +                          int ngid, t_ns_buf **ns_buf,
 +                          put_in_list_t *put_in_list, gmx_bool bHaveVdW[])
 +{
 +    int          naaj, k;
 +    real         rlist2;
 +    int          nsearch, icg, jcg, igid, i0, nri, nn;
 +    int         *cginfo;
 +    t_ns_buf    *nsbuf;
 +    /* atom_id  *i_atoms; */
 +    t_block     *cgs  = &(top->cgs);
 +    t_blocka    *excl = &(top->excls);
 +    rvec         b_inv;
 +    int          m;
 +    gmx_bool     bBox, bTriclinic;
 +    int         *i_egp_flags;
 +
 +    rlist2 = sqr(fr->rlist);
 +
 +    bBox = (fr->ePBC != epbcNONE);
 +    if (bBox)
 +    {
 +        for (m = 0; (m < DIM); m++)
 +        {
 +            b_inv[m] = divide_err(1.0, box_size[m]);
 +        }
 +        bTriclinic = TRICLINIC(box);
 +    }
 +    else
 +    {
 +        bTriclinic = FALSE;
 +    }
 +
 +    cginfo = fr->cginfo;
 +
 +    nsearch = 0;
 +    for (icg = fr->cg0; (icg < fr->hcg); icg++)
 +    {
 +        /*
 +           i0        = cgs->index[icg];
 +           nri       = cgs->index[icg+1]-i0;
 +           i_atoms   = &(cgs->a[i0]);
 +           i_eg_excl = fr->eg_excl + ngid*md->cENER[*i_atoms];
 +           setexcl(nri,i_atoms,excl,TRUE,bexcl);
 +         */
 +        igid        = GET_CGINFO_GID(cginfo[icg]);
 +        i_egp_flags = fr->egp_flags + ngid*igid;
 +        setexcl(cgs->index[icg], cgs->index[icg+1], excl, TRUE, bexcl);
 +
 +        naaj = calc_naaj(icg, cgs->nr);
 +        if (bTriclinic)
 +        {
 +            ns_inner_tric(fr->cg_cm, icg, i_egp_flags, naaj, &(aaj[icg]),
 +                          box, b_inv, rlist2, cgs, ns_buf,
 +                          bHaveVdW, ngid, md, bexcl, fr, put_in_list);
 +        }
 +        else
 +        {
 +            ns_inner_rect(fr->cg_cm, icg, i_egp_flags, naaj, &(aaj[icg]),
 +                          bBox, box_size, b_inv, rlist2, cgs, ns_buf,
 +                          bHaveVdW, ngid, md, bexcl, fr, put_in_list);
 +        }
 +        nsearch += naaj;
 +
 +        for (nn = 0; (nn < ngid); nn++)
 +        {
 +            for (k = 0; (k < SHIFTS); k++)
 +            {
 +                nsbuf = &(ns_buf[nn][k]);
 +                if (nsbuf->ncg > 0)
 +                {
 +                    put_in_list(bHaveVdW, ngid, md, icg, nn, nsbuf->ncg, nsbuf->jcg,
 +                                cgs->index, bexcl, k, fr, FALSE, TRUE, TRUE, fr->solvent_opt);
 +                    nsbuf->ncg = nsbuf->nj = 0;
 +                }
 +            }
 +        }
 +        /* setexcl(nri,i_atoms,excl,FALSE,bexcl); */
 +        setexcl(cgs->index[icg], cgs->index[icg+1], excl, FALSE, bexcl);
 +    }
 +    close_neighbor_lists(fr, FALSE);
 +
 +    return nsearch;
 +}
 +
 +/************************************************
 + *
 + *    N S 5     G R I D     S T U F F
 + *
 + ************************************************/
 +
 +static gmx_inline void get_dx(int Nx, real gridx, real rc2, int xgi, real x,
 +                              int *dx0, int *dx1, real *dcx2)
 +{
 +    real dcx, tmp;
 +    int  xgi0, xgi1, i;
 +
 +    if (xgi < 0)
 +    {
 +        *dx0 = 0;
 +        xgi0 = -1;
 +        *dx1 = -1;
 +        xgi1 = 0;
 +    }
 +    else if (xgi >= Nx)
 +    {
 +        *dx0 = Nx;
 +        xgi0 = Nx-1;
 +        *dx1 = Nx-1;
 +        xgi1 = Nx;
 +    }
 +    else
 +    {
 +        dcx2[xgi] = 0;
 +        *dx0      = xgi;
 +        xgi0      = xgi-1;
 +        *dx1      = xgi;
 +        xgi1      = xgi+1;
 +    }
 +
 +    for (i = xgi0; i >= 0; i--)
 +    {
 +        dcx = (i+1)*gridx-x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        *dx0    = i;
 +        dcx2[i] = tmp;
 +    }
 +    for (i = xgi1; i < Nx; i++)
 +    {
 +        dcx = i*gridx-x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        *dx1    = i;
 +        dcx2[i] = tmp;
 +    }
 +}
 +
 +static gmx_inline void get_dx_dd(int Nx, real gridx, real rc2, int xgi, real x,
 +                                 int ncpddc, int shift_min, int shift_max,
 +                                 int *g0, int *g1, real *dcx2)
 +{
 +    real dcx, tmp;
 +    int  g_min, g_max, shift_home;
 +
 +    if (xgi < 0)
 +    {
 +        g_min = 0;
 +        g_max = Nx - 1;
 +        *g0   = 0;
 +        *g1   = -1;
 +    }
 +    else if (xgi >= Nx)
 +    {
 +        g_min = 0;
 +        g_max = Nx - 1;
 +        *g0   = Nx;
 +        *g1   = Nx - 1;
 +    }
 +    else
 +    {
 +        if (ncpddc == 0)
 +        {
 +            g_min = 0;
 +            g_max = Nx - 1;
 +        }
 +        else
 +        {
 +            if (xgi < ncpddc)
 +            {
 +                shift_home = 0;
 +            }
 +            else
 +            {
 +                shift_home = -1;
 +            }
 +            g_min = (shift_min == shift_home ? 0          : ncpddc);
 +            g_max = (shift_max == shift_home ? ncpddc - 1 : Nx - 1);
 +        }
 +        if (shift_min > 0)
 +        {
 +            *g0 = g_min;
 +            *g1 = g_min - 1;
 +        }
 +        else if (shift_max < 0)
 +        {
 +            *g0 = g_max + 1;
 +            *g1 = g_max;
 +        }
 +        else
 +        {
 +            *g0       = xgi;
 +            *g1       = xgi;
 +            dcx2[xgi] = 0;
 +        }
 +    }
 +
 +    while (*g0 > g_min)
 +    {
 +        /* Check one grid cell down */
 +        dcx = ((*g0 - 1) + 1)*gridx - x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        (*g0)--;
 +        dcx2[*g0] = tmp;
 +    }
 +
 +    while (*g1 < g_max)
 +    {
 +        /* Check one grid cell up */
 +        dcx = (*g1 + 1)*gridx - x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        (*g1)++;
 +        dcx2[*g1] = tmp;
 +    }
 +}
 +
 +
 +#define sqr(x) ((x)*(x))
 +#define calc_dx2(XI, YI, ZI, y) (sqr(XI-y[XX]) + sqr(YI-y[YY]) + sqr(ZI-y[ZZ]))
 +#define calc_cyl_dx2(XI, YI, y) (sqr(XI-y[XX]) + sqr(YI-y[YY]))
 +/****************************************************
 + *
 + *    F A S T   N E I G H B O R  S E A R C H I N G
 + *
 + *    Optimized neighboursearching routine using grid
 + *    at least 1x1x1, see GROMACS manual
 + *
 + ****************************************************/
 +
 +
 +static void get_cutoff2(t_forcerec *fr, gmx_bool bDoLongRange,
 +                        real *rvdw2, real *rcoul2,
 +                        real *rs2, real *rm2, real *rl2)
 +{
 +    *rs2 = sqr(fr->rlist);
 +
 +    if (bDoLongRange && fr->bTwinRange)
 +    {
 +        /* With plain cut-off or RF we need to make the list exactly
 +         * up to the cut-off and the cut-off's can be different,
 +         * so we can not simply set them to rlistlong.
 +         * To keep this code compatible with (exotic) old cases,
 +         * we also create lists up to rvdw/rcoulomb for PME and Ewald.
 +         * The interaction check should correspond to:
 +         * !ir_vdw/coulomb_might_be_zero_at_cutoff from inputrec.c.
 +         */
 +        if (((fr->vdwtype == evdwCUT || fr->vdwtype == evdwPME) &&
 +             fr->vdw_modifier == eintmodNONE) ||
 +            fr->rvdw <= fr->rlist)
 +        {
 +            *rvdw2  = sqr(fr->rvdw);
 +        }
 +        else
 +        {
 +            *rvdw2  = sqr(fr->rlistlong);
 +        }
 +        if (((fr->eeltype == eelCUT ||
 +              (EEL_RF(fr->eeltype) && fr->eeltype != eelRF_ZERO) ||
 +              fr->eeltype == eelPME ||
 +              fr->eeltype == eelEWALD) &&
 +             fr->coulomb_modifier == eintmodNONE) ||
 +            fr->rcoulomb <= fr->rlist)
 +        {
 +            *rcoul2 = sqr(fr->rcoulomb);
 +        }
 +        else
 +        {
 +            *rcoul2 = sqr(fr->rlistlong);
 +        }
 +    }
 +    else
 +    {
 +        /* Workaround for a gcc -O3 or -ffast-math problem */
 +        *rvdw2  = *rs2;
 +        *rcoul2 = *rs2;
 +    }
 +    *rm2 = min(*rvdw2, *rcoul2);
 +    *rl2 = max(*rvdw2, *rcoul2);
 +}
 +
 +static void init_nsgrid_lists(t_forcerec *fr, int ngid, gmx_ns_t *ns)
 +{
 +    real rvdw2, rcoul2, rs2, rm2, rl2;
 +    int  j;
 +
 +    get_cutoff2(fr, TRUE, &rvdw2, &rcoul2, &rs2, &rm2, &rl2);
 +
 +    /* Short range buffers */
 +    snew(ns->nl_sr, ngid);
 +    /* Counters */
 +    snew(ns->nsr, ngid);
 +    snew(ns->nlr_ljc, ngid);
 +    snew(ns->nlr_one, ngid);
 +
 +    /* Always allocate both list types, since rcoulomb might now change with PME load balancing */
 +    /* Long range VdW and Coul buffers */
 +    snew(ns->nl_lr_ljc, ngid);
 +    /* Long range VdW or Coul only buffers */
 +    snew(ns->nl_lr_one, ngid);
 +
 +    for (j = 0; (j < ngid); j++)
 +    {
 +        snew(ns->nl_sr[j], MAX_CG);
 +        snew(ns->nl_lr_ljc[j], MAX_CG);
 +        snew(ns->nl_lr_one[j], MAX_CG);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "ns5_core: rs2 = %g, rm2 = %g, rl2 = %g (nm^2)\n",
 +                rs2, rm2, rl2);
 +    }
 +}
 +
 +static int nsgrid_core(t_commrec *cr, t_forcerec *fr,
 +                       matrix box, int ngid,
 +                       gmx_localtop_t *top,
 +                       t_grid *grid,
 +                       t_excl bexcl[], gmx_bool *bExcludeAlleg,
 +                       t_mdatoms *md,
 +                       put_in_list_t *put_in_list,
 +                       gmx_bool bHaveVdW[],
 +                       gmx_bool bDoLongRange, gmx_bool bMakeQMMMnblist)
 +{
 +    gmx_ns_t     *ns;
 +    atom_id     **nl_lr_ljc, **nl_lr_one, **nl_sr;
 +    int          *nlr_ljc, *nlr_one, *nsr;
 +    gmx_domdec_t *dd     = NULL;
 +    t_block      *cgs    = &(top->cgs);
 +    int          *cginfo = fr->cginfo;
 +    /* atom_id *i_atoms,*cgsindex=cgs->index; */
 +    ivec          sh0, sh1, shp;
 +    int           cell_x, cell_y, cell_z;
 +    int           d, tx, ty, tz, dx, dy, dz, cj;
 +#ifdef ALLOW_OFFDIAG_LT_HALFDIAG
 +    int           zsh_ty, zsh_tx, ysh_tx;
 +#endif
 +    int           dx0, dx1, dy0, dy1, dz0, dz1;
 +    int           Nx, Ny, Nz, shift = -1, j, nrj, nns, nn = -1;
 +    real          gridx, gridy, gridz, grid_x, grid_y, grid_z;
 +    real         *dcx2, *dcy2, *dcz2;
 +    int           zgi, ygi, xgi;
 +    int           cg0, cg1, icg = -1, cgsnr, i0, igid, nri, naaj, max_jcg;
 +    int           jcg0, jcg1, jjcg, cgj0, jgid;
 +    int          *grida, *gridnra, *gridind;
 +    gmx_bool      rvdw_lt_rcoul, rcoul_lt_rvdw;
 +    rvec          xi, *cgcm, grid_offset;
 +    real          r2, rs2, rvdw2, rcoul2, rm2, rl2, XI, YI, ZI, dcx, dcy, dcz, tmp1, tmp2;
 +    int          *i_egp_flags;
 +    gmx_bool      bDomDec, bTriclinicX, bTriclinicY;
 +    ivec          ncpddc;
 +
 +    ns = &fr->ns;
 +
 +    bDomDec = DOMAINDECOMP(cr);
 +    if (bDomDec)
 +    {
 +        dd = cr->dd;
 +    }
 +
 +    bTriclinicX = ((YY < grid->npbcdim &&
 +                    (!bDomDec || dd->nc[YY] == 1) && box[YY][XX] != 0) ||
 +                   (ZZ < grid->npbcdim &&
 +                    (!bDomDec || dd->nc[ZZ] == 1) && box[ZZ][XX] != 0));
 +    bTriclinicY =  (ZZ < grid->npbcdim &&
 +                    (!bDomDec || dd->nc[ZZ] == 1) && box[ZZ][YY] != 0);
 +
 +    cgsnr    = cgs->nr;
 +
 +    get_cutoff2(fr, bDoLongRange, &rvdw2, &rcoul2, &rs2, &rm2, &rl2);
 +
 +    rvdw_lt_rcoul = (rvdw2 >= rcoul2);
 +    rcoul_lt_rvdw = (rcoul2 >= rvdw2);
 +
 +    if (bMakeQMMMnblist)
 +    {
 +        rm2 = rl2;
 +        rs2 = rl2;
 +    }
 +
 +    nl_sr     = ns->nl_sr;
 +    nsr       = ns->nsr;
 +    nl_lr_ljc = ns->nl_lr_ljc;
 +    nl_lr_one = ns->nl_lr_one;
 +    nlr_ljc   = ns->nlr_ljc;
 +    nlr_one   = ns->nlr_one;
 +
 +    /* Unpack arrays */
 +    cgcm    = fr->cg_cm;
 +    Nx      = grid->n[XX];
 +    Ny      = grid->n[YY];
 +    Nz      = grid->n[ZZ];
 +    grida   = grid->a;
 +    gridind = grid->index;
 +    gridnra = grid->nra;
 +    nns     = 0;
 +
 +    gridx      = grid->cell_size[XX];
 +    gridy      = grid->cell_size[YY];
 +    gridz      = grid->cell_size[ZZ];
 +    grid_x     = 1/gridx;
 +    grid_y     = 1/gridy;
 +    grid_z     = 1/gridz;
 +    copy_rvec(grid->cell_offset, grid_offset);
 +    copy_ivec(grid->ncpddc, ncpddc);
 +    dcx2       = grid->dcx2;
 +    dcy2       = grid->dcy2;
 +    dcz2       = grid->dcz2;
 +
 +#ifdef ALLOW_OFFDIAG_LT_HALFDIAG
 +    zsh_ty = floor(-box[ZZ][YY]/box[YY][YY]+0.5);
 +    zsh_tx = floor(-box[ZZ][XX]/box[XX][XX]+0.5);
 +    ysh_tx = floor(-box[YY][XX]/box[XX][XX]+0.5);
 +    if (zsh_tx != 0 && ysh_tx != 0)
 +    {
 +        /* This could happen due to rounding, when both ratios are 0.5 */
 +        ysh_tx = 0;
 +    }
 +#endif
 +
 +    debug_gmx();
 +
 +    if (fr->n_tpi)
 +    {
 +        /* We only want a list for the test particle */
 +        cg0 = cgsnr - 1;
 +    }
 +    else
 +    {
 +        cg0 = grid->icg0;
 +    }
 +    cg1 = grid->icg1;
 +
 +    /* Set the shift range */
 +    for (d = 0; d < DIM; d++)
 +    {
 +        sh0[d] = -1;
 +        sh1[d] = 1;
 +        /* Check if we need periodicity shifts.
 +         * Without PBC or with domain decomposition we don't need them.
 +         */
 +        if (d >= ePBC2npbcdim(fr->ePBC) || (bDomDec && dd->nc[d] > 1))
 +        {
 +            shp[d] = 0;
 +        }
 +        else
 +        {
 +            if (d == XX &&
 +                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
 +            {
 +                shp[d] = 2;
 +            }
 +            else
 +            {
 +                shp[d] = 1;
 +            }
 +        }
 +    }
 +
 +    /* Loop over charge groups */
 +    for (icg = cg0; (icg < cg1); icg++)
 +    {
 +        igid = GET_CGINFO_GID(cginfo[icg]);
 +        /* Skip this charge group if all energy groups are excluded! */
 +        if (bExcludeAlleg[igid])
 +        {
 +            continue;
 +        }
 +
 +        i0   = cgs->index[icg];
 +
 +        if (bMakeQMMMnblist)
 +        {
 +            /* Skip this charge group if it is not a QM atom while making a
 +             * QM/MM neighbourlist
 +             */
 +            if (md->bQM[i0] == FALSE)
 +            {
 +                continue; /* MM particle, go to next particle */
 +            }
 +
 +            /* Compute the number of charge groups that fall within the control
 +             * of this one (icg)
 +             */
 +            naaj    = calc_naaj(icg, cgsnr);
 +            jcg0    = icg;
 +            jcg1    = icg + naaj;
 +            max_jcg = cgsnr;
 +        }
 +        else
 +        {
 +            /* make a normal neighbourlist */
 +
 +            if (bDomDec)
 +            {
 +                /* Get the j charge-group and dd cell shift ranges */
 +                dd_get_ns_ranges(cr->dd, icg, &jcg0, &jcg1, sh0, sh1);
 +                max_jcg = 0;
 +            }
 +            else
 +            {
 +                /* Compute the number of charge groups that fall within the control
 +                 * of this one (icg)
 +                 */
 +                naaj = calc_naaj(icg, cgsnr);
 +                jcg0 = icg;
 +                jcg1 = icg + naaj;
 +
 +                if (fr->n_tpi)
 +                {
 +                    /* The i-particle is awlways the test particle,
 +                     * so we want all j-particles
 +                     */
 +                    max_jcg = cgsnr - 1;
 +                }
 +                else
 +                {
 +                    max_jcg  = jcg1 - cgsnr;
 +                }
 +            }
 +        }
 +
 +        i_egp_flags = fr->egp_flags + igid*ngid;
 +
 +        /* Set the exclusions for the atoms in charge group icg using a bitmask */
 +        setexcl(i0, cgs->index[icg+1], &top->excls, TRUE, bexcl);
 +
 +        ci2xyz(grid, icg, &cell_x, &cell_y, &cell_z);
 +
 +        /* Changed iicg to icg, DvdS 990115
 +         * (but see consistency check above, DvdS 990330)
 +         */
 +#ifdef NS5DB
 +        fprintf(log, "icg=%5d, naaj=%5d, cell %d %d %d\n",
 +                icg, naaj, cell_x, cell_y, cell_z);
 +#endif
 +        /* Loop over shift vectors in three dimensions */
 +        for (tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
 +        {
 +            ZI = cgcm[icg][ZZ]+tz*box[ZZ][ZZ];
 +            /* Calculate range of cells in Z direction that have the shift tz */
 +            zgi = cell_z + tz*Nz;
 +#define FAST_DD_NS
 +#ifndef FAST_DD_NS
 +            get_dx(Nz, gridz, rl2, zgi, ZI, &dz0, &dz1, dcz2);
 +#else
 +            get_dx_dd(Nz, gridz, rl2, zgi, ZI-grid_offset[ZZ],
 +                      ncpddc[ZZ], sh0[ZZ], sh1[ZZ], &dz0, &dz1, dcz2);
 +#endif
 +            if (dz0 > dz1)
 +            {
 +                continue;
 +            }
 +            for (ty = -shp[YY]; ty <= shp[YY]; ty++)
 +            {
 +                YI = cgcm[icg][YY]+ty*box[YY][YY]+tz*box[ZZ][YY];
 +                /* Calculate range of cells in Y direction that have the shift ty */
 +                if (bTriclinicY)
 +                {
 +                    ygi = (int)(Ny + (YI - grid_offset[YY])*grid_y) - Ny;
 +                }
 +                else
 +                {
 +                    ygi = cell_y + ty*Ny;
 +                }
 +#ifndef FAST_DD_NS
 +                get_dx(Ny, gridy, rl2, ygi, YI, &dy0, &dy1, dcy2);
 +#else
 +                get_dx_dd(Ny, gridy, rl2, ygi, YI-grid_offset[YY],
 +                          ncpddc[YY], sh0[YY], sh1[YY], &dy0, &dy1, dcy2);
 +#endif
 +                if (dy0 > dy1)
 +                {
 +                    continue;
 +                }
 +                for (tx = -shp[XX]; tx <= shp[XX]; tx++)
 +                {
 +                    XI = cgcm[icg][XX]+tx*box[XX][XX]+ty*box[YY][XX]+tz*box[ZZ][XX];
 +                    /* Calculate range of cells in X direction that have the shift tx */
 +                    if (bTriclinicX)
 +                    {
 +                        xgi = (int)(Nx + (XI - grid_offset[XX])*grid_x) - Nx;
 +                    }
 +                    else
 +                    {
 +                        xgi = cell_x + tx*Nx;
 +                    }
 +#ifndef FAST_DD_NS
 +                    get_dx(Nx, gridx, rl2, xgi*Nx, XI, &dx0, &dx1, dcx2);
 +#else
 +                    get_dx_dd(Nx, gridx, rl2, xgi, XI-grid_offset[XX],
 +                              ncpddc[XX], sh0[XX], sh1[XX], &dx0, &dx1, dcx2);
 +#endif
 +                    if (dx0 > dx1)
 +                    {
 +                        continue;
 +                    }
 +                    /* Adress: an explicit cg that has a weigthing function of 0 is excluded
 +                     *  from the neigbour list as it will not interact  */
 +                    if (fr->adress_type != eAdressOff)
 +                    {
 +                        if (md->wf[cgs->index[icg]] <= GMX_REAL_EPS && egp_explicit(fr, igid))
 +                        {
 +                            continue;
 +                        }
 +                    }
 +                    /* Get shift vector */
 +                    shift = XYZ2IS(tx, ty, tz);
 +#ifdef NS5DB
 +                    range_check(shift, 0, SHIFTS);
 +#endif
 +                    for (nn = 0; (nn < ngid); nn++)
 +                    {
 +                        nsr[nn]      = 0;
 +                        nlr_ljc[nn]  = 0;
 +                        nlr_one[nn]  = 0;
 +                    }
 +#ifdef NS5DB
 +                    fprintf(log, "shift: %2d, dx0,1: %2d,%2d, dy0,1: %2d,%2d, dz0,1: %2d,%2d\n",
 +                            shift, dx0, dx1, dy0, dy1, dz0, dz1);
 +                    fprintf(log, "cgcm: %8.3f  %8.3f  %8.3f\n", cgcm[icg][XX],
 +                            cgcm[icg][YY], cgcm[icg][ZZ]);
 +                    fprintf(log, "xi:   %8.3f  %8.3f  %8.3f\n", XI, YI, ZI);
 +#endif
 +                    for (dx = dx0; (dx <= dx1); dx++)
 +                    {
 +                        tmp1 = rl2 - dcx2[dx];
 +                        for (dy = dy0; (dy <= dy1); dy++)
 +                        {
 +                            tmp2 = tmp1 - dcy2[dy];
 +                            if (tmp2 > 0)
 +                            {
 +                                for (dz = dz0; (dz <= dz1); dz++)
 +                                {
 +                                    if (tmp2 > dcz2[dz])
 +                                    {
 +                                        /* Find grid-cell cj in which possible neighbours are */
 +                                        cj   = xyz2ci(Ny, Nz, dx, dy, dz);
 +
 +                                        /* Check out how many cgs (nrj) there in this cell */
 +                                        nrj  = gridnra[cj];
 +
 +                                        /* Find the offset in the cg list */
 +                                        cgj0 = gridind[cj];
 +
 +                                        /* Check if all j's are out of range so we
 +                                         * can skip the whole cell.
 +                                         * Should save some time, especially with DD.
 +                                         */
 +                                        if (nrj == 0 ||
 +                                            (grida[cgj0] >= max_jcg &&
 +                                             (grida[cgj0] >= jcg1 || grida[cgj0+nrj-1] < jcg0)))
 +                                        {
 +                                            continue;
 +                                        }
 +
 +                                        /* Loop over cgs */
 +                                        for (j = 0; (j < nrj); j++)
 +                                        {
 +                                            jjcg = grida[cgj0+j];
 +
 +                                            /* check whether this guy is in range! */
 +                                            if ((jjcg >= jcg0 && jjcg < jcg1) ||
 +                                                (jjcg < max_jcg))
 +                                            {
 +                                                r2 = calc_dx2(XI, YI, ZI, cgcm[jjcg]);
 +                                                if (r2 < rl2)
 +                                                {
 +                                                    /* jgid = gid[cgsatoms[cgsindex[jjcg]]]; */
 +                                                    jgid = GET_CGINFO_GID(cginfo[jjcg]);
 +                                                    /* check energy group exclusions */
 +                                                    if (!(i_egp_flags[jgid] & EGP_EXCL))
 +                                                    {
 +                                                        if (r2 < rs2)
 +                                                        {
 +                                                            if (nsr[jgid] >= MAX_CG)
 +                                                            {
 +                                                                /* Add to short-range list */
 +                                                                put_in_list(bHaveVdW, ngid, md, icg, jgid,
 +                                                                            nsr[jgid], nl_sr[jgid],
 +                                                                            cgs->index, /* cgsatoms, */ bexcl,
 +                                                                            shift, fr, FALSE, TRUE, TRUE, fr->solvent_opt);
 +                                                                nsr[jgid] = 0;
 +                                                            }
 +                                                            nl_sr[jgid][nsr[jgid]++] = jjcg;
 +                                                        }
 +                                                        else if (r2 < rm2)
 +                                                        {
 +                                                            if (nlr_ljc[jgid] >= MAX_CG)
 +                                                            {
 +                                                                /* Add to LJ+coulomb long-range list */
 +                                                                put_in_list(bHaveVdW, ngid, md, icg, jgid,
 +                                                                            nlr_ljc[jgid], nl_lr_ljc[jgid], top->cgs.index,
 +                                                                            bexcl, shift, fr, TRUE, TRUE, TRUE, fr->solvent_opt);
 +                                                                nlr_ljc[jgid] = 0;
 +                                                            }
 +                                                            nl_lr_ljc[jgid][nlr_ljc[jgid]++] = jjcg;
 +                                                        }
 +                                                        else
 +                                                        {
 +                                                            if (nlr_one[jgid] >= MAX_CG)
 +                                                            {
 +                                                                /* Add to long-range list with only coul, or only LJ */
 +                                                                put_in_list(bHaveVdW, ngid, md, icg, jgid,
 +                                                                            nlr_one[jgid], nl_lr_one[jgid], top->cgs.index,
 +                                                                            bexcl, shift, fr, TRUE, rvdw_lt_rcoul, rcoul_lt_rvdw, fr->solvent_opt);
 +                                                                nlr_one[jgid] = 0;
 +                                                            }
 +                                                            nl_lr_one[jgid][nlr_one[jgid]++] = jjcg;
 +                                                        }
 +                                                    }
 +                                                }
 +                                                nns++;
 +                                            }
 +                                        }
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +                    /* CHECK whether there is anything left in the buffers */
 +                    for (nn = 0; (nn < ngid); nn++)
 +                    {
 +                        if (nsr[nn] > 0)
 +                        {
 +                            put_in_list(bHaveVdW, ngid, md, icg, nn, nsr[nn], nl_sr[nn],
 +                                        cgs->index, /* cgsatoms, */ bexcl,
 +                                        shift, fr, FALSE, TRUE, TRUE, fr->solvent_opt);
 +                        }
 +
 +                        if (nlr_ljc[nn] > 0)
 +                        {
 +                            put_in_list(bHaveVdW, ngid, md, icg, nn, nlr_ljc[nn],
 +                                        nl_lr_ljc[nn], top->cgs.index,
 +                                        bexcl, shift, fr, TRUE, TRUE, TRUE, fr->solvent_opt);
 +                        }
 +
 +                        if (nlr_one[nn] > 0)
 +                        {
 +                            put_in_list(bHaveVdW, ngid, md, icg, nn, nlr_one[nn],
 +                                        nl_lr_one[nn], top->cgs.index,
 +                                        bexcl, shift, fr, TRUE, rvdw_lt_rcoul, rcoul_lt_rvdw, fr->solvent_opt);
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +        /* setexcl(nri,i_atoms,&top->atoms.excl,FALSE,bexcl); */
 +        setexcl(cgs->index[icg], cgs->index[icg+1], &top->excls, FALSE, bexcl);
 +    }
 +    /* No need to perform any left-over force calculations anymore (as we used to do here)
 +     * since we now save the proper long-range lists for later evaluation.
 +     */
 +
 +    debug_gmx();
 +
 +    /* Close neighbourlists */
 +    close_neighbor_lists(fr, bMakeQMMMnblist);
 +
 +    return nns;
 +}
 +
 +void ns_realloc_natoms(gmx_ns_t *ns, int natoms)
 +{
 +    int i;
 +
 +    if (natoms > ns->nra_alloc)
 +    {
 +        ns->nra_alloc = over_alloc_dd(natoms);
 +        srenew(ns->bexcl, ns->nra_alloc);
 +        for (i = 0; i < ns->nra_alloc; i++)
 +        {
 +            ns->bexcl[i] = 0;
 +        }
 +    }
 +}
 +
 +void init_ns(FILE *fplog, const t_commrec *cr,
 +             gmx_ns_t *ns, t_forcerec *fr,
 +             const gmx_mtop_t *mtop)
 +{
 +    int  mt, icg, nr_in_cg, maxcg, i, j, jcg, ngid, ncg;
 +    t_block *cgs;
 +    char *ptr;
 +
 +    /* Compute largest charge groups size (# atoms) */
 +    nr_in_cg = 1;
 +    for (mt = 0; mt < mtop->nmoltype; mt++)
 +    {
 +        cgs = &mtop->moltype[mt].cgs;
 +        for (icg = 0; (icg < cgs->nr); icg++)
 +        {
 +            nr_in_cg = max(nr_in_cg, (int)(cgs->index[icg+1]-cgs->index[icg]));
 +        }
 +    }
 +
 +    /* Verify whether largest charge group is <= max cg.
 +     * This is determined by the type of the local exclusion type
 +     * Exclusions are stored in bits. (If the type is not large
 +     * enough, enlarge it, unsigned char -> unsigned short -> unsigned long)
 +     */
 +    maxcg = sizeof(t_excl)*8;
 +    if (nr_in_cg > maxcg)
 +    {
 +        gmx_fatal(FARGS, "Max #atoms in a charge group: %d > %d\n",
 +                  nr_in_cg, maxcg);
 +    }
 +
 +    ngid = mtop->groups.grps[egcENER].nr;
 +    snew(ns->bExcludeAlleg, ngid);
 +    for (i = 0; i < ngid; i++)
 +    {
 +        ns->bExcludeAlleg[i] = TRUE;
 +        for (j = 0; j < ngid; j++)
 +        {
 +            if (!(fr->egp_flags[i*ngid+j] & EGP_EXCL))
 +            {
 +                ns->bExcludeAlleg[i] = FALSE;
 +            }
 +        }
 +    }
 +
 +    if (fr->bGrid)
 +    {
 +        /* Grid search */
 +        ns->grid = init_grid(fplog, fr);
 +        init_nsgrid_lists(fr, ngid, ns);
 +    }
 +    else
 +    {
 +        /* Simple search */
 +        snew(ns->ns_buf, ngid);
 +        for (i = 0; (i < ngid); i++)
 +        {
 +            snew(ns->ns_buf[i], SHIFTS);
 +        }
 +        ncg = ncg_mtop(mtop);
 +        snew(ns->simple_aaj, 2*ncg);
 +        for (jcg = 0; (jcg < ncg); jcg++)
 +        {
 +            ns->simple_aaj[jcg]     = jcg;
 +            ns->simple_aaj[jcg+ncg] = jcg;
 +        }
 +    }
 +
 +    /* Create array that determines whether or not atoms have VdW */
 +    snew(ns->bHaveVdW, fr->ntype);
 +    for (i = 0; (i < fr->ntype); i++)
 +    {
 +        for (j = 0; (j < fr->ntype); j++)
 +        {
 +            ns->bHaveVdW[i] = (ns->bHaveVdW[i] ||
 +                               (fr->bBHAM ?
 +                                ((BHAMA(fr->nbfp, fr->ntype, i, j) != 0) ||
 +                                 (BHAMB(fr->nbfp, fr->ntype, i, j) != 0) ||
 +                                 (BHAMC(fr->nbfp, fr->ntype, i, j) != 0)) :
 +                                ((C6(fr->nbfp, fr->ntype, i, j) != 0) ||
 +                                 (C12(fr->nbfp, fr->ntype, i, j) != 0))));
 +        }
 +    }
 +    if (debug)
 +    {
 +        pr_bvec(debug, 0, "bHaveVdW", ns->bHaveVdW, fr->ntype, TRUE);
 +    }
 +
 +    ns->nra_alloc = 0;
 +    ns->bexcl     = NULL;
 +    if (!DOMAINDECOMP(cr))
 +    {
 +        ns_realloc_natoms(ns, mtop->natoms);
 +    }
 +
 +    ns->nblist_initialized = FALSE;
 +
 +    /* nbr list debug dump */
 +    {
 +        char *ptr = getenv("GMX_DUMP_NL");
 +        if (ptr)
 +        {
 +            ns->dump_nl = strtol(ptr, NULL, 10);
 +            if (fplog)
 +            {
 +                fprintf(fplog, "GMX_DUMP_NL = %d", ns->dump_nl);
 +            }
 +        }
 +        else
 +        {
 +            ns->dump_nl = 0;
 +        }
 +    }
 +}
 +
 +
 +int search_neighbours(FILE *log, t_forcerec *fr,
 +                      matrix box,
 +                      gmx_localtop_t *top,
 +                      gmx_groups_t *groups,
 +                      t_commrec *cr,
 +                      t_nrnb *nrnb, t_mdatoms *md,
 +                      gmx_bool bFillGrid,
 +                      gmx_bool bDoLongRangeNS)
 +{
 +    t_block  *cgs = &(top->cgs);
 +    rvec     box_size, grid_x0, grid_x1;
 +    int      i, j, m, ngid;
 +    real     min_size, grid_dens;
 +    int      nsearch;
 +    gmx_bool     bGrid;
 +    char     *ptr;
 +    gmx_bool     *i_egp_flags;
 +    int      cg_start, cg_end, start, end;
 +    gmx_ns_t *ns;
 +    t_grid   *grid;
 +    gmx_domdec_zones_t *dd_zones;
 +    put_in_list_t *put_in_list;
 +
 +    ns = &fr->ns;
 +
 +    /* Set some local variables */
 +    bGrid = fr->bGrid;
 +    ngid  = groups->grps[egcENER].nr;
 +
 +    for (m = 0; (m < DIM); m++)
 +    {
 +        box_size[m] = box[m][m];
 +    }
 +
 +    if (fr->ePBC != epbcNONE)
 +    {
 +        if (sqr(fr->rlistlong) >= max_cutoff2(fr->ePBC, box))
 +        {
 +            gmx_fatal(FARGS, "One of the box vectors has become shorter than twice the cut-off length or box_yy-|box_zy| or box_zz has become smaller than the cut-off.");
 +        }
 +        if (!bGrid)
 +        {
 +            min_size = min(box_size[XX], min(box_size[YY], box_size[ZZ]));
 +            if (2*fr->rlistlong >= min_size)
 +            {
 +                gmx_fatal(FARGS, "One of the box diagonal elements has become smaller than twice the cut-off length.");
 +            }
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        ns_realloc_natoms(ns, cgs->index[cgs->nr]);
 +    }
 +    debug_gmx();
 +
 +    /* Reset the neighbourlists */
 +    reset_neighbor_lists(fr, TRUE, TRUE);
 +
 +    if (bGrid && bFillGrid)
 +    {
 +
 +        grid = ns->grid;
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_zones = domdec_zones(cr->dd);
 +        }
 +        else
 +        {
 +            dd_zones = NULL;
 +
 +            get_nsgrid_boundaries(grid->nboundeddim, box, NULL, NULL, NULL, NULL,
 +                                  cgs->nr, fr->cg_cm, grid_x0, grid_x1, &grid_dens);
 +
 +            grid_first(log, grid, NULL, NULL, box, grid_x0, grid_x1,
 +                       fr->rlistlong, grid_dens);
 +        }
 +        debug_gmx();
 +
 +        start = 0;
 +        end   = cgs->nr;
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            end = cgs->nr;
 +            fill_grid(dd_zones, grid, end, -1, end, fr->cg_cm);
 +            grid->icg0 = 0;
 +            grid->icg1 = dd_zones->izone[dd_zones->nizone-1].cg1;
 +        }
 +        else
 +        {
 +            fill_grid(NULL, grid, cgs->nr, fr->cg0, fr->hcg, fr->cg_cm);
 +            grid->icg0 = fr->cg0;
 +            grid->icg1 = fr->hcg;
 +            debug_gmx();
 +        }
 +
 +        calc_elemnr(grid, start, end, cgs->nr);
 +        calc_ptrs(grid);
 +        grid_last(grid, start, end, cgs->nr);
 +
 +        if (gmx_debug_at)
 +        {
 +            check_grid(grid);
 +            print_grid(debug, grid);
 +        }
 +    }
 +    else if (fr->n_tpi)
 +    {
 +        /* Set the grid cell index for the test particle only.
 +         * The cell to cg index is not corrected, but that does not matter.
 +         */
 +        fill_grid(NULL, ns->grid, fr->hcg, fr->hcg-1, fr->hcg, fr->cg_cm);
 +    }
 +    debug_gmx();
 +
 +    if (fr->adress_type == eAdressOff)
 +    {
 +        if (!fr->ns.bCGlist)
 +        {
 +            put_in_list = put_in_list_at;
 +        }
 +        else
 +        {
 +            put_in_list = put_in_list_cg;
 +        }
 +    }
 +    else
 +    {
 +        put_in_list = put_in_list_adress;
 +    }
 +
 +    /* Do the core! */
 +    if (bGrid)
 +    {
 +        grid    = ns->grid;
 +        nsearch = nsgrid_core(cr, fr, box, ngid, top,
 +                              grid, ns->bexcl, ns->bExcludeAlleg,
 +                              md, put_in_list, ns->bHaveVdW,
 +                              bDoLongRangeNS, FALSE);
 +
 +        /* neighbour searching withouth QMMM! QM atoms have zero charge in
 +         * the classical calculation. The charge-charge interaction
 +         * between QM and MM atoms is handled in the QMMM core calculation
 +         * (see QMMM.c). The VDW however, we'd like to compute classically
 +         * and the QM MM atom pairs have just been put in the
 +         * corresponding neighbourlists. in case of QMMM we still need to
 +         * fill a special QMMM neighbourlist that contains all neighbours
 +         * of the QM atoms. If bQMMM is true, this list will now be made:
 +         */
 +        if (fr->bQMMM && fr->qr->QMMMscheme != eQMMMschemeoniom)
 +        {
 +            nsearch += nsgrid_core(cr, fr, box, ngid, top,
 +                                   grid, ns->bexcl, ns->bExcludeAlleg,
 +                                   md, put_in_list_qmmm, ns->bHaveVdW,
 +                                   bDoLongRangeNS, TRUE);
 +        }
 +    }
 +    else
 +    {
 +        nsearch = ns_simple_core(fr, top, md, box, box_size,
 +                                 ns->bexcl, ns->simple_aaj,
 +                                 ngid, ns->ns_buf, put_in_list, ns->bHaveVdW);
 +    }
 +    debug_gmx();
 +
 +#ifdef DEBUG
 +    pr_nsblock(log);
 +#endif
 +
 +    inc_nrnb(nrnb, eNR_NS, nsearch);
 +    /* inc_nrnb(nrnb,eNR_LR,fr->nlr); */
 +
 +    return nsearch;
 +}
 +
 +int natoms_beyond_ns_buffer(t_inputrec *ir, t_forcerec *fr, t_block *cgs,
 +                            matrix scale_tot, rvec *x)
 +{
 +    int  cg0, cg1, cg, a0, a1, a, i, j;
 +    real rint, hbuf2, scale;
 +    rvec *cg_cm, cgsc;
 +    gmx_bool bIsotropic;
 +    int  nBeyond;
 +
 +    nBeyond = 0;
 +
 +    rint = max(ir->rcoulomb, ir->rvdw);
 +    if (ir->rlist < rint)
 +    {
 +        gmx_fatal(FARGS, "The neighbor search buffer has negative size: %f nm",
 +                  ir->rlist - rint);
 +    }
 +    cg_cm = fr->cg_cm;
 +
 +    cg0 = fr->cg0;
 +    cg1 = fr->hcg;
 +
 +    if (!EI_DYNAMICS(ir->eI) || !DYNAMIC_BOX(*ir))
 +    {
 +        hbuf2 = sqr(0.5*(ir->rlist - rint));
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            a0 = cgs->index[cg];
 +            a1 = cgs->index[cg+1];
 +            for (a = a0; a < a1; a++)
 +            {
 +                if (distance2(cg_cm[cg], x[a]) > hbuf2)
 +                {
 +                    nBeyond++;
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        bIsotropic = TRUE;
 +        scale      = scale_tot[0][0];
 +        for (i = 1; i < DIM; i++)
 +        {
 +            /* With anisotropic scaling, the original spherical ns volumes become
 +             * ellipsoids. To avoid costly transformations we use the minimum
 +             * eigenvalue of the scaling matrix for determining the buffer size.
 +             * Since the lower half is 0, the eigenvalues are the diagonal elements.
 +             */
 +            scale = min(scale, scale_tot[i][i]);
 +            if (scale_tot[i][i] != scale_tot[i-1][i-1])
 +            {
 +                bIsotropic = FALSE;
 +            }
 +            for (j = 0; j < i; j++)
 +            {
 +                if (scale_tot[i][j] != 0)
 +                {
 +                    bIsotropic = FALSE;
 +                }
 +            }
 +        }
 +        hbuf2 = sqr(0.5*(scale*ir->rlist - rint));
 +        if (bIsotropic)
 +        {
 +            for (cg = cg0; cg < cg1; cg++)
 +            {
 +                svmul(scale, cg_cm[cg], cgsc);
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                for (a = a0; a < a1; a++)
 +                {
 +                    if (distance2(cgsc, x[a]) > hbuf2)
 +                    {
 +                        nBeyond++;
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Anistropic scaling */
 +            for (cg = cg0; cg < cg1; cg++)
 +            {
 +                /* Since scale_tot contains the transpose of the scaling matrix,
 +                 * we need to multiply with the transpose.
 +                 */
 +                tmvmul_ur0(scale_tot, cg_cm[cg], cgsc);
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                for (a = a0; a < a1; a++)
 +                {
 +                    if (distance2(cgsc, x[a]) > hbuf2)
 +                    {
 +                        nBeyond++;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    return nBeyond;
 +}