From: Mark Abraham Date: Wed, 25 Sep 2013 09:16:18 +0000 (+0200) Subject: Merge branch release-4-6 X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=ffbffe8d9b43b9f2b00bab1ad0ed4188b4b100a1;p=alexxy%2Fgromacs.git Merge branch release-4-6 Conflicts: src/gromacs/gmxlib/bondfree.c src/gromacs/legacyheaders/bondf.h src/gromacs/legacyheaders/pme.h src/gromacs/mdlib/pme.c src/programs/mdrun/runner.c Conflicts all resolved in favour of release-4-6 branch, except where the master-branch removal of unused parameters was correct. Change-Id: Ib32a52b97b5443b86b4f0e4d575767990dc6c47e --- ffbffe8d9b43b9f2b00bab1ad0ed4188b4b100a1 diff --cc src/gromacs/gmxana/gmx_tune_pme.c index 9231655c1e,0000000000..d611bca21e mode 100644,000000..100644 --- a/src/gromacs/gmxana/gmx_tune_pme.c +++ b/src/gromacs/gmxana/gmx_tune_pme.c @@@ -1,2514 -1,0 +1,2521 @@@ +/* + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2008, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ +#ifdef HAVE_CONFIG_H +#include +#endif + + +#include +#ifdef HAVE_SYS_TIME_H +#include +#endif + + + +#include "statutil.h" +#include "typedefs.h" +#include "smalloc.h" +#include "vec.h" +#include "copyrite.h" +#include "statutil.h" +#include "tpxio.h" +#include "string2.h" +#include "readinp.h" +#include "calcgrid.h" +#include "checkpoint.h" +#include "macros.h" +#include "gmx_ana.h" +#include "names.h" +#include "perf_est.h" + + + +/* Enum for situations that can occur during log file parsing, the + * corresponding string entries can be found in do_the_tests() in + * const char* ParseLog[] */ +enum { + eParselogOK, + eParselogNotFound, + eParselogNoPerfData, + eParselogTerm, + eParselogResetProblem, + eParselogNoDDGrid, + eParselogTPXVersion, + eParselogNotParallel, ++ eParselogLargePrimeFactor, + eParselogFatal, + eParselogNr +}; + + +typedef struct +{ + int nPMEnodes; /* number of PME-only nodes used in this test */ + int nx, ny, nz; /* DD grid */ + int guessPME; /* if nPMEnodes == -1, this is the guessed number of PME nodes */ + double *Gcycles; /* This can contain more than one value if doing multiple tests */ + double Gcycles_Av; + float *ns_per_day; + float ns_per_day_Av; + float *PME_f_load; /* PME mesh/force load average*/ + float PME_f_load_Av; /* Average average ;) ... 
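+ * (i.e. the per-run PME mesh/force load, averaged once more over all repeat runs)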
*/ + char *mdrun_cmd_line; /* Mdrun command line used for this test */ +} t_perf; + + +typedef struct +{ + int nr_inputfiles; /* The number of tpr and mdp input files */ + gmx_large_int_t orig_sim_steps; /* Number of steps to be done in the real simulation */ + gmx_large_int_t orig_init_step; /* Init step for the real simulation */ + real *rcoulomb; /* The coulomb radii [0...nr_inputfiles] */ + real *rvdw; /* The vdW radii */ + real *rlist; /* Neighbourlist cutoff radius */ + real *rlistlong; + int *nkx, *nky, *nkz; + real *fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */ +} t_inputinfo; + + +static void sep_line(FILE *fp) +{ + fprintf(fp, "\n------------------------------------------------------------\n"); +} + + +/* Wrapper for system calls */ +static int gmx_system_call(char *command) +{ +#ifdef GMX_NO_SYSTEM + gmx_fatal(FARGS, "No calls to system(3) supported on this platform. Attempted to call:\n'%s'\n", command); +#else + return ( system(command) ); +#endif +} + + +/* Check if string starts with substring */ +static gmx_bool str_starts(const char *string, const char *substring) +{ + return ( strncmp(string, substring, strlen(substring)) == 0); +} + + +static void cleandata(t_perf *perfdata, int test_nr) +{ + perfdata->Gcycles[test_nr] = 0.0; + perfdata->ns_per_day[test_nr] = 0.0; + perfdata->PME_f_load[test_nr] = 0.0; + + return; +} + + +static gmx_bool is_equal(real a, real b) +{ + real diff, eps = 1.0e-7; + + + diff = a - b; + + if (diff < 0.0) + { + diff = -diff; + } + + if (diff < eps) + { + return TRUE; + } + else + { + return FALSE; + } +} + + +static void finalize(const char *fn_out) +{ + char buf[STRLEN]; + FILE *fp; + + + fp = fopen(fn_out, "r"); + fprintf(stdout, "\n\n"); + + while (fgets(buf, STRLEN-1, fp) != NULL) + { + fprintf(stdout, "%s", buf); + } + fclose(fp); + fprintf(stdout, "\n\n"); +} + + +enum { + eFoundNothing, eFoundDDStr, eFoundAccountingStr, eFoundCycleStr +}; + +static int parse_logfile(const char *logfile, const char *errfile, + t_perf *perfdata, int test_nr, int presteps, gmx_large_int_t cpt_steps, + int nnodes) +{ + FILE *fp; + char line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN]; + const char matchstrdd[] = "Domain decomposition grid"; + const char matchstrcr[] = "resetting all time and cycle counters"; + const char matchstrbal[] = "Average PME mesh/force load:"; + const char matchstring[] = "R E A L C Y C L E A N D T I M E A C C O U N T I N G"; + const char errSIG[] = "signal, stopping at the next"; + int iFound; + int procs; + float dum1, dum2, dum3, dum4; + int ndum; + int npme; + gmx_large_int_t resetsteps = -1; + gmx_bool bFoundResetStr = FALSE; + gmx_bool bResetChecked = FALSE; + + + if (!gmx_fexist(logfile)) + { + fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile); + cleandata(perfdata, test_nr); + return eParselogNotFound; + } + + fp = fopen(logfile, "r"); + perfdata->PME_f_load[test_nr] = -1.0; + perfdata->guessPME = -1; + + iFound = eFoundNothing; + if (1 == nnodes) + { + iFound = eFoundDDStr; /* Skip some case statements */ + } + + while (fgets(line, STRLEN, fp) != NULL) + { + /* Remove leading spaces */ + ltrim(line); + + /* Check for TERM and INT signals from user: */ + if (strstr(line, errSIG) != NULL) + { + fclose(fp); + cleandata(perfdata, test_nr); + return eParselogTerm; + } + + /* Check whether cycle resetting worked */ + if (presteps > 0 && !bFoundResetStr) + { + if (strstr(line, matchstrcr) != NULL) + { + sprintf(dumstring, "step %s", gmx_large_int_pfmt); + sscanf(line, dumstring, &resetsteps); + 
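/* Remember that the reset message was seen; the step it reports is compared with presteps+cpt_steps below */ +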
bFoundResetStr = TRUE; + if (resetsteps == presteps+cpt_steps) + { + bResetChecked = TRUE; + } + else + { + sprintf(dumstring, gmx_large_int_pfmt, resetsteps); + sprintf(dumstring2, gmx_large_int_pfmt, presteps+cpt_steps); + fprintf(stderr, "WARNING: Time step counters were reset at step %s,\n" + " though they were supposed to be reset at step %s!\n", + dumstring, dumstring2); + } + } + } + + /* Look for strings that appear in a certain order in the log file: */ + switch (iFound) + { + case eFoundNothing: + /* Look for domain decomp grid and separate PME nodes: */ + if (str_starts(line, matchstrdd)) + { + sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME nodes %d", + &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme); + if (perfdata->nPMEnodes == -1) + { + perfdata->guessPME = npme; + } + else if (perfdata->nPMEnodes != npme) + { + gmx_fatal(FARGS, "PME nodes from command line and output file are not identical"); + } + iFound = eFoundDDStr; + } + /* Catch a few errors that might have occured: */ + else if (str_starts(line, "There is no domain decomposition for")) + { + fclose(fp); + return eParselogNoDDGrid; + } ++ else if (str_starts(line, "The number of nodes you selected")) ++ { ++ fclose(fp); ++ return eParselogLargePrimeFactor; ++ } + else if (str_starts(line, "reading tpx file")) + { + fclose(fp); + return eParselogTPXVersion; + } + else if (str_starts(line, "The -dd or -npme option request a parallel simulation")) + { + fclose(fp); + return eParselogNotParallel; + } + break; + case eFoundDDStr: + /* Look for PME mesh/force balance (not necessarily present, though) */ + if (str_starts(line, matchstrbal)) + { + sscanf(&line[strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr])); + } + /* Look for matchstring */ + if (str_starts(line, matchstring)) + { + iFound = eFoundAccountingStr; + } + break; + case eFoundAccountingStr: + /* Already found matchstring - look for cycle data */ + if (str_starts(line, "Total ")) + { + sscanf(line, "Total %d %lf", &procs, &(perfdata->Gcycles[test_nr])); + iFound = eFoundCycleStr; + } + break; + case eFoundCycleStr: + /* Already found cycle data - look for remaining performance info and return */ + if (str_starts(line, "Performance:")) + { + ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4); + /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */ + perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1; + fclose(fp); + if (bResetChecked || presteps == 0) + { + return eParselogOK; + } + else + { + return eParselogResetProblem; + } + } + break; + } + } /* while */ + + /* Close the log file */ + fclose(fp); + + /* Check why there is no performance data in the log file. + * Did a fatal errors occur? */ + if (gmx_fexist(errfile)) + { + fp = fopen(errfile, "r"); + while (fgets(line, STRLEN, fp) != NULL) + { + if (str_starts(line, "Fatal error:") ) + { + if (fgets(line, STRLEN, fp) != NULL) + { + fprintf(stderr, "\nWARNING: An error occured during this benchmark:\n" + "%s\n", line); + } + fclose(fp); + cleandata(perfdata, test_nr); + return eParselogFatal; + } + } + fclose(fp); + } + else + { + fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile); + } + + /* Giving up ... we could not find out why there is no performance data in + * the log file. 
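+ * cleandata() zeroes the timings for this test, so analyze_data() will not count it as a successful run.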
*/ + fprintf(stdout, "No performance data in log file.\n"); + cleandata(perfdata, test_nr); + + return eParselogNoPerfData; +} + + +static gmx_bool analyze_data( + FILE *fp, + const char *fn, + t_perf **perfdata, + int nnodes, + int ntprs, + int ntests, + int nrepeats, + t_inputinfo *info, + int *index_tpr, /* OUT: Nr of mdp file with best settings */ + int *npme_optimal) /* OUT: Optimal number of PME nodes */ +{ + int i, j, k; + int line = 0, line_win = -1; + int k_win = -1, i_win = -1, winPME; + double s = 0.0; /* standard deviation */ + t_perf *pd; + char strbuf[STRLEN]; + char str_PME_f_load[13]; + gmx_bool bCanUseOrigTPR; + gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid; + + + if (nrepeats > 1) + { + sep_line(fp); + fprintf(fp, "Summary of successful runs:\n"); + fprintf(fp, "Line tpr PME nodes Gcycles Av. Std.dev. ns/day PME/f"); + if (nnodes > 1) + { + fprintf(fp, " DD grid"); + } + fprintf(fp, "\n"); + } + + + for (k = 0; k < ntprs; k++) + { + for (i = 0; i < ntests; i++) + { + /* Select the right dataset: */ + pd = &(perfdata[k][i]); + + pd->Gcycles_Av = 0.0; + pd->PME_f_load_Av = 0.0; + pd->ns_per_day_Av = 0.0; + + if (pd->nPMEnodes == -1) + { + sprintf(strbuf, "(%3d)", pd->guessPME); + } + else + { + sprintf(strbuf, " "); + } + + /* Get the average run time of a setting */ + for (j = 0; j < nrepeats; j++) + { + pd->Gcycles_Av += pd->Gcycles[j]; + pd->PME_f_load_Av += pd->PME_f_load[j]; + } + pd->Gcycles_Av /= nrepeats; + pd->PME_f_load_Av /= nrepeats; + + for (j = 0; j < nrepeats; j++) + { + if (pd->ns_per_day[j] > 0.0) + { + pd->ns_per_day_Av += pd->ns_per_day[j]; + } + else + { + /* Somehow the performance number was not aquired for this run, + * therefor set the average to some negative value: */ + pd->ns_per_day_Av = -1.0f*nrepeats; + break; + } + } + pd->ns_per_day_Av /= nrepeats; + + /* Nicer output: */ + if (pd->PME_f_load_Av > 0.0) + { + sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av); + } + else + { + sprintf(str_PME_f_load, "%s", " - "); + } + + + /* We assume we had a successful run if both averages are positive */ + if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0) + { + /* Output statistics if repeats were done */ + if (nrepeats > 1) + { + /* Calculate the standard deviation */ + s = 0.0; + for (j = 0; j < nrepeats; j++) + { + s += pow( pd->Gcycles[j] - pd->Gcycles_Av, 2 ); + } + s /= (nrepeats - 1); + s = sqrt(s); + + fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s", + line, k, pd->nPMEnodes, strbuf, pd->Gcycles_Av, s, + pd->ns_per_day_Av, str_PME_f_load); + if (nnodes > 1) + { + fprintf(fp, " %3d %3d %3d", pd->nx, pd->ny, pd->nz); + } + fprintf(fp, "\n"); + } + /* Store the index of the best run found so far in 'winner': */ + if ( (k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av) ) + { + k_win = k; + i_win = i; + line_win = line; + } + line++; + } + } + } + + if (k_win == -1) + { + gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn); + } + + sep_line(fp); + + winPME = perfdata[k_win][i_win].nPMEnodes; + + if (1 == ntests) + { + /* We stuck to a fixed number of PME-only nodes */ + sprintf(strbuf, "settings No. 
%d", k_win); + } + else + { + /* We have optimized the number of PME-only nodes */ + if (winPME == -1) + { + sprintf(strbuf, "%s", "the automatic number of PME nodes"); + } + else + { + sprintf(strbuf, "%d PME nodes", winPME); + } + } + fprintf(fp, "Best performance was achieved with %s", strbuf); + if ((nrepeats > 1) && (ntests > 1)) + { + fprintf(fp, " (see line %d)", line_win); + } + fprintf(fp, "\n"); + + /* Only mention settings if they were modified: */ + bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]); + bRefinedVdW = !is_equal(info->rvdw[k_win], info->rvdw[0] ); + bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] && + info->nky[k_win] == info->nky[0] && + info->nkz[k_win] == info->nkz[0]); + + if (bRefinedCoul || bRefinedVdW || bRefinedGrid) + { + fprintf(fp, "Optimized PME settings:\n"); + bCanUseOrigTPR = FALSE; + } + else + { + bCanUseOrigTPR = TRUE; + } + + if (bRefinedCoul) + { + fprintf(fp, " New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win], info->rcoulomb[0]); + } + + if (bRefinedVdW) + { + fprintf(fp, " New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]); + } + + if (bRefinedGrid) + { + fprintf(fp, " New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info->nkx[k_win], info->nky[k_win], info->nkz[k_win], + info->nkx[0], info->nky[0], info->nkz[0]); + } + + if (bCanUseOrigTPR && ntprs > 1) + { + fprintf(fp, "and original PME settings.\n"); + } + + fflush(fp); + + /* Return the index of the mdp file that showed the highest performance + * and the optimal number of PME nodes */ + *index_tpr = k_win; + *npme_optimal = winPME; + + return bCanUseOrigTPR; +} + + +/* Get the commands we need to set up the runs from environment variables */ +static void get_program_paths(gmx_bool bThreads, char *cmd_mpirun[], char cmd_np[], + char *cmd_mdrun[], int repeats) +{ + char *cp; + FILE *fp; + const char def_mpirun[] = "mpirun"; + const char def_mdrun[] = "mdrun"; + + const char empty_mpirun[] = ""; + + /* Get the commands we need to set up the runs from environment variables */ + if (!bThreads) + { + if ( (cp = getenv("MPIRUN")) != NULL) + { + *cmd_mpirun = strdup(cp); + } + else + { + *cmd_mpirun = strdup(def_mpirun); + } + } + else + { + *cmd_mpirun = strdup(empty_mpirun); + } + + if ( (cp = getenv("MDRUN" )) != NULL) + { + *cmd_mdrun = strdup(cp); + } + else + { + *cmd_mdrun = strdup(def_mdrun); + } +} + +/* Check that the commands will run mdrun (perhaps via mpirun) by + * running a very quick test simulation. Requires MPI environment to + * be available if applicable. */ +static void check_mdrun_works(gmx_bool bThreads, + const char *cmd_mpirun, + const char *cmd_np, + const char *cmd_mdrun) +{ + char *command = NULL; + char *cp; + char line[STRLEN]; + FILE *fp; + const char filename[] = "benchtest.log"; + + /* This string should always be identical to the one in copyrite.c, + * gmx_print_version_info() in the defined(GMX_MPI) section */ + const char match_mpi[] = "MPI library: MPI"; + const char match_mdrun[] = "Program: "; + gmx_bool bMdrun = FALSE; + gmx_bool bMPI = FALSE; + + /* Run a small test to see whether mpirun + mdrun work */ + fprintf(stdout, "Making sure that mdrun can be executed. 
"); + if (bThreads) + { + snew(command, strlen(cmd_mdrun) + strlen(cmd_np) + strlen(filename) + 50); + sprintf(command, "%s%s-version -maxh 0.001 1> %s 2>&1", cmd_mdrun, cmd_np, filename); + } + else + { + snew(command, strlen(cmd_mpirun) + strlen(cmd_np) + strlen(cmd_mdrun) + strlen(filename) + 50); + sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mpirun, cmd_np, cmd_mdrun, filename); + } + fprintf(stdout, "Trying '%s' ... ", command); + make_backup(filename); + gmx_system_call(command); + + /* Check if we find the characteristic string in the output: */ + if (!gmx_fexist(filename)) + { + gmx_fatal(FARGS, "Output from test run could not be found."); + } + + fp = fopen(filename, "r"); + /* We need to scan the whole output file, since sometimes the queuing system + * also writes stuff to stdout/err */ + while (!feof(fp) ) + { + cp = fgets(line, STRLEN, fp); + if (cp != NULL) + { + if (str_starts(line, match_mdrun) ) + { + bMdrun = TRUE; + } + if (str_starts(line, match_mpi) ) + { + bMPI = TRUE; + } + } + } + fclose(fp); + + if (bThreads) + { + if (bMPI) + { + gmx_fatal(FARGS, "Need a threaded version of mdrun. This one\n" + "(%s)\n" + "seems to have been compiled with MPI instead.", + *cmd_mdrun); + } + } + else + { + if (bMdrun && !bMPI) + { + gmx_fatal(FARGS, "Need an MPI-enabled version of mdrun. This one\n" + "(%s)\n" + "seems to have been compiled without MPI support.", + *cmd_mdrun); + } + } + + if (!bMdrun) + { + gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!", + filename); + } + + fprintf(stdout, "passed.\n"); + + /* Clean up ... */ + remove(filename); + sfree(command); +} + + +static void launch_simulation( + gmx_bool bLaunch, /* Should the simulation be launched? */ + FILE *fp, /* General log file */ + gmx_bool bThreads, /* whether to use threads */ + char *cmd_mpirun, /* Command for mpirun */ + char *cmd_np, /* Switch for -np or -ntmpi or empty */ + char *cmd_mdrun, /* Command for mdrun */ + char *args_for_mdrun, /* Arguments for mdrun */ + const char *simulation_tpr, /* This tpr will be simulated */ + int nPMEnodes) /* Number of PME nodes to use */ +{ + char *command; + + + /* Make enough space for the system call command, + * (100 extra chars for -npme ... etc. options should suffice): */ + snew(command, strlen(cmd_mpirun)+strlen(cmd_mdrun)+strlen(cmd_np)+strlen(args_for_mdrun)+strlen(simulation_tpr)+100); + + /* Note that the -passall options requires args_for_mdrun to be at the end + * of the command line string */ + if (bThreads) + { + sprintf(command, "%s%s-npme %d -s %s %s", + cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr, args_for_mdrun); + } + else + { + sprintf(command, "%s%s%s -npme %d -s %s %s", + cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes, simulation_tpr, args_for_mdrun); + } + + fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch ? "Using" : "Please use", command); + sep_line(fp); + fflush(fp); + + /* Now the real thing! 
*/ + if (bLaunch) + { + fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command); + sep_line(stdout); + fflush(stdout); + gmx_system_call(command); + } +} + + +static void modify_PMEsettings( + gmx_large_int_t simsteps, /* Set this value as number of time steps */ + gmx_large_int_t init_step, /* Set this value as init_step */ + const char *fn_best_tpr, /* tpr file with the best performance */ + const char *fn_sim_tpr) /* name of tpr file to be launched */ +{ + t_inputrec *ir; + t_state state; + gmx_mtop_t mtop; + char buf[200]; + + snew(ir, 1); + read_tpx_state(fn_best_tpr, ir, &state, NULL, &mtop); + + /* Reset nsteps and init_step to the value of the input .tpr file */ + ir->nsteps = simsteps; + ir->init_step = init_step; + + /* Write the tpr file which will be launched */ + sprintf(buf, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr, gmx_large_int_pfmt); + fprintf(stdout, buf, ir->nsteps); + fflush(stdout); + write_tpx_state(fn_sim_tpr, ir, &state, &mtop); + + sfree(ir); +} + + +#define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH) + +/* Make additional TPR files with more computational load for the + * direct space processors: */ +static void make_benchmark_tprs( + const char *fn_sim_tpr, /* READ : User-provided tpr file */ + char *fn_bench_tprs[], /* WRITE: Names of benchmark tpr files */ + gmx_large_int_t benchsteps, /* Number of time steps for benchmark runs */ + gmx_large_int_t statesteps, /* Step counter in checkpoint file */ + real rmin, /* Minimal Coulomb radius */ + real rmax, /* Maximal Coulomb radius */ + real bScaleRvdw, /* Scale rvdw along with rcoulomb */ + int *ntprs, /* No. of TPRs to write, each with a different + rcoulomb and fourierspacing */ + t_inputinfo *info, /* Contains information about mdp file options */ + FILE *fp) /* Write the output here */ +{ + int i, j, d; + t_inputrec *ir; + t_state state; + gmx_mtop_t mtop; + real nlist_buffer; /* Thickness of the buffer regions for PME-switch potentials */ + char buf[200]; + rvec box_size; + gmx_bool bNote = FALSE; + real add; /* Add this to rcoul for the next test */ + real fac = 1.0; /* Scaling factor for Coulomb radius */ + real fourierspacing; /* Basic fourierspacing from tpr */ + + + sprintf(buf, "Making benchmark tpr file%s with %s time step%s", + *ntprs > 1 ? "s" : "", gmx_large_int_pfmt, benchsteps > 1 ? "s" : ""); + fprintf(stdout, buf, benchsteps); + if (statesteps > 0) + { + sprintf(buf, " (adding %s steps from checkpoint file)", gmx_large_int_pfmt); + fprintf(stdout, buf, statesteps); + benchsteps += statesteps; + } + fprintf(stdout, ".\n"); + + + snew(ir, 1); + read_tpx_state(fn_sim_tpr, ir, &state, NULL, &mtop); + + /* Check if some kind of PME was chosen */ + if (EEL_PME(ir->coulombtype) == FALSE) + { + gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.", + EELTYPE(eelPME)); + } + + /* Check if rcoulomb == rlist, which is necessary for plain PME. 
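+ * With the group cut-off scheme, plain PME needs rlist equal to rcoulomb; all other setups only require rcoulomb <= rlist, which the else-branch below enforces.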
*/ + if ( (ir->cutoff_scheme != ecutsVERLET) && + (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist)) + { + gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).", + EELTYPE(eelPME), ir->rcoulomb, ir->rlist); + } + /* For other PME types, rcoulomb is allowed to be smaller than rlist */ + else if (ir->rcoulomb > ir->rlist) + { + gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)", + EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist); + } + + if (bScaleRvdw && ir->rvdw != ir->rcoulomb) + { + fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n"); + bScaleRvdw = FALSE; + } + + /* Reduce the number of steps for the benchmarks */ + info->orig_sim_steps = ir->nsteps; + ir->nsteps = benchsteps; + /* We must not use init_step from the input tpr file for the benchmarks */ + info->orig_init_step = ir->init_step; + ir->init_step = 0; + + /* For PME-switch potentials, keep the radial distance of the buffer region */ + nlist_buffer = ir->rlist - ir->rcoulomb; + + /* Determine length of triclinic box vectors */ + for (d = 0; d < DIM; d++) + { + box_size[d] = 0; + for (i = 0; i < DIM; i++) + { + box_size[d] += state.box[d][i]*state.box[d][i]; + } + box_size[d] = sqrt(box_size[d]); + } + + if (ir->fourier_spacing > 0) + { + info->fsx[0] = ir->fourier_spacing; + info->fsy[0] = ir->fourier_spacing; + info->fsz[0] = ir->fourier_spacing; + } + else + { + /* Reconstruct fourierspacing per dimension from the number of grid points and box size */ + info->fsx[0] = box_size[XX]/ir->nkx; + info->fsy[0] = box_size[YY]/ir->nky; + info->fsz[0] = box_size[ZZ]/ir->nkz; + } + + /* If no value for the fourierspacing was provided on the command line, we + * use the reconstruction from the tpr file */ + if (ir->fourier_spacing > 0) + { + /* Use the spacing from the tpr */ + fourierspacing = ir->fourier_spacing; + } + else + { + /* Use the maximum observed spacing */ + fourierspacing = max(max(info->fsx[0], info->fsy[0]), info->fsz[0]); + } + + fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing); + + /* For performance comparisons the number of particles is useful to have */ + fprintf(fp, " Number of particles : %d\n", mtop.natoms); + + /* Print information about settings of which some are potentially modified: */ + fprintf(fp, " Coulomb type : %s\n", EELTYPE(ir->coulombtype)); + fprintf(fp, " Grid spacing x y z : %f %f %f\n", + box_size[XX]/ir->nkx, box_size[YY]/ir->nky, box_size[ZZ]/ir->nkz); + fprintf(fp, " Van der Waals type : %s\n", EVDWTYPE(ir->vdwtype)); + if (EVDW_SWITCHED(ir->vdwtype)) + { + fprintf(fp, " rvdw_switch : %f nm\n", ir->rvdw_switch); + } + if (EPME_SWITCHED(ir->coulombtype)) + { + fprintf(fp, " rlist : %f nm\n", ir->rlist); + } + if (ir->rlistlong != max_cutoff(ir->rvdw, ir->rcoulomb)) + { + fprintf(fp, " rlistlong : %f nm\n", ir->rlistlong); + } + + /* Print a descriptive line about the tpr settings tested */ + fprintf(fp, "\nWill try these real/reciprocal workload settings:\n"); + fprintf(fp, " No. 
scaling rcoulomb"); + fprintf(fp, " nkx nky nkz"); + fprintf(fp, " spacing"); + if (evdwCUT == ir->vdwtype) + { + fprintf(fp, " rvdw"); + } + if (EPME_SWITCHED(ir->coulombtype)) + { + fprintf(fp, " rlist"); + } + if (ir->rlistlong != max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb)) ) + { + fprintf(fp, " rlistlong"); + } + fprintf(fp, " tpr file\n"); + + /* Loop to create the requested number of tpr input files */ + for (j = 0; j < *ntprs; j++) + { + /* The first .tpr is the provided one, just need to modify nsteps, + * so skip the following block */ + if (j != 0) + { + /* Determine which Coulomb radii rc to use in the benchmarks */ + add = (rmax-rmin)/(*ntprs-1); + if (is_equal(rmin, info->rcoulomb[0])) + { + ir->rcoulomb = rmin + j*add; + } + else if (is_equal(rmax, info->rcoulomb[0])) + { + ir->rcoulomb = rmin + (j-1)*add; + } + else + { + /* rmin != rcoul != rmax, ergo test between rmin and rmax */ + add = (rmax-rmin)/(*ntprs-2); + ir->rcoulomb = rmin + (j-1)*add; + } + + /* Determine the scaling factor fac */ + fac = ir->rcoulomb/info->rcoulomb[0]; + + /* Scale the Fourier grid spacing */ + ir->nkx = ir->nky = ir->nkz = 0; + calc_grid(NULL, state.box, fourierspacing*fac, &ir->nkx, &ir->nky, &ir->nkz); + + /* Adjust other radii since various conditions neet to be fulfilled */ + if (eelPME == ir->coulombtype) + { + /* plain PME, rcoulomb must be equal to rlist */ + ir->rlist = ir->rcoulomb; + } + else + { + /* rlist must be >= rcoulomb, we keep the size of the buffer region */ + ir->rlist = ir->rcoulomb + nlist_buffer; + } + + if (bScaleRvdw && evdwCUT == ir->vdwtype) + { + /* For vdw cutoff, rvdw >= rlist */ + ir->rvdw = max(info->rvdw[0], ir->rlist); + } + + ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb)); + + } /* end of "if (j != 0)" */ + + /* for j==0: Save the original settings + * for j >0: Save modified radii and Fourier grids */ + info->rcoulomb[j] = ir->rcoulomb; + info->rvdw[j] = ir->rvdw; + info->nkx[j] = ir->nkx; + info->nky[j] = ir->nky; + info->nkz[j] = ir->nkz; + info->rlist[j] = ir->rlist; + info->rlistlong[j] = ir->rlistlong; + info->fsx[j] = fac*fourierspacing; + info->fsy[j] = fac*fourierspacing; + info->fsz[j] = fac*fourierspacing; + + /* Write the benchmark tpr file */ + strncpy(fn_bench_tprs[j], fn_sim_tpr, strlen(fn_sim_tpr)-strlen(".tpr")); + sprintf(buf, "_bench%.2d.tpr", j); + strcat(fn_bench_tprs[j], buf); + fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]); + fprintf(stdout, gmx_large_int_pfmt, ir->nsteps); + if (j > 0) + { + fprintf(stdout, ", scaling factor %f\n", fac); + } + else + { + fprintf(stdout, ", unmodified settings\n"); + } + + write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop); + + /* Write information about modified tpr settings to log file */ + fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb); + fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz); + fprintf(fp, " %9f ", info->fsx[j]); + if (evdwCUT == ir->vdwtype) + { + fprintf(fp, "%10f", ir->rvdw); + } + if (EPME_SWITCHED(ir->coulombtype)) + { + fprintf(fp, "%10f", ir->rlist); + } + if (info->rlistlong[0] != max_cutoff(info->rlist[0], max_cutoff(info->rvdw[0], info->rcoulomb[0])) ) + { + fprintf(fp, "%10f", ir->rlistlong); + } + fprintf(fp, " %-14s\n", fn_bench_tprs[j]); + + /* Make it clear to the user that some additional settings were modified */ + if (!is_equal(ir->rvdw, info->rvdw[0]) + || !is_equal(ir->rlistlong, info->rlistlong[0]) ) + { + bNote = TRUE; + } + } + if (bNote) + { + fprintf(fp, "\nNote that in addition to the 
Coulomb radius and the Fourier grid\n" + "other input settings were also changed (see table above).\n" + "Please check if the modified settings are appropriate.\n"); + } + fflush(stdout); + fflush(fp); + sfree(ir); +} + + +/* Rename the files we want to keep to some meaningful filename and + * delete the rest */ +static void cleanup(const t_filenm *fnm, int nfile, int k, int nnodes, + int nPMEnodes, int nr, gmx_bool bKeepStderr) +{ + char numstring[STRLEN]; + char newfilename[STRLEN]; + const char *fn = NULL; + int i; + const char *opt; + + + fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n"); + + for (i = 0; i < nfile; i++) + { + opt = (char *)fnm[i].opt; + if (strcmp(opt, "-p") == 0) + { + /* do nothing; keep this file */ + ; + } + else if (strcmp(opt, "-bg") == 0) + { + /* Give the log file a nice name so one can later see which parameters were used */ + numstring[0] = '\0'; + if (nr > 0) + { + sprintf(numstring, "_%d", nr); + } + sprintf(newfilename, "%s_no%d_np%d_npme%d%s", opt2fn("-bg", nfile, fnm), k, nnodes, nPMEnodes, numstring); + if (gmx_fexist(opt2fn("-bg", nfile, fnm))) + { + fprintf(stdout, "renaming log file to %s\n", newfilename); + make_backup(newfilename); + rename(opt2fn("-bg", nfile, fnm), newfilename); + } + } + else if (strcmp(opt, "-err") == 0) + { + /* This file contains the output of stderr. We want to keep it in + * cases where there have been problems. */ + fn = opt2fn(opt, nfile, fnm); + numstring[0] = '\0'; + if (nr > 0) + { + sprintf(numstring, "_%d", nr); + } + sprintf(newfilename, "%s_no%d_np%d_npme%d%s", fn, k, nnodes, nPMEnodes, numstring); + if (gmx_fexist(fn)) + { + if (bKeepStderr) + { + fprintf(stdout, "Saving stderr output in %s\n", newfilename); + make_backup(newfilename); + rename(fn, newfilename); + } + else + { + fprintf(stdout, "Deleting %s\n", fn); + remove(fn); + } + } + } + /* Delete the files which are created for each benchmark run: (options -b*) */ + else if ( (0 == strncmp(opt, "-b", 2)) && (opt2bSet(opt, nfile, fnm) || !is_optional(&fnm[i])) ) + { + fn = opt2fn(opt, nfile, fnm); + if (gmx_fexist(fn)) + { + fprintf(stdout, "Deleting %s\n", fn); + remove(fn); + } + } + } +} + + +/* Returns the largest common factor of n1 and n2 */ +static int largest_common_factor(int n1, int n2) +{ + int factor, nmax; + + nmax = min(n1, n2); + for (factor = nmax; factor > 0; factor--) + { + if (0 == (n1 % factor) && 0 == (n2 % factor) ) + { + return(factor); + } + } + return 0; /* one for the compiler */ +} + +enum { + eNpmeAuto, eNpmeAll, eNpmeReduced, eNpmeSubset, eNpmeNr +}; + +/* Create a list of numbers of PME nodes to test */ +static void make_npme_list( + const char *npmevalues_opt, /* Make a complete list with all + * possibilities or a short list that keeps only + * reasonable numbers of PME nodes */ + int *nentries, /* Number of entries we put in the nPMEnodes list */ + int *nPMEnodes[], /* Each entry contains the value for -npme */ + int nnodes, /* Total number of nodes to do the tests on */ + int minPMEnodes, /* Minimum number of PME nodes */ + int maxPMEnodes) /* Maximum number of PME nodes */ +{ + int i, npme, npp; + int min_factor = 1; /* We request that npp and npme have this minimal + * largest common factor (depends on npp) */ + int nlistmax; /* Max. list size */ + int nlist; /* Actual number of entries in list */ + int eNPME = 0; + + + /* Do we need to check all possible values for -npme or is a reduced list enough? 
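+ * "all" and "subset" are taken literally; for "auto" (or "range") the choice depends on the total node count: all values up to 64 nodes, a reduced list below 128 nodes, and a subset otherwise.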
*/ + if (0 == strcmp(npmevalues_opt, "all") ) + { + eNPME = eNpmeAll; + } + else if (0 == strcmp(npmevalues_opt, "subset") ) + { + eNPME = eNpmeSubset; + } + else /* "auto" or "range" */ + { + if (nnodes <= 64) + { + eNPME = eNpmeAll; + } + else if (nnodes < 128) + { + eNPME = eNpmeReduced; + } + else + { + eNPME = eNpmeSubset; + } + } + + /* Calculate how many entries we could possibly have (in case of -npme all) */ + if (nnodes > 2) + { + nlistmax = maxPMEnodes - minPMEnodes + 3; + if (0 == minPMEnodes) + { + nlistmax--; + } + } + else + { + nlistmax = 1; + } + + /* Now make the actual list which is at most of size nlist */ + snew(*nPMEnodes, nlistmax); + nlist = 0; /* start counting again, now the real entries in the list */ + for (i = 0; i < nlistmax - 2; i++) + { + npme = maxPMEnodes - i; + npp = nnodes-npme; + switch (eNPME) + { + case eNpmeAll: + min_factor = 1; + break; + case eNpmeReduced: + min_factor = 2; + break; + case eNpmeSubset: + /* For 2d PME we want a common largest factor of at least the cube + * root of the number of PP nodes */ + min_factor = (int) pow(npp, 1.0/3.0); + break; + default: + gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list"); + break; + } + if (largest_common_factor(npp, npme) >= min_factor) + { + (*nPMEnodes)[nlist] = npme; + nlist++; + } + } + /* We always test 0 PME nodes and the automatic number */ + *nentries = nlist + 2; + (*nPMEnodes)[nlist ] = 0; + (*nPMEnodes)[nlist+1] = -1; + + fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries); + for (i = 0; i < *nentries-1; i++) + { + fprintf(stderr, "%d, ", (*nPMEnodes)[i]); + } + fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries-1]); +} + + +/* Allocate memory to store the performance data */ +static void init_perfdata(t_perf *perfdata[], int ntprs, int datasets, int repeats) +{ + int i, j, k; + + + for (k = 0; k < ntprs; k++) + { + snew(perfdata[k], datasets); + for (i = 0; i < datasets; i++) + { + for (j = 0; j < repeats; j++) + { + snew(perfdata[k][i].Gcycles, repeats); + snew(perfdata[k][i].ns_per_day, repeats); + snew(perfdata[k][i].PME_f_load, repeats); + } + } + } +} + + +/* Check for errors on mdrun -h */ +static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp) +{ + char *command, *msg; + int ret; + + + snew(command, length + 15); + snew(msg, length + 500); + + fprintf(stdout, "Making sure the benchmarks can be executed ...\n"); + /* FIXME: mdrun -h no longer actually does anything useful. + * It unconditionally prints the help, ignoring all other options. */ + sprintf(command, "%s-h -quiet", mdrun_cmd_line); + ret = gmx_system_call(command); + + if (0 != ret) + { + /* To prevent confusion, do not again issue a gmx_fatal here since we already + * get the error message from mdrun itself */ + sprintf(msg, "Cannot run the benchmark simulations! Please check the error message of\n" + "mdrun for the source of the problem. Did you provide a command line\n" + "argument that neither g_tune_pme nor mdrun understands? 
Offending command:\n" + "\n%s\n\n", command); + + fprintf(stderr, "%s", msg); + sep_line(fp); + fprintf(fp, "%s", msg); + + exit(ret); + } + + sfree(command); + sfree(msg ); +} + + +static void do_the_tests( + FILE *fp, /* General g_tune_pme output file */ + char **tpr_names, /* Filenames of the input files to test */ + int maxPMEnodes, /* Max fraction of nodes to use for PME */ + int minPMEnodes, /* Min fraction of nodes to use for PME */ + int npme_fixed, /* If >= -1, test fixed number of PME + * nodes only */ + const char *npmevalues_opt, /* Which -npme values should be tested */ + t_perf **perfdata, /* Here the performace data is stored */ + int *pmeentries, /* Entries in the nPMEnodes list */ + int repeats, /* Repeat each test this often */ + int nnodes, /* Total number of nodes = nPP + nPME */ + int nr_tprs, /* Total number of tpr files to test */ + gmx_bool bThreads, /* Threads or MPI? */ + char *cmd_mpirun, /* mpirun command string */ + char *cmd_np, /* "-np", "-n", whatever mpirun needs */ + char *cmd_mdrun, /* mdrun command string */ + char *cmd_args_bench, /* arguments for mdrun in a string */ + const t_filenm *fnm, /* List of filenames from command line */ + int nfile, /* Number of files specified on the cmdl. */ + int presteps, /* DLB equilibration steps, is checked */ + gmx_large_int_t cpt_steps) /* Time step counter in the checkpoint */ +{ + int i, nr, k, ret, count = 0, totaltests; + int *nPMEnodes = NULL; + t_perf *pd = NULL; + int cmdline_length; + char *command, *cmd_stub; + char buf[STRLEN]; + gmx_bool bResetProblem = FALSE; + gmx_bool bFirst = TRUE; + + + /* This string array corresponds to the eParselog enum type at the start + * of this file */ + const char* ParseLog[] = { + "OK.", + "Logfile not found!", + "No timings, logfile truncated?", + "Run was terminated.", + "Counters were not reset properly.", + "No DD grid found for these settings.", + "TPX version conflict!", + "mdrun was not started in parallel!", ++ "Number of PP nodes has a prime factor that is too large.", + "An error occured." + }; + char str_PME_f_load[13]; + + + /* Allocate space for the mdrun command line. 
100 extra characters should + be more than enough for the -npme etcetera arguments */ + cmdline_length = strlen(cmd_mpirun) + + strlen(cmd_np) + + strlen(cmd_mdrun) + + strlen(cmd_args_bench) + + strlen(tpr_names[0]) + 100; + snew(command, cmdline_length); + snew(cmd_stub, cmdline_length); + + /* Construct the part of the command line that stays the same for all tests: */ + if (bThreads) + { + sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np); + } + else + { + sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun); + } + + /* Create a list of numbers of PME nodes to test */ + if (npme_fixed < -1) + { + make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes, + nnodes, minPMEnodes, maxPMEnodes); + } + else + { + *pmeentries = 1; + snew(nPMEnodes, 1); + nPMEnodes[0] = npme_fixed; + fprintf(stderr, "Will use a fixed number of %d PME-only nodes.\n", nPMEnodes[0]); + } + + if (0 == repeats) + { + fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n"); + ffclose(fp); + finalize(opt2fn("-p", nfile, fnm)); + exit(0); + } + + /* Allocate one dataset for each tpr input file: */ + init_perfdata(perfdata, nr_tprs, *pmeentries, repeats); + + /*****************************************/ + /* Main loop over all tpr files to test: */ + /*****************************************/ + totaltests = nr_tprs*(*pmeentries)*repeats; + for (k = 0; k < nr_tprs; k++) + { + fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]); + fprintf(fp, "PME nodes Gcycles ns/day PME/f Remark\n"); + /* Loop over various numbers of PME nodes: */ + for (i = 0; i < *pmeentries; i++) + { + pd = &perfdata[k][i]; + + /* Loop over the repeats for each scenario: */ + for (nr = 0; nr < repeats; nr++) + { + pd->nPMEnodes = nPMEnodes[i]; + + /* Add -npme and -s to the command line and save it. 
Note that + * the -passall (if set) options requires cmd_args_bench to be + * at the end of the command line string */ + snew(pd->mdrun_cmd_line, cmdline_length); + sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s", + cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench); + + /* To prevent that all benchmarks fail due to a show-stopper argument + * on the mdrun command line, we make a quick check with mdrun -h first */ + if (bFirst) + { + make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp); + } + bFirst = FALSE; + + /* Do a benchmark simulation: */ + if (repeats > 1) + { + sprintf(buf, ", pass %d/%d", nr+1, repeats); + } + else + { + buf[0] = '\0'; + } + fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n", + (100.0*count)/totaltests, + k+1, nr_tprs, i+1, *pmeentries, buf); + make_backup(opt2fn("-err", nfile, fnm)); + sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err", nfile, fnm)); + fprintf(stdout, "%s\n", pd->mdrun_cmd_line); + gmx_system_call(command); + + /* Collect the performance data from the log file; also check stderr + * for fatal errors */ + ret = parse_logfile(opt2fn("-bg", nfile, fnm), opt2fn("-err", nfile, fnm), + pd, nr, presteps, cpt_steps, nnodes); + if ((presteps > 0) && (ret == eParselogResetProblem)) + { + bResetProblem = TRUE; + } + + if (-1 == pd->nPMEnodes) + { + sprintf(buf, "(%3d)", pd->guessPME); + } + else + { + sprintf(buf, " "); + } + + /* Nicer output */ + if (pd->PME_f_load[nr] > 0.0) + { + sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]); + } + else + { + sprintf(str_PME_f_load, "%s", " - "); + } + + /* Write the data we got to disk */ + fprintf(fp, "%4d%s %12.3f %12.3f %s %s", pd->nPMEnodes, + buf, pd->Gcycles[nr], pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]); + if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound) ) + { + fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log"); + } + fprintf(fp, "\n"); + fflush(fp); + count++; + + /* Do some cleaning up and delete the files we do not need any more */ + cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal); + + /* If the first run with this number of processors already failed, do not try again: */ + if (pd->Gcycles[0] <= 0.0 && repeats > 1) + { + fprintf(stdout, "Skipping remaining passes of unsuccessful setting, see log file for details.\n"); + count += repeats-(nr+1); + break; + } + } /* end of repeats loop */ + } /* end of -npme loop */ + } /* end of tpr file loop */ + + if (bResetProblem) + { + sep_line(fp); + fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. 
"); + sep_line(fp); + } + sfree(command); + sfree(cmd_stub); +} + + +static void check_input( + int nnodes, + int repeats, + int *ntprs, + real *rmin, + real rcoulomb, + real *rmax, + real maxPMEfraction, + real minPMEfraction, + int npme_fixed, + gmx_large_int_t bench_nsteps, + const t_filenm *fnm, + int nfile, + int sim_part, + int presteps, + int npargs, + t_pargs *pa) +{ + int old; + + + /* Make sure the input file exists */ + if (!gmx_fexist(opt2fn("-s", nfile, fnm))) + { + gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm)); + } + + /* Make sure that the checkpoint file is not overwritten during benchmarking */ + if ( (0 == strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm)) ) && (sim_part > 1) ) + { + gmx_fatal(FARGS, "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not be identical.\n" + "The checkpoint input file must not be overwritten during the benchmarks.\n"); + } + + /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */ + if (repeats < 0) + { + gmx_fatal(FARGS, "Number of repeats < 0!"); + } + + /* Check number of nodes */ + if (nnodes < 1) + { + gmx_fatal(FARGS, "Number of nodes/threads must be a positive integer."); + } + + /* Automatically choose -ntpr if not set */ + if (*ntprs < 1) + { + if (nnodes < 16) + { + *ntprs = 1; + } + else + { + *ntprs = 3; + /* Set a reasonable scaling factor for rcoulomb */ + if (*rmax <= 0) + { + *rmax = rcoulomb * 1.2; + } + } + fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s"); + } + else + { + if (1 == *ntprs) + { + fprintf(stderr, "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n"); + } + } + + /* Make shure that rmin <= rcoulomb <= rmax */ + if (*rmin <= 0) + { + *rmin = rcoulomb; + } + if (*rmax <= 0) + { + *rmax = rcoulomb; + } + if (!(*rmin <= *rmax) ) + { + gmx_fatal(FARGS, "Please choose the Coulomb radii such that rmin <= rmax.\n" + "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n", *rmin, *rmax, rcoulomb); + } + /* Add test scenarios if rmin or rmax were set */ + if (*ntprs <= 2) + { + if (!is_equal(*rmin, rcoulomb) && (*ntprs == 1) ) + { + (*ntprs)++; + fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n", + *rmin, *ntprs); + } + if (!is_equal(*rmax, rcoulomb) && (*ntprs == 1) ) + { + (*ntprs)++; + fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n", + *rmax, *ntprs); + } + } + old = *ntprs; + /* If one of rmin, rmax is set, we need 2 tpr files at minimum */ + if (!is_equal(*rmax, rcoulomb) || !is_equal(*rmin, rcoulomb) ) + { + *ntprs = max(*ntprs, 2); + } + + /* If both rmin, rmax are set, we need 3 tpr files at minimum */ + if (!is_equal(*rmax, rcoulomb) && !is_equal(*rmin, rcoulomb) ) + { + *ntprs = max(*ntprs, 3); + } + + if (old != *ntprs) + { + fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs); + } + + if (*ntprs > 1) + { + if (is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax)) /* We have just a single rc */ + { + fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n" + "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n" + "with correspondingly adjusted PME grid settings\n"); + *ntprs = 1; + } + } + + /* Check whether max and min fraction are within required values */ + if (maxPMEfraction > 0.5 || maxPMEfraction < 0) + { + gmx_fatal(FARGS, "-max must be between 0 and 0.5"); + } + if (minPMEfraction > 0.5 || minPMEfraction < 0) + { + gmx_fatal(FARGS, "-min must be 
between 0 and 0.5"); + } + if (maxPMEfraction < minPMEfraction) + { + gmx_fatal(FARGS, "-max must be larger or equal to -min"); + } + + /* Check whether the number of steps - if it was set - has a reasonable value */ + if (bench_nsteps < 0) + { + gmx_fatal(FARGS, "Number of steps must be positive."); + } + + if (bench_nsteps > 10000 || bench_nsteps < 100) + { + fprintf(stderr, "WARNING: steps="); + fprintf(stderr, gmx_large_int_pfmt, bench_nsteps); + fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n", (bench_nsteps < 100) ? "few" : "many"); + } + + if (presteps < 0) + { + gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n"); + } + + /* Check for rcoulomb scaling if more than one .tpr file is tested */ + if (*ntprs > 1) + { + if (*rmin/rcoulomb < 0.75 || *rmax/rcoulomb > 1.25) + { + fprintf(stderr, "WARNING: Applying extreme scaling factor. I hope you know what you are doing.\n"); + } + } + + /* If a fixed number of PME nodes is set we do rcoulomb and PME gird tuning + * only. We need to check whether the requested number of PME-only nodes + * makes sense. */ + if (npme_fixed > -1) + { + /* No more than 50% of all nodes can be assigned as PME-only nodes. */ + if (2*npme_fixed > nnodes) + { + gmx_fatal(FARGS, "Cannot have more than %d PME-only nodes for a total of %d nodes (you chose %d).\n", + nnodes/2, nnodes, npme_fixed); + } + if ((npme_fixed > 0) && (5*npme_fixed < nnodes)) + { + fprintf(stderr, "WARNING: Only %g percent of the nodes are assigned as PME-only nodes.\n", + 100.0*((real)npme_fixed / (real)nnodes)); + } + if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa)) + { + fprintf(stderr, "NOTE: The -min, -max, and -npme options have no effect when a\n" + " fixed number of PME-only nodes is requested with -fix.\n"); + } + } +} + + +/* Returns TRUE when "opt" is needed at launch time */ +static gmx_bool is_launch_file(char *opt, gmx_bool bSet) +{ + /* Apart from the input .tpr and the output log files we need all options that + * were set on the command line and that do not start with -b */ + if (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2) + || 0 == strncmp(opt, "-err", 4) || 0 == strncmp(opt, "-p", 2) ) + { + return FALSE; + } + + return bSet; +} + + +/* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */ +static gmx_bool is_bench_file(char *opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput) +{ + /* Apart from the input .tpr, all files starting with "-b" are for + * _b_enchmark files exclusively */ + if (0 == strncmp(opt, "-s", 2)) + { + return FALSE; + } + + if (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2)) + { + if (!bOptional || bSet) + { + return TRUE; + } + else + { + return FALSE; + } + } + else + { + if (bIsOutput) + { + return FALSE; + } + else + { + if (bSet) /* These are additional input files like -cpi -ei */ + { + return TRUE; + } + else + { + return FALSE; + } + } + } +} + + +/* Adds 'buf' to 'str' */ +static void add_to_string(char **str, char *buf) +{ + int len; + + + len = strlen(*str) + strlen(buf) + 1; + srenew(*str, len); + strcat(*str, buf); +} + + +/* Create the command line for the benchmark as well as for the real run */ +static void create_command_line_snippets( + gmx_bool bAppendFiles, + gmx_bool bKeepAndNumCPT, + gmx_bool bResetHWay, + int presteps, + int nfile, + t_filenm fnm[], + char *cmd_args_bench[], /* command line arguments for benchmark runs */ + char *cmd_args_launch[], /* command line arguments for simulation run */ 
+ char extra_args[]) /* Add this to the end of the command line */ +{ + int i; + char *opt; + const char *name; + char strbuf[STRLEN]; + + + /* strlen needs at least '\0' as a string: */ + snew(*cmd_args_bench, 1); + snew(*cmd_args_launch, 1); + *cmd_args_launch[0] = '\0'; + *cmd_args_bench[0] = '\0'; + + + /*******************************************/ + /* 1. Process other command line arguments */ + /*******************************************/ + if (presteps > 0) + { + /* Add equilibration steps to benchmark options */ + sprintf(strbuf, "-resetstep %d ", presteps); + add_to_string(cmd_args_bench, strbuf); + } + /* These switches take effect only at launch time */ + if (FALSE == bAppendFiles) + { + add_to_string(cmd_args_launch, "-noappend "); + } + if (bKeepAndNumCPT) + { + add_to_string(cmd_args_launch, "-cpnum "); + } + if (bResetHWay) + { + add_to_string(cmd_args_launch, "-resethway "); + } + + /********************/ + /* 2. Process files */ + /********************/ + for (i = 0; i < nfile; i++) + { + opt = (char *)fnm[i].opt; + name = opt2fn(opt, nfile, fnm); + + /* Strbuf contains the options, now let's sort out where we need that */ + sprintf(strbuf, "%s %s ", opt, name); + + if (is_bench_file(opt, opt2bSet(opt, nfile, fnm), is_optional(&fnm[i]), is_output(&fnm[i])) ) + { + /* All options starting with -b* need the 'b' removed, + * therefore overwrite strbuf */ + if (0 == strncmp(opt, "-b", 2)) + { + sprintf(strbuf, "-%s %s ", &opt[2], name); + } + + add_to_string(cmd_args_bench, strbuf); + } + + if (is_launch_file(opt, opt2bSet(opt, nfile, fnm)) ) + { + add_to_string(cmd_args_launch, strbuf); + } + } + + add_to_string(cmd_args_bench, extra_args); + add_to_string(cmd_args_launch, extra_args); +} + + +/* Set option opt */ +static void setopt(const char *opt, int nfile, t_filenm fnm[]) +{ + int i; + + for (i = 0; (i < nfile); i++) + { + if (strcmp(opt, fnm[i].opt) == 0) + { + fnm[i].flag |= ffSET; + } + } +} + + +/* This routine inspects the tpr file and ... + * 1. checks for output files that get triggered by a tpr option. These output + * files are marked as 'set' to allow for a proper cleanup after each + * tuning run. + * 2. returns the PME:PP load ratio + * 3. returns rcoulomb from the tpr */ +static float inspect_tpr(int nfile, t_filenm fnm[], real *rcoulomb) +{ + gmx_bool bPull; /* Is pulling requested in .tpr file? */ + gmx_bool bTpi; /* Is test particle insertion requested? */ + gmx_bool bFree; /* Is a free energy simulation requested? */ + gmx_bool bNM; /* Is a normal mode analysis requested? 
*/ + t_inputrec ir; + t_state state; + gmx_mtop_t mtop; + + + /* Check tpr file for options that trigger extra output files */ + read_tpx_state(opt2fn("-s", nfile, fnm), &ir, &state, NULL, &mtop); + bPull = (epullNO != ir.ePull); + bFree = (efepNO != ir.efep ); + bNM = (eiNM == ir.eI ); + bTpi = EI_TPI(ir.eI); + + /* Set these output files on the tuning command-line */ + if (bPull) + { + setopt("-pf", nfile, fnm); + setopt("-px", nfile, fnm); + } + if (bFree) + { + setopt("-dhdl", nfile, fnm); + } + if (bTpi) + { + setopt("-tpi", nfile, fnm); + setopt("-tpid", nfile, fnm); + } + if (bNM) + { + setopt("-mtx", nfile, fnm); + } + + *rcoulomb = ir.rcoulomb; + + /* Return the estimate for the number of PME nodes */ + return pme_load_estimate(&mtop, &ir, state.box); +} + + +static void couple_files_options(int nfile, t_filenm fnm[]) +{ + int i; + gmx_bool bSet, bBench; + char *opt; + char buf[20]; + + + for (i = 0; i < nfile; i++) + { + opt = (char *)fnm[i].opt; + bSet = ((fnm[i].flag & ffSET) != 0); + bBench = (0 == strncmp(opt, "-b", 2)); + + /* Check optional files */ + /* If e.g. -eo is set, then -beo also needs to be set */ + if (is_optional(&fnm[i]) && bSet && !bBench) + { + sprintf(buf, "-b%s", &opt[1]); + setopt(buf, nfile, fnm); + } + /* If -beo is set, then -eo also needs to be! */ + if (is_optional(&fnm[i]) && bSet && bBench) + { + sprintf(buf, "-%s", &opt[2]); + setopt(buf, nfile, fnm); + } + } +} + + +static double gettime() +{ +#ifdef HAVE_GETTIMEOFDAY + struct timeval t; + double seconds; + + gettimeofday(&t, NULL); + + seconds = (double) t.tv_sec + 1e-6*(double)t.tv_usec; + + return seconds; +#else + double seconds; + + seconds = time(NULL); + + return seconds; +#endif +} + + +#define BENCHSTEPS (1000) + +int gmx_tune_pme(int argc, char *argv[]) +{ + const char *desc[] = { + "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of processors/threads, this program systematically", + "times [TT]mdrun[tt] with various numbers of PME-only nodes and determines", + "which setting is fastest. It will also test whether performance can", + "be enhanced by shifting load from the reciprocal to the real space", + "part of the Ewald sum. ", + "Simply pass your [TT].tpr[tt] file to [TT]g_tune_pme[tt] together with other options", + "for [TT]mdrun[tt] as needed.[PAR]", + "Which executables are used can be set in the environment variables", + "MPIRUN and MDRUN. If these are not present, 'mpirun' and 'mdrun'", + "will be used as defaults. Note that for certain MPI frameworks you", + "need to provide a machine- or hostfile. This can also be passed", + "via the MPIRUN variable, e.g.[PAR]", + "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]", + "Please call [TT]g_tune_pme[tt] with the normal options you would pass to", + "[TT]mdrun[tt] and add [TT]-np[tt] for the number of processors to perform the", + "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]", + "to repeat each test several times to get better statistics. [PAR]", + "[TT]g_tune_pme[tt] can test various real space / reciprocal space workloads", + "for you. With [TT]-ntpr[tt] you control how many extra [TT].tpr[tt] files will be", + "written with enlarged cutoffs and smaller Fourier grids respectively.", + "Typically, the first test (number 0) will be with the settings from the input", + "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff", + "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. 
", + "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ", + "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier " + "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1", + "if you just seek the optimal number of PME-only nodes; in that case", + "your input [TT].tpr[tt] file will remain unchanged.[PAR]", + "For the benchmark runs, the default of 1000 time steps should suffice for most", + "MD systems. The dynamic load balancing needs about 100 time steps", + "to adapt to local load imbalances, therefore the time step counters", + "are by default reset after 100 steps. For large systems (>1M atoms), as well as ", + "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.", + "From the 'DD' load imbalance entries in the md.log output file you", + "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]" + "[TT]g_tune_pme -np 64 -s protein.tpr -launch[tt][PAR]", + "After calling [TT]mdrun[tt] several times, detailed performance information", + "is available in the output file [TT]perf.out.[tt] ", + "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written", + "(options [TT]-b[tt]*), these will be automatically deleted after each test.[PAR]", + "If you want the simulation to be started automatically with the", + "optimized parameters, use the command line option [TT]-launch[tt].[PAR]", + }; + + int nnodes = 1; + int repeats = 2; + int pmeentries = 0; /* How many values for -npme do we actually test for each tpr file */ + real maxPMEfraction = 0.50; + real minPMEfraction = 0.25; + int maxPMEnodes, minPMEnodes; + float guessPMEratio; /* guessed PME:PP ratio based on the tpr file */ + float guessPMEnodes; + int npme_fixed = -2; /* If >= -1, use only this number + * of PME-only nodes */ + int ntprs = 0; + real rmin = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */ + real rcoulomb = -1.0; /* Coulomb radius as set in .tpr file */ + gmx_bool bScaleRvdw = TRUE; + gmx_large_int_t bench_nsteps = BENCHSTEPS; + gmx_large_int_t new_sim_nsteps = -1; /* -1 indicates: not set by the user */ + gmx_large_int_t cpt_steps = 0; /* Step counter in .cpt input file */ + int presteps = 100; /* Do a full cycle reset after presteps steps */ + gmx_bool bOverwrite = FALSE, bKeepTPR; + gmx_bool bLaunch = FALSE; + char *ExtraArgs = NULL; + char **tpr_names = NULL; + const char *simulation_tpr = NULL; + int best_npme, best_tpr; + int sim_part = 1; /* For benchmarks with checkpoint files */ + char bbuf[STRLEN]; + + /* Default program names if nothing else is found */ + char *cmd_mpirun = NULL, *cmd_mdrun = NULL; + char *cmd_args_bench, *cmd_args_launch; + char *cmd_np = NULL; + + t_perf **perfdata = NULL; + t_inputinfo *info; + int i; + FILE *fp; + t_commrec *cr; + + /* Print out how long the tuning took */ + double seconds; + + static t_filenm fnm[] = { + /* g_tune_pme */ + { efOUT, "-p", "perf", ffWRITE }, + { efLOG, "-err", "bencherr", ffWRITE }, + { efTPX, "-so", "tuned", ffWRITE }, + /* mdrun: */ + { efTPX, NULL, NULL, ffREAD }, + { efTRN, "-o", NULL, ffWRITE }, + { efXTC, "-x", NULL, ffOPTWR }, + { efCPT, "-cpi", NULL, ffOPTRD }, + { efCPT, "-cpo", NULL, ffOPTWR }, + { efSTO, "-c", "confout", ffWRITE }, + { efEDR, "-e", "ener", ffWRITE }, + { efLOG, "-g", "md", ffWRITE }, + { efXVG, "-dhdl", "dhdl", ffOPTWR }, + { efXVG, "-field", "field", ffOPTWR }, + { efXVG, "-table", "table", ffOPTRD }, + { efXVG, 
"-tabletf", "tabletf", ffOPTRD }, + { efXVG, "-tablep", "tablep", ffOPTRD }, + { efXVG, "-tableb", "table", ffOPTRD }, + { efTRX, "-rerun", "rerun", ffOPTRD }, + { efXVG, "-tpi", "tpi", ffOPTWR }, + { efXVG, "-tpid", "tpidist", ffOPTWR }, + { efEDI, "-ei", "sam", ffOPTRD }, + { efXVG, "-eo", "edsam", ffOPTWR }, + { efGCT, "-j", "wham", ffOPTRD }, + { efGCT, "-jo", "bam", ffOPTWR }, + { efXVG, "-ffout", "gct", ffOPTWR }, + { efXVG, "-devout", "deviatie", ffOPTWR }, + { efXVG, "-runav", "runaver", ffOPTWR }, + { efXVG, "-px", "pullx", ffOPTWR }, + { efXVG, "-pf", "pullf", ffOPTWR }, + { efXVG, "-ro", "rotation", ffOPTWR }, + { efLOG, "-ra", "rotangles", ffOPTWR }, + { efLOG, "-rs", "rotslabs", ffOPTWR }, + { efLOG, "-rt", "rottorque", ffOPTWR }, + { efMTX, "-mtx", "nm", ffOPTWR }, + { efNDX, "-dn", "dipole", ffOPTWR }, + /* Output files that are deleted after each benchmark run */ + { efTRN, "-bo", "bench", ffWRITE }, + { efXTC, "-bx", "bench", ffWRITE }, + { efCPT, "-bcpo", "bench", ffWRITE }, + { efSTO, "-bc", "bench", ffWRITE }, + { efEDR, "-be", "bench", ffWRITE }, + { efLOG, "-bg", "bench", ffWRITE }, + { efXVG, "-beo", "benchedo", ffOPTWR }, + { efXVG, "-bdhdl", "benchdhdl", ffOPTWR }, + { efXVG, "-bfield", "benchfld", ffOPTWR }, + { efXVG, "-btpi", "benchtpi", ffOPTWR }, + { efXVG, "-btpid", "benchtpid", ffOPTWR }, + { efGCT, "-bjo", "bench", ffOPTWR }, + { efXVG, "-bffout", "benchgct", ffOPTWR }, + { efXVG, "-bdevout", "benchdev", ffOPTWR }, + { efXVG, "-brunav", "benchrnav", ffOPTWR }, + { efXVG, "-bpx", "benchpx", ffOPTWR }, + { efXVG, "-bpf", "benchpf", ffOPTWR }, + { efXVG, "-bro", "benchrot", ffOPTWR }, + { efLOG, "-bra", "benchrota", ffOPTWR }, + { efLOG, "-brs", "benchrots", ffOPTWR }, + { efLOG, "-brt", "benchrott", ffOPTWR }, + { efMTX, "-bmtx", "benchn", ffOPTWR }, + { efNDX, "-bdn", "bench", ffOPTWR } + }; + + gmx_bool bThreads = FALSE; + + int nthreads = 1; + + const char *procstring[] = + { NULL, "-np", "-n", "none", NULL }; + const char *npmevalues_opt[] = + { NULL, "auto", "all", "subset", NULL }; + + gmx_bool bAppendFiles = TRUE; + gmx_bool bKeepAndNumCPT = FALSE; + gmx_bool bResetCountersHalfWay = FALSE; + gmx_bool bBenchmark = TRUE; + + output_env_t oenv = NULL; + + t_pargs pa[] = { + /***********************/ + /* g_tune_pme options: */ + /***********************/ + { "-np", FALSE, etINT, {&nnodes}, + "Number of nodes to run the tests on (must be > 2 for separate PME nodes)" }, + { "-npstring", FALSE, etENUM, {procstring}, + "Specify the number of processors to [TT]$MPIRUN[tt] using this string"}, + { "-ntmpi", FALSE, etINT, {&nthreads}, + "Number of MPI-threads to run the tests on (turns MPI & mpirun off)"}, + { "-r", FALSE, etINT, {&repeats}, + "Repeat each test this often" }, + { "-max", FALSE, etREAL, {&maxPMEfraction}, + "Max fraction of PME nodes to test with" }, + { "-min", FALSE, etREAL, {&minPMEfraction}, + "Min fraction of PME nodes to test with" }, + { "-npme", FALSE, etENUM, {npmevalues_opt}, + "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a reasonable subset. 
" + "Auto neglects -min and -max and chooses reasonable values around a guess for npme derived from the .tpr"}, + { "-fix", FALSE, etINT, {&npme_fixed}, + "If >= -1, do not vary the number of PME-only nodes, instead use this fixed value and only vary rcoulomb and the PME grid spacing."}, + { "-rmax", FALSE, etREAL, {&rmax}, + "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid downscaling)" }, + { "-rmin", FALSE, etREAL, {&rmin}, + "If >0, minimal rcoulomb for -ntpr>1" }, + { "-scalevdw", FALSE, etBOOL, {&bScaleRvdw}, + "Scale rvdw along with rcoulomb"}, + { "-ntpr", FALSE, etINT, {&ntprs}, + "Number of [TT].tpr[tt] files to benchmark. Create this many files with different rcoulomb scaling factors depending on -rmin and -rmax. " + "If < 1, automatically choose the number of [TT].tpr[tt] files to test" }, + { "-steps", FALSE, etGMX_LARGE_INT, {&bench_nsteps}, + "Take timings for this many steps in the benchmark runs" }, + { "-resetstep", FALSE, etINT, {&presteps}, + "Let dlb equilibrate this many steps before timings are taken (reset cycle counters after this many steps)" }, + { "-simsteps", FALSE, etGMX_LARGE_INT, {&new_sim_nsteps}, + "If non-negative, perform this many steps in the real run (overwrites nsteps from [TT].tpr[tt], add [TT].cpt[tt] steps)" }, + { "-launch", FALSE, etBOOL, {&bLaunch}, + "Launch the real simulation after optimization" }, + { "-bench", FALSE, etBOOL, {&bBenchmark}, + "Run the benchmarks or just create the input [TT].tpr[tt] files?" }, + /******************/ + /* mdrun options: */ + /******************/ + /* We let g_tune_pme parse and understand these options, because we need to + * prevent that they appear on the mdrun command line for the benchmarks */ + { "-append", FALSE, etBOOL, {&bAppendFiles}, + "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" }, + { "-cpnum", FALSE, etBOOL, {&bKeepAndNumCPT}, + "Keep and number checkpoint files (launch only)" }, + { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay}, + "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] (launch only)" } + }; + +#define NFILE asize(fnm) + + seconds = gettime(); + + if (!parse_common_args(&argc, argv, PCA_NOEXIT_ON_ARGS, + NFILE, fnm, asize(pa), pa, asize(desc), desc, + 0, NULL, &oenv)) + { + return 0; + } + + /* Store the remaining unparsed command line entries in a string which + * is then attached to the mdrun command line */ + snew(ExtraArgs, 1); + ExtraArgs[0] = '\0'; + for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */ + { + add_to_string(&ExtraArgs, argv[i]); + add_to_string(&ExtraArgs, " "); + } + + if (opt2parg_bSet("-ntmpi", asize(pa), pa)) + { + bThreads = TRUE; + if (opt2parg_bSet("-npstring", asize(pa), pa)) + { + fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n"); + } + + if (nnodes > 1) + { + gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!"); + } + /* and now we just set this; a bit of an ugly hack*/ + nnodes = nthreads; + } + /* Check for PME:PP ratio and whether tpr triggers additional output files */ + guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb); + + /* Automatically set -beo options if -eo is set etc. 
*/ + couple_files_options(NFILE, fnm); + + /* Construct the command line arguments for benchmark runs + * as well as for the simulation run */ + if (bThreads) + { + sprintf(bbuf, " -ntmpi %d ", nthreads); + } + else + { + /* This string will be used for MPI runs and will appear after the + * mpirun command. */ + if (strcmp(procstring[0], "none") != 0) + { + sprintf(bbuf, " %s %d ", procstring[0], nnodes); + } + else + { + sprintf(bbuf, " "); + } + } + + cmd_np = bbuf; + + create_command_line_snippets(bAppendFiles, bKeepAndNumCPT, bResetCountersHalfWay, presteps, + NFILE, fnm, &cmd_args_bench, &cmd_args_launch, ExtraArgs); + + /* Read in checkpoint file if requested */ + sim_part = 1; + if (opt2bSet("-cpi", NFILE, fnm)) + { + snew(cr, 1); + cr->duty = DUTY_PP; /* makes the following routine happy */ + read_checkpoint_simulation_part(opt2fn("-cpi", NFILE, fnm), + &sim_part, &cpt_steps, cr, + FALSE, NFILE, fnm, NULL, NULL); + sfree(cr); + sim_part++; + /* sim_part will now be 1 if no checkpoint file was found */ + if (sim_part <= 1) + { + gmx_fatal(FARGS, "Checkpoint file %s not found!", opt2fn("-cpi", NFILE, fnm)); + } + } + + /* Open performance output file and write header info */ + fp = ffopen(opt2fn("-p", NFILE, fnm), "w"); + + /* Make a quick consistency check of command line parameters */ + check_input(nnodes, repeats, &ntprs, &rmin, rcoulomb, &rmax, + maxPMEfraction, minPMEfraction, npme_fixed, + bench_nsteps, fnm, NFILE, sim_part, presteps, + asize(pa), pa); + + /* Determine the maximum and minimum number of PME nodes to test, + * the actual list of settings is build in do_the_tests(). */ + if ((nnodes > 2) && (npme_fixed < -1)) + { + if (0 == strcmp(npmevalues_opt[0], "auto")) + { + /* Determine the npme range automatically based on the PME:PP load guess */ + if (guessPMEratio > 1.0) + { + /* More PME than PP work, probably we do not need separate PME nodes at all! 
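+ * Worked example with hypothetical numbers, only to illustrate the range
+ * arithmetic: with -np 64 and a guessed PME:PP ratio of 0.25, the
+ * else-branch below gives guessPMEnodes = 64/(1 + 1/0.25) = 12.8, hence
+ * minPMEnodes = floor(0.7*12.8) = 8 and
+ * maxPMEnodes = min(ceil(1.6*12.8), 64/2) = 21. For a ratio > 1 both
+ * bounds are simply set to nnodes/2, as done here.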
*/ + maxPMEnodes = nnodes/2; + minPMEnodes = nnodes/2; + } + else + { + /* PME : PP load is in the range 0..1, let's test around the guess */ + guessPMEnodes = nnodes/(1.0 + 1.0/guessPMEratio); + minPMEnodes = floor(0.7*guessPMEnodes); + maxPMEnodes = ceil(1.6*guessPMEnodes); + maxPMEnodes = min(maxPMEnodes, nnodes/2); + } + } + else + { + /* Determine the npme range based on user input */ + maxPMEnodes = floor(maxPMEfraction*nnodes); + minPMEnodes = max(floor(minPMEfraction*nnodes), 0); + fprintf(stdout, "Will try runs with %d ", minPMEnodes); + if (maxPMEnodes != minPMEnodes) + { + fprintf(stdout, "- %d ", maxPMEnodes); + } + fprintf(stdout, "PME-only nodes.\n Note that the automatic number of PME-only nodes and no separate PME nodes are always tested.\n"); + } + } + else + { + maxPMEnodes = 0; + minPMEnodes = 0; + } + + /* Get the commands we need to set up the runs from environment variables */ + get_program_paths(bThreads, &cmd_mpirun, cmd_np, &cmd_mdrun, repeats); + if (bBenchmark && repeats > 0) + { + check_mdrun_works(bThreads, cmd_mpirun, cmd_np, cmd_mdrun); + } + + /* Print some header info to file */ + sep_line(fp); + fprintf(fp, "\n P E R F O R M A N C E R E S U L T S\n"); + sep_line(fp); + fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion()); + if (!bThreads) + { + fprintf(fp, "Number of nodes : %d\n", nnodes); + fprintf(fp, "The mpirun command is : %s\n", cmd_mpirun); + if (strcmp(procstring[0], "none") != 0) + { + fprintf(fp, "Passing # of nodes via : %s\n", procstring[0]); + } + else + { + fprintf(fp, "Not setting number of nodes in system call\n"); + } + } + else + { + fprintf(fp, "Number of threads : %d\n", nnodes); + } + + fprintf(fp, "The mdrun command is : %s\n", cmd_mdrun); + fprintf(fp, "mdrun args benchmarks : %s\n", cmd_args_bench); + fprintf(fp, "Benchmark steps : "); + fprintf(fp, gmx_large_int_pfmt, bench_nsteps); + fprintf(fp, "\n"); + fprintf(fp, "dlb equilibration steps : %d\n", presteps); + if (sim_part > 1) + { + fprintf(fp, "Checkpoint time step : "); + fprintf(fp, gmx_large_int_pfmt, cpt_steps); + fprintf(fp, "\n"); + } + fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch); + + if (new_sim_nsteps >= 0) + { + bOverwrite = TRUE; + fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm)); + fprintf(stderr, gmx_large_int_pfmt, new_sim_nsteps+cpt_steps); + fprintf(stderr, " steps.\n"); + fprintf(fp, "Simulation steps : "); + fprintf(fp, gmx_large_int_pfmt, new_sim_nsteps); + fprintf(fp, "\n"); + } + if (repeats > 1) + { + fprintf(fp, "Repeats for each test : %d\n", repeats); + } + + if (npme_fixed >= -1) + { + fprintf(fp, "Fixing -npme at : %d\n", npme_fixed); + } + + fprintf(fp, "Input file : %s\n", opt2fn("-s", NFILE, fnm)); + fprintf(fp, " PME/PP load estimate : %g\n", guessPMEratio); + + /* Allocate memory for the inputinfo struct: */ + snew(info, 1); + info->nr_inputfiles = ntprs; + for (i = 0; i < ntprs; i++) + { + snew(info->rcoulomb, ntprs); + snew(info->rvdw, ntprs); + snew(info->rlist, ntprs); + snew(info->rlistlong, ntprs); + snew(info->nkx, ntprs); + snew(info->nky, ntprs); + snew(info->nkz, ntprs); + snew(info->fsx, ntprs); + snew(info->fsy, ntprs); + snew(info->fsz, ntprs); + } + /* Make alternative tpr files to test: */ + snew(tpr_names, ntprs); + for (i = 0; i < ntprs; i++) + { + snew(tpr_names[i], STRLEN); + } + + /* It can be that ntprs is reduced by make_benchmark_tprs if not enough + * different grids could be found. 
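+ * Sketch with hypothetical input values (the exact grid dimensions are
+ * chosen by make_benchmark_tprs and may differ): with rcoulomb = 0.9 nm
+ * in the input .tpr, -rmax 1.08 and -ntpr 3, the benchmark files get
+ * equally spaced cutoffs of 0.90, 0.99 and 1.08 nm and Fourier spacings
+ * scaled by 1.0, 1.1 and 1.2, i.e. progressively coarser PME grids, as
+ * described in the help text above.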
*/ + make_benchmark_tprs(opt2fn("-s", NFILE, fnm), tpr_names, bench_nsteps+presteps, + cpt_steps, rmin, rmax, bScaleRvdw, &ntprs, info, fp); + + /********************************************************************************/ + /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats */ + /********************************************************************************/ + snew(perfdata, ntprs); + if (bBenchmark) + { + do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries, + repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun, + cmd_args_bench, fnm, NFILE, presteps, cpt_steps); + + fprintf(fp, "\nTuning took%8.1f minutes.\n", (gettime()-seconds)/60.0); + + /* Analyse the results and give a suggestion for optimal settings: */ + bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries, + repeats, info, &best_tpr, &best_npme); + + /* Take the best-performing tpr file and enlarge nsteps to original value */ + if (bKeepTPR && !bOverwrite) + { + simulation_tpr = opt2fn("-s", NFILE, fnm); + } + else + { + simulation_tpr = opt2fn("-so", NFILE, fnm); + modify_PMEsettings(bOverwrite ? (new_sim_nsteps+cpt_steps) : info->orig_sim_steps, + info->orig_init_step, tpr_names[best_tpr], simulation_tpr); + } + + /* Let's get rid of the temporary benchmark input files */ + for (i = 0; i < ntprs; i++) + { + fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]); + remove(tpr_names[i]); + } + + /* Now start the real simulation if the user requested it ... */ + launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun, + cmd_args_launch, simulation_tpr, best_npme); + } + ffclose(fp); + + /* ... or simply print the performance results to screen: */ + if (!bLaunch) + { + finalize(opt2fn("-p", NFILE, fnm)); + } + + return 0; +} diff --cc src/gromacs/gmxlib/bondfree.c index f03f8738f3,0000000000..ae984652fa mode 100644,000000..100644 --- a/src/gromacs/gmxlib/bondfree.c +++ b/src/gromacs/gmxlib/bondfree.c @@@ -1,4401 -1,0 +1,4409 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. 
+ * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * GROningen Mixture of Alchemy and Childrens' Stories + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include ++#include +#include "physics.h" +#include "vec.h" +#include "maths.h" +#include "txtdump.h" +#include "bondf.h" +#include "smalloc.h" +#include "pbc.h" +#include "ns.h" +#include "macros.h" +#include "names.h" +#include "gmx_fatal.h" +#include "mshift.h" +#include "main.h" +#include "disre.h" +#include "orires.h" +#include "force.h" +#include "nonbonded.h" + +/* Include the SIMD macro file and then check for support */ +#include "gmx_simd_macros.h" +#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_TRIGONOMETRIC +#define SIMD_BONDEDS +#include "gmx_simd_vec.h" +#endif + +/* Find a better place for this? */ +const int cmap_coeff_matrix[] = { + 1, 0, -3, 2, 0, 0, 0, 0, -3, 0, 9, -6, 2, 0, -6, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, -9, 6, -2, 0, 6, -4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -6, 0, 0, -6, 4, + 0, 0, 3, -2, 0, 0, 0, 0, 0, 0, -9, 6, 0, 0, 6, -4, + 0, 0, 0, 0, 1, 0, -3, 2, -2, 0, 6, -4, 1, 0, -3, 2, + 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 3, -2, 1, 0, -3, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 2, 0, 0, 3, -2, + 0, 0, 0, 0, 0, 0, 3, -2, 0, 0, -6, 4, 0, 0, 3, -2, + 0, 1, -2, 1, 0, 0, 0, 0, 0, -3, 6, -3, 0, 2, -4, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -6, 3, 0, -2, 4, -2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 2, -2, + 0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 3, -3, 0, 0, -2, 2, + 0, 0, 0, 0, 0, 1, -2, 1, 0, -2, 4, -2, 0, 1, -2, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 2, -1, 0, 1, -2, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 1, + 0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 2, -2, 0, 0, -1, 1 +}; + + + +int glatnr(int *global_atom_index, int i) +{ + int atnr; + + if (global_atom_index == NULL) + { + atnr = i + 1; + } + else + { + atnr = global_atom_index[i] + 1; + } + + return atnr; +} + +static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx) +{ + if (pbc) + { + return pbc_dx_aiuc(pbc, xi, xj, dx); + } + else + { + rvec_sub(xi, xj, dx); + return CENTRAL; + } +} + +#ifdef SIMD_BONDEDS + +/* SIMD PBC data structure, containing 1/boxdiag and the box vectors */ +typedef struct { + gmx_mm_pr inv_bzz; + gmx_mm_pr inv_byy; + gmx_mm_pr inv_bxx; + gmx_mm_pr bzx; + gmx_mm_pr bzy; + gmx_mm_pr bzz; + gmx_mm_pr byx; + gmx_mm_pr byy; + gmx_mm_pr bxx; +} pbc_simd_t; + +/* Set the SIMD pbc data from a normal t_pbc struct */ +static void set_pbc_simd(const t_pbc *pbc, pbc_simd_t *pbc_simd) +{ + rvec inv_bdiag; + int d; + + /* Setting inv_bdiag to 0 effectively turns off PBC */ + clear_rvec(inv_bdiag); + if (pbc != NULL) + { + for (d = 0; d < pbc->ndim_ePBC; d++) + { + inv_bdiag[d] = 1.0/pbc->box[d][d]; + } + } + + pbc_simd->inv_bzz = gmx_set1_pr(inv_bdiag[ZZ]); + pbc_simd->inv_byy = gmx_set1_pr(inv_bdiag[YY]); + pbc_simd->inv_bxx = gmx_set1_pr(inv_bdiag[XX]); + + if (pbc != NULL) + { + pbc_simd->bzx = gmx_set1_pr(pbc->box[ZZ][XX]); + pbc_simd->bzy = gmx_set1_pr(pbc->box[ZZ][YY]); + pbc_simd->bzz = gmx_set1_pr(pbc->box[ZZ][ZZ]); + pbc_simd->byx = gmx_set1_pr(pbc->box[YY][XX]); + pbc_simd->byy = gmx_set1_pr(pbc->box[YY][YY]); + pbc_simd->bxx = gmx_set1_pr(pbc->box[XX][XX]); + } + else + { + pbc_simd->bzx = gmx_setzero_pr(); + pbc_simd->bzy = gmx_setzero_pr(); + pbc_simd->bzz = gmx_setzero_pr(); + pbc_simd->byx = gmx_setzero_pr(); + pbc_simd->byy = gmx_setzero_pr(); + pbc_simd->bxx = gmx_setzero_pr(); + } +} + +/* Correct distance vector *dx,*dy,*dz for PBC using SIMD */ +static gmx_inline void 
+pbc_dx_simd(gmx_mm_pr *dx, gmx_mm_pr *dy, gmx_mm_pr *dz, + const pbc_simd_t *pbc) +{ + gmx_mm_pr sh; + + sh = gmx_round_pr(gmx_mul_pr(*dz, pbc->inv_bzz)); + *dx = gmx_nmsub_pr(sh, pbc->bzx, *dx); + *dy = gmx_nmsub_pr(sh, pbc->bzy, *dy); + *dz = gmx_nmsub_pr(sh, pbc->bzz, *dz); + + sh = gmx_round_pr(gmx_mul_pr(*dy, pbc->inv_byy)); + *dx = gmx_nmsub_pr(sh, pbc->byx, *dx); + *dy = gmx_nmsub_pr(sh, pbc->byy, *dy); + + sh = gmx_round_pr(gmx_mul_pr(*dx, pbc->inv_bxx)); + *dx = gmx_nmsub_pr(sh, pbc->bxx, *dx); +} + +#endif /* SIMD_BONDEDS */ + +/* + * Morse potential bond by Frank Everdij + * + * Three parameters needed: + * + * b0 = equilibrium distance in nm + * be = beta in nm^-1 (actually, it's nu_e*Sqrt(2*pi*pi*mu/D_e)) + * cb = well depth in kJ/mol + * + * Note: the potential is referenced to be +cb at infinite separation + * and zero at the equilibrium distance! + */ + +real morse_bonds(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + const real one = 1.0; + const real two = 2.0; + real dr, dr2, temp, omtemp, cbomtemp, fbond, vbond, fij, vtot; + real b0, be, cb, b0A, beA, cbA, b0B, beB, cbB, L1; + rvec dx; + int i, m, ki, type, ai, aj; + ivec dt; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + + b0A = forceparams[type].morse.b0A; + beA = forceparams[type].morse.betaA; + cbA = forceparams[type].morse.cbA; + + b0B = forceparams[type].morse.b0B; + beB = forceparams[type].morse.betaB; + cbB = forceparams[type].morse.cbB; + + L1 = one-lambda; /* 1 */ + b0 = L1*b0A + lambda*b0B; /* 3 */ + be = L1*beA + lambda*beB; /* 3 */ + cb = L1*cbA + lambda*cbB; /* 3 */ + + ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */ + dr2 = iprod(dx, dx); /* 5 */ + dr = dr2*gmx_invsqrt(dr2); /* 10 */ + temp = exp(-be*(dr-b0)); /* 12 */ + + if (temp == one) + { + /* bonds are constrainted. 
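+ * (Here temp = exp(-be*(dr - b0)), so temp == 1 means the bond sits
+ * exactly at its equilibrium length b0; for reference, the potential
+ * evaluated below is V(r) = cb*(1 - exp(-be*(r - b0)))^2, zero at r = b0
+ * and +cb at infinite separation, as stated in the header comment above.)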
This may _not_ include bond constraints if they are lambda dependent */ + *dvdlambda += cbB-cbA; + continue; + } + + omtemp = one-temp; /* 1 */ + cbomtemp = cb*omtemp; /* 1 */ + vbond = cbomtemp*omtemp; /* 1 */ + fbond = -two*be*temp*cbomtemp*gmx_invsqrt(dr2); /* 9 */ + vtot += vbond; /* 1 */ + + *dvdlambda += (cbB - cbA) * omtemp * omtemp - (2-2*omtemp)*omtemp * cb * ((b0B-b0A)*be - (beB-beA)*(dr-b0)); /* 15 */ + + if (g) + { + ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt); + ki = IVEC2IS(dt); + } + + for (m = 0; (m < DIM); m++) /* 15 */ + { + fij = fbond*dx[m]; + f[ai][m] += fij; + f[aj][m] -= fij; + fshift[ki][m] += fij; + fshift[CENTRAL][m] -= fij; + } + } /* 83 TOTAL */ + return vtot; +} + +real cubic_bonds(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real gmx_unused lambda, real gmx_unused *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + const real three = 3.0; + const real two = 2.0; + real kb, b0, kcub; + real dr, dr2, dist, kdist, kdist2, fbond, vbond, fij, vtot; + rvec dx; + int i, m, ki, type, ai, aj; + ivec dt; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + + b0 = forceparams[type].cubic.b0; + kb = forceparams[type].cubic.kb; + kcub = forceparams[type].cubic.kcub; + + ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */ + dr2 = iprod(dx, dx); /* 5 */ + + if (dr2 == 0.0) + { + continue; + } + + dr = dr2*gmx_invsqrt(dr2); /* 10 */ + dist = dr-b0; + kdist = kb*dist; + kdist2 = kdist*dist; + + vbond = kdist2 + kcub*kdist2*dist; + fbond = -(two*kdist + three*kdist2*kcub)/dr; + + vtot += vbond; /* 21 */ + + if (g) + { + ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt); + ki = IVEC2IS(dt); + } + for (m = 0; (m < DIM); m++) /* 15 */ + { + fij = fbond*dx[m]; + f[ai][m] += fij; + f[aj][m] -= fij; + fshift[ki][m] += fij; + fshift[CENTRAL][m] -= fij; + } + } /* 54 TOTAL */ + return vtot; +} + +real FENE_bonds(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real gmx_unused lambda, real gmx_unused *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int *global_atom_index) +{ + const real half = 0.5; + const real one = 1.0; + real bm, kb; + real dr, dr2, bm2, omdr2obm2, fbond, vbond, fij, vtot; + rvec dx; + int i, m, ki, type, ai, aj; + ivec dt; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + + bm = forceparams[type].fene.bm; + kb = forceparams[type].fene.kb; + + ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */ + dr2 = iprod(dx, dx); /* 5 */ + + if (dr2 == 0.0) + { + continue; + } + + bm2 = bm*bm; + + if (dr2 >= bm2) + { + gmx_fatal(FARGS, + "r^2 (%f) >= bm^2 (%f) in FENE bond between atoms %d and %d", + dr2, bm2, + glatnr(global_atom_index, ai), + glatnr(global_atom_index, aj)); + } + + omdr2obm2 = one - dr2/bm2; + + vbond = -half*kb*bm2*log(omdr2obm2); + fbond = -kb/omdr2obm2; + + vtot += vbond; /* 35 */ + + if (g) + { + ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt); + ki = IVEC2IS(dt); + } + for (m = 0; (m < DIM); m++) /* 15 */ + { + fij = fbond*dx[m]; + f[ai][m] += fij; + f[aj][m] -= fij; + fshift[ki][m] += fij; + fshift[CENTRAL][m] -= fij; + } + } /* 58 TOTAL */ + return vtot; +} + +real harmonic(real kA, real kB, real xA, 
real xB, real x, real lambda, + real *V, real *F) +{ + const real half = 0.5; + real L1, kk, x0, dx, dx2; + real v, f, dvdlambda; + + L1 = 1.0-lambda; + kk = L1*kA+lambda*kB; + x0 = L1*xA+lambda*xB; + + dx = x-x0; + dx2 = dx*dx; + + f = -kk*dx; + v = half*kk*dx2; + dvdlambda = half*(kB-kA)*dx2 + (xA-xB)*kk*dx; + + *F = f; + *V = v; + + return dvdlambda; + + /* That was 19 flops */ +} + + +real bonds(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, m, ki, ai, aj, type; + real dr, dr2, fbond, vbond, fij, vtot; + rvec dx; + ivec dt; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + + ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */ + dr2 = iprod(dx, dx); /* 5 */ + dr = dr2*gmx_invsqrt(dr2); /* 10 */ + + *dvdlambda += harmonic(forceparams[type].harmonic.krA, + forceparams[type].harmonic.krB, + forceparams[type].harmonic.rA, + forceparams[type].harmonic.rB, + dr, lambda, &vbond, &fbond); /* 19 */ + + if (dr2 == 0.0) + { + continue; + } + + + vtot += vbond; /* 1*/ + fbond *= gmx_invsqrt(dr2); /* 6 */ +#ifdef DEBUG + if (debug) + { + fprintf(debug, "BONDS: dr = %10g vbond = %10g fbond = %10g\n", + dr, vbond, fbond); + } +#endif + if (g) + { + ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt); + ki = IVEC2IS(dt); + } + for (m = 0; (m < DIM); m++) /* 15 */ + { + fij = fbond*dx[m]; + f[ai][m] += fij; + f[aj][m] -= fij; + fshift[ki][m] += fij; + fshift[CENTRAL][m] -= fij; + } + } /* 59 TOTAL */ + return vtot; +} + +real restraint_bonds(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, m, ki, ai, aj, type; + real dr, dr2, fbond, vbond, fij, vtot; + real L1; + real low, dlow, up1, dup1, up2, dup2, k, dk; + real drh, drh2; + rvec dx; + ivec dt; + + L1 = 1.0 - lambda; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + + ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */ + dr2 = iprod(dx, dx); /* 5 */ + dr = dr2*gmx_invsqrt(dr2); /* 10 */ + + low = L1*forceparams[type].restraint.lowA + lambda*forceparams[type].restraint.lowB; + dlow = -forceparams[type].restraint.lowA + forceparams[type].restraint.lowB; + up1 = L1*forceparams[type].restraint.up1A + lambda*forceparams[type].restraint.up1B; + dup1 = -forceparams[type].restraint.up1A + forceparams[type].restraint.up1B; + up2 = L1*forceparams[type].restraint.up2A + lambda*forceparams[type].restraint.up2B; + dup2 = -forceparams[type].restraint.up2A + forceparams[type].restraint.up2B; + k = L1*forceparams[type].restraint.kA + lambda*forceparams[type].restraint.kB; + dk = -forceparams[type].restraint.kA + forceparams[type].restraint.kB; + /* 24 */ + + if (dr < low) + { + drh = dr - low; + drh2 = drh*drh; + vbond = 0.5*k*drh2; + fbond = -k*drh; + *dvdlambda += 0.5*dk*drh2 - k*dlow*drh; + } /* 11 */ + else if (dr <= up1) + { + vbond = 0; + fbond = 0; + } + else if (dr <= up2) + { + drh = dr - up1; + drh2 = drh*drh; + vbond = 0.5*k*drh2; + fbond = -k*drh; + *dvdlambda += 0.5*dk*drh2 - k*dup1*drh; + } /* 11 */ + else + { + drh = dr - 
up2; + vbond = k*(up2 - up1)*(0.5*(up2 - up1) + drh); + fbond = -k*(up2 - up1); + *dvdlambda += dk*(up2 - up1)*(0.5*(up2 - up1) + drh) + + k*(dup2 - dup1)*(up2 - up1 + drh) + - k*(up2 - up1)*dup2; + } + + if (dr2 == 0.0) + { + continue; + } + + vtot += vbond; /* 1*/ + fbond *= gmx_invsqrt(dr2); /* 6 */ +#ifdef DEBUG + if (debug) + { + fprintf(debug, "BONDS: dr = %10g vbond = %10g fbond = %10g\n", + dr, vbond, fbond); + } +#endif + if (g) + { + ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt); + ki = IVEC2IS(dt); + } + for (m = 0; (m < DIM); m++) /* 15 */ + { + fij = fbond*dx[m]; + f[ai][m] += fij; + f[aj][m] -= fij; + fshift[ki][m] += fij; + fshift[CENTRAL][m] -= fij; + } + } /* 59 TOTAL */ + + return vtot; +} + +real polarize(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, m, ki, ai, aj, type; + real dr, dr2, fbond, vbond, fij, vtot, ksh; + rvec dx; + ivec dt; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ksh = sqr(md->chargeA[aj])*ONE_4PI_EPS0/forceparams[type].polarize.alpha; + if (debug) + { + fprintf(debug, "POL: local ai = %d aj = %d ksh = %.3f\n", ai, aj, ksh); + } + + ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */ + dr2 = iprod(dx, dx); /* 5 */ + dr = dr2*gmx_invsqrt(dr2); /* 10 */ + + *dvdlambda += harmonic(ksh, ksh, 0, 0, dr, lambda, &vbond, &fbond); /* 19 */ + + if (dr2 == 0.0) + { + continue; + } + + vtot += vbond; /* 1*/ + fbond *= gmx_invsqrt(dr2); /* 6 */ + + if (g) + { + ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt); + ki = IVEC2IS(dt); + } + for (m = 0; (m < DIM); m++) /* 15 */ + { + fij = fbond*dx[m]; + f[ai][m] += fij; + f[aj][m] -= fij; + fshift[ki][m] += fij; + fshift[CENTRAL][m] -= fij; + } + } /* 59 TOTAL */ + return vtot; +} + +real anharm_polarize(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, m, ki, ai, aj, type; + real dr, dr2, fbond, vbond, fij, vtot, ksh, khyp, drcut, ddr, ddr3; + rvec dx; + ivec dt; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ksh = sqr(md->chargeA[aj])*ONE_4PI_EPS0/forceparams[type].anharm_polarize.alpha; /* 7*/ + khyp = forceparams[type].anharm_polarize.khyp; + drcut = forceparams[type].anharm_polarize.drcut; + if (debug) + { + fprintf(debug, "POL: local ai = %d aj = %d ksh = %.3f\n", ai, aj, ksh); + } + + ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */ + dr2 = iprod(dx, dx); /* 5 */ + dr = dr2*gmx_invsqrt(dr2); /* 10 */ + + *dvdlambda += harmonic(ksh, ksh, 0, 0, dr, lambda, &vbond, &fbond); /* 19 */ + + if (dr2 == 0.0) + { + continue; + } + + if (dr > drcut) + { + ddr = dr-drcut; + ddr3 = ddr*ddr*ddr; + vbond += khyp*ddr*ddr3; + fbond -= 4*khyp*ddr3; + } + fbond *= gmx_invsqrt(dr2); /* 6 */ + vtot += vbond; /* 1*/ + + if (g) + { + ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt); + ki = IVEC2IS(dt); + } + for (m = 0; (m < DIM); m++) /* 15 */ + { + fij = fbond*dx[m]; + f[ai][m] += fij; + f[aj][m] -= fij; + fshift[ki][m] += fij; + fshift[CENTRAL][m] -= fij; + } + } /* 72 TOTAL */ + return vtot; +} + +real 
water_pol(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec gmx_unused fshift[], + const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g, + real gmx_unused lambda, real gmx_unused *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + /* This routine implements anisotropic polarizibility for water, through + * a shell connected to a dummy with spring constant that differ in the + * three spatial dimensions in the molecular frame. + */ + int i, m, aO, aH1, aH2, aD, aS, type, type0; + rvec dOH1, dOH2, dHH, dOD, dDS, nW, kk, dx, kdx, proj; +#ifdef DEBUG + rvec df; +#endif + real vtot, fij, r_HH, r_OD, r_nW, tx, ty, tz, qS; + + vtot = 0.0; + if (nbonds > 0) + { + type0 = forceatoms[0]; + aS = forceatoms[5]; + qS = md->chargeA[aS]; + kk[XX] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_x; + kk[YY] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_y; + kk[ZZ] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_z; + r_HH = 1.0/forceparams[type0].wpol.rHH; + r_OD = 1.0/forceparams[type0].wpol.rOD; + if (debug) + { + fprintf(debug, "WPOL: qS = %10.5f aS = %5d\n", qS, aS); + fprintf(debug, "WPOL: kk = %10.3f %10.3f %10.3f\n", + kk[XX], kk[YY], kk[ZZ]); + fprintf(debug, "WPOL: rOH = %10.3f rHH = %10.3f rOD = %10.3f\n", + forceparams[type0].wpol.rOH, + forceparams[type0].wpol.rHH, + forceparams[type0].wpol.rOD); + } + for (i = 0; (i < nbonds); i += 6) + { + type = forceatoms[i]; + if (type != type0) + { + gmx_fatal(FARGS, "Sorry, type = %d, type0 = %d, file = %s, line = %d", + type, type0, __FILE__, __LINE__); + } + aO = forceatoms[i+1]; + aH1 = forceatoms[i+2]; + aH2 = forceatoms[i+3]; + aD = forceatoms[i+4]; + aS = forceatoms[i+5]; + + /* Compute vectors describing the water frame */ + rvec_sub(x[aH1], x[aO], dOH1); + rvec_sub(x[aH2], x[aO], dOH2); + rvec_sub(x[aH2], x[aH1], dHH); + rvec_sub(x[aD], x[aO], dOD); + rvec_sub(x[aS], x[aD], dDS); + cprod(dOH1, dOH2, nW); + + /* Compute inverse length of normal vector + * (this one could be precomputed, but I'm too lazy now) + */ + r_nW = gmx_invsqrt(iprod(nW, nW)); + /* This is for precision, but does not make a big difference, + * it can go later. 
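+ * (For reference, the energy accumulated further below is
+ * V = 0.5*(kk[XX]*dx[XX]^2 + kk[YY]*dx[YY]^2 + kk[ZZ]*dx[ZZ]^2) with
+ * kk[m] = qS^2/(4 pi eps0 alpha_m) and dx the dummy-shell displacement
+ * expressed in the molecular frame spanned by nW, dHH and dOD, i.e. the
+ * anisotropic spring described in the comment at the top of this routine.)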
+ */ + r_OD = gmx_invsqrt(iprod(dOD, dOD)); + + /* Normalize the vectors in the water frame */ + svmul(r_nW, nW, nW); + svmul(r_HH, dHH, dHH); + svmul(r_OD, dOD, dOD); + + /* Compute displacement of shell along components of the vector */ + dx[ZZ] = iprod(dDS, dOD); + /* Compute projection on the XY plane: dDS - dx[ZZ]*dOD */ + for (m = 0; (m < DIM); m++) + { + proj[m] = dDS[m]-dx[ZZ]*dOD[m]; + } + + /*dx[XX] = iprod(dDS,nW); + dx[YY] = iprod(dDS,dHH);*/ + dx[XX] = iprod(proj, nW); + for (m = 0; (m < DIM); m++) + { + proj[m] -= dx[XX]*nW[m]; + } + dx[YY] = iprod(proj, dHH); + /*#define DEBUG*/ +#ifdef DEBUG + if (debug) + { + fprintf(debug, "WPOL: dx2=%10g dy2=%10g dz2=%10g sum=%10g dDS^2=%10g\n", + sqr(dx[XX]), sqr(dx[YY]), sqr(dx[ZZ]), iprod(dx, dx), iprod(dDS, dDS)); + fprintf(debug, "WPOL: dHH=(%10g,%10g,%10g)\n", dHH[XX], dHH[YY], dHH[ZZ]); + fprintf(debug, "WPOL: dOD=(%10g,%10g,%10g), 1/r_OD = %10g\n", + dOD[XX], dOD[YY], dOD[ZZ], 1/r_OD); + fprintf(debug, "WPOL: nW =(%10g,%10g,%10g), 1/r_nW = %10g\n", + nW[XX], nW[YY], nW[ZZ], 1/r_nW); + fprintf(debug, "WPOL: dx =%10g, dy =%10g, dz =%10g\n", + dx[XX], dx[YY], dx[ZZ]); + fprintf(debug, "WPOL: dDSx=%10g, dDSy=%10g, dDSz=%10g\n", + dDS[XX], dDS[YY], dDS[ZZ]); + } +#endif + /* Now compute the forces and energy */ + kdx[XX] = kk[XX]*dx[XX]; + kdx[YY] = kk[YY]*dx[YY]; + kdx[ZZ] = kk[ZZ]*dx[ZZ]; + vtot += iprod(dx, kdx); + for (m = 0; (m < DIM); m++) + { + /* This is a tensor operation but written out for speed */ + tx = nW[m]*kdx[XX]; + ty = dHH[m]*kdx[YY]; + tz = dOD[m]*kdx[ZZ]; + fij = -tx-ty-tz; +#ifdef DEBUG + df[m] = fij; +#endif + f[aS][m] += fij; + f[aD][m] -= fij; + } +#ifdef DEBUG + if (debug) + { + fprintf(debug, "WPOL: vwpol=%g\n", 0.5*iprod(dx, kdx)); + fprintf(debug, "WPOL: df = (%10g, %10g, %10g)\n", df[XX], df[YY], df[ZZ]); + } +#endif + } + } + return 0.5*vtot; +} + +static real do_1_thole(const rvec xi, const rvec xj, rvec fi, rvec fj, + const t_pbc *pbc, real qq, + rvec fshift[], real afac) +{ + rvec r12; + real r12sq, r12_1, r12n, r12bar, v0, v1, fscal, ebar, fff; + int m, t; + + t = pbc_rvec_sub(pbc, xi, xj, r12); /* 3 */ + + r12sq = iprod(r12, r12); /* 5 */ + r12_1 = gmx_invsqrt(r12sq); /* 5 */ + r12bar = afac/r12_1; /* 5 */ + v0 = qq*ONE_4PI_EPS0*r12_1; /* 2 */ + ebar = exp(-r12bar); /* 5 */ + v1 = (1-(1+0.5*r12bar)*ebar); /* 4 */ + fscal = ((v0*r12_1)*v1 - v0*0.5*afac*ebar*(r12bar+1))*r12_1; /* 9 */ + if (debug) + { + fprintf(debug, "THOLE: v0 = %.3f v1 = %.3f r12= % .3f r12bar = %.3f fscal = %.3f ebar = %.3f\n", v0, v1, 1/r12_1, r12bar, fscal, ebar); + } + + for (m = 0; (m < DIM); m++) + { + fff = fscal*r12[m]; + fi[m] += fff; + fj[m] -= fff; + fshift[t][m] += fff; + fshift[CENTRAL][m] -= fff; + } /* 15 */ + + return v0*v1; /* 1 */ + /* 54 */ +} + +real thole_pol(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph gmx_unused *g, + real gmx_unused lambda, real gmx_unused *dvdlambda, + const t_mdatoms *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + /* Interaction between two pairs of particles with opposite charge */ + int i, type, a1, da1, a2, da2; + real q1, q2, qq, a, al1, al2, afac; + real V = 0; + + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + a1 = forceatoms[i++]; + da1 = forceatoms[i++]; + a2 = forceatoms[i++]; + da2 = forceatoms[i++]; + q1 = md->chargeA[da1]; + q2 = md->chargeA[da2]; + a = forceparams[type].thole.a; + al1 = forceparams[type].thole.alpha1; + al2 = 
forceparams[type].thole.alpha2; + qq = q1*q2; + afac = a*pow(al1*al2, -1.0/6.0); + V += do_1_thole(x[a1], x[a2], f[a1], f[a2], pbc, qq, fshift, afac); + V += do_1_thole(x[da1], x[a2], f[da1], f[a2], pbc, -qq, fshift, afac); + V += do_1_thole(x[a1], x[da2], f[a1], f[da2], pbc, -qq, fshift, afac); + V += do_1_thole(x[da1], x[da2], f[da1], f[da2], pbc, qq, fshift, afac); + } + /* 290 flops */ + return V; +} + +real bond_angle(const rvec xi, const rvec xj, const rvec xk, const t_pbc *pbc, + rvec r_ij, rvec r_kj, real *costh, + int *t1, int *t2) +/* Return value is the angle between the bonds i-j and j-k */ +{ + /* 41 FLOPS */ + real th; + + *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /* 3 */ + *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /* 3 */ + + *costh = cos_angle(r_ij, r_kj); /* 25 */ + th = acos(*costh); /* 10 */ + /* 41 TOTAL */ + return th; +} + +real angles(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, ai, aj, ak, t1, t2, type; + rvec r_ij, r_kj; + real cos_theta, cos_theta2, theta, dVdt, va, vtot; + ivec jt, dt_ij, dt_kj; + + vtot = 0.0; + for (i = 0; i < nbonds; ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + + theta = bond_angle(x[ai], x[aj], x[ak], pbc, + r_ij, r_kj, &cos_theta, &t1, &t2); /* 41 */ + + *dvdlambda += harmonic(forceparams[type].harmonic.krA, + forceparams[type].harmonic.krB, + forceparams[type].harmonic.rA*DEG2RAD, + forceparams[type].harmonic.rB*DEG2RAD, + theta, lambda, &va, &dVdt); /* 21 */ + vtot += va; + + cos_theta2 = sqr(cos_theta); + if (cos_theta2 < 1) + { + int m; + real st, sth; + real cik, cii, ckk; + real nrkj2, nrij2; + real nrkj_1, nrij_1; + rvec f_i, f_j, f_k; + + st = dVdt*gmx_invsqrt(1 - cos_theta2); /* 12 */ + sth = st*cos_theta; /* 1 */ +#ifdef DEBUG + if (debug) + { + fprintf(debug, "ANGLES: theta = %10g vth = %10g dV/dtheta = %10g\n", + theta*RAD2DEG, va, dVdt); + } +#endif + nrij2 = iprod(r_ij, r_ij); /* 5 */ + nrkj2 = iprod(r_kj, r_kj); /* 5 */ + + nrij_1 = gmx_invsqrt(nrij2); /* 10 */ + nrkj_1 = gmx_invsqrt(nrkj2); /* 10 */ + + cik = st*nrij_1*nrkj_1; /* 2 */ + cii = sth*nrij_1*nrij_1; /* 2 */ + ckk = sth*nrkj_1*nrkj_1; /* 2 */ + + for (m = 0; m < DIM; m++) + { /* 39 */ + f_i[m] = -(cik*r_kj[m] - cii*r_ij[m]); + f_k[m] = -(cik*r_ij[m] - ckk*r_kj[m]); + f_j[m] = -f_i[m] - f_k[m]; + f[ai][m] += f_i[m]; + f[aj][m] += f_j[m]; + f[ak][m] += f_k[m]; + } + if (g != NULL) + { + copy_ivec(SHIFT_IVEC(g, aj), jt); + + ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij); + ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj); + t1 = IVEC2IS(dt_ij); + t2 = IVEC2IS(dt_kj); + } + rvec_inc(fshift[t1], f_i); + rvec_inc(fshift[CENTRAL], f_j); + rvec_inc(fshift[t2], f_k); + } /* 161 TOTAL */ + } + + return vtot; +} + +#ifdef SIMD_BONDEDS + +/* As angles, but using SIMD to calculate many dihedrals at once. + * This routines does not calculate energies and shift forces. 
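+ * For reference, it packs GMX_SIMD_WIDTH_HERE angles per iteration and
+ * applies the same force expression as the scalar angles() routine above:
+ * with st = k*(theta0 - theta)/sin(theta) and sth = st*cos(theta),
+ * f_i = cii*r_ij - cik*r_kj, f_k = ckk*r_kj - cik*r_ij and
+ * f_j = -(f_i + f_k), where cik = st/(|r_ij| |r_kj|), cii = sth/|r_ij|^2
+ * and ckk = sth/|r_kj|^2.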
+ */ +static gmx_inline void +angles_noener_simd(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], + const t_pbc *pbc, const t_graph gmx_unused *g, + real gmx_unused lambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ +#define UNROLL GMX_SIMD_WIDTH_HERE + const int nfa1 = 4; + int i, iu, s, m; + int type, ai[UNROLL], aj[UNROLL], ak[UNROLL]; + real coeff_array[2*UNROLL+UNROLL], *coeff; + real dr_array[2*DIM*UNROLL+UNROLL], *dr; + real f_buf_array[6*UNROLL+UNROLL], *f_buf; + gmx_mm_pr k_S, theta0_S; + gmx_mm_pr rijx_S, rijy_S, rijz_S; + gmx_mm_pr rkjx_S, rkjy_S, rkjz_S; + gmx_mm_pr one_S; + gmx_mm_pr rij_rkj_S; + gmx_mm_pr nrij2_S, nrij_1_S; + gmx_mm_pr nrkj2_S, nrkj_1_S; + gmx_mm_pr cos_S, sin_S; + gmx_mm_pr theta_S; + gmx_mm_pr st_S, sth_S; + gmx_mm_pr cik_S, cii_S, ckk_S; + gmx_mm_pr f_ix_S, f_iy_S, f_iz_S; + gmx_mm_pr f_kx_S, f_ky_S, f_kz_S; + pbc_simd_t pbc_simd; + + /* Ensure register memory alignment */ + coeff = gmx_simd_align_real(coeff_array); + dr = gmx_simd_align_real(dr_array); + f_buf = gmx_simd_align_real(f_buf_array); + + set_pbc_simd(pbc, &pbc_simd); + + one_S = gmx_set1_pr(1.0); + + /* nbonds is the number of angles times nfa1, here we step UNROLL angles */ + for (i = 0; (i < nbonds); i += UNROLL*nfa1) + { + /* Collect atoms for UNROLL angles. + * iu indexes into forceatoms, we should not let iu go beyond nbonds. + */ + iu = i; + for (s = 0; s < UNROLL; s++) + { + type = forceatoms[iu]; + ai[s] = forceatoms[iu+1]; + aj[s] = forceatoms[iu+2]; + ak[s] = forceatoms[iu+3]; + + coeff[s] = forceparams[type].harmonic.krA; + coeff[UNROLL+s] = forceparams[type].harmonic.rA*DEG2RAD; + + /* If you can't use pbc_dx_simd below for PBC, e.g. because + * you can't round in SIMD, use pbc_rvec_sub here. 
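+ * For reference, the packed layout written here uses one block of UNROLL
+ * reals per component: blocks 0..2 of dr[] hold the x, y and z components
+ * of r_ij for all lanes and blocks 3..5 hold the same for r_kj, so each
+ * gmx_load_pr(dr + n*UNROLL) below reads one contiguous, aligned block of
+ * per-lane values.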
+ */ + /* Store the non PBC corrected distances packed and aligned */ + for (m = 0; m < DIM; m++) + { + dr[s + m *UNROLL] = x[ai[s]][m] - x[aj[s]][m]; + dr[s + (DIM+m)*UNROLL] = x[ak[s]][m] - x[aj[s]][m]; + } + + /* At the end fill the arrays with identical entries */ + if (iu + nfa1 < nbonds) + { + iu += nfa1; + } + } + + k_S = gmx_load_pr(coeff); + theta0_S = gmx_load_pr(coeff+UNROLL); + + rijx_S = gmx_load_pr(dr + 0*UNROLL); + rijy_S = gmx_load_pr(dr + 1*UNROLL); + rijz_S = gmx_load_pr(dr + 2*UNROLL); + rkjx_S = gmx_load_pr(dr + 3*UNROLL); + rkjy_S = gmx_load_pr(dr + 4*UNROLL); + rkjz_S = gmx_load_pr(dr + 5*UNROLL); + + pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, &pbc_simd); + pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, &pbc_simd); + + rij_rkj_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S, + rkjx_S, rkjy_S, rkjz_S); + + nrij2_S = gmx_norm2_pr(rijx_S, rijy_S, rijz_S); + nrkj2_S = gmx_norm2_pr(rkjx_S, rkjy_S, rkjz_S); + + nrij_1_S = gmx_invsqrt_pr(nrij2_S); + nrkj_1_S = gmx_invsqrt_pr(nrkj2_S); + + cos_S = gmx_mul_pr(rij_rkj_S, gmx_mul_pr(nrij_1_S, nrkj_1_S)); + + theta_S = gmx_acos_pr(cos_S); + + sin_S = gmx_invsqrt_pr(gmx_max_pr(gmx_sub_pr(one_S, gmx_mul_pr(cos_S, cos_S)), + gmx_setzero_pr())); + st_S = gmx_mul_pr(gmx_mul_pr(k_S, gmx_sub_pr(theta0_S, theta_S)), + sin_S); + sth_S = gmx_mul_pr(st_S, cos_S); + + cik_S = gmx_mul_pr(st_S, gmx_mul_pr(nrij_1_S, nrkj_1_S)); + cii_S = gmx_mul_pr(sth_S, gmx_mul_pr(nrij_1_S, nrij_1_S)); + ckk_S = gmx_mul_pr(sth_S, gmx_mul_pr(nrkj_1_S, nrkj_1_S)); + + f_ix_S = gmx_mul_pr(cii_S, rijx_S); + f_ix_S = gmx_nmsub_pr(cik_S, rkjx_S, f_ix_S); + f_iy_S = gmx_mul_pr(cii_S, rijy_S); + f_iy_S = gmx_nmsub_pr(cik_S, rkjy_S, f_iy_S); + f_iz_S = gmx_mul_pr(cii_S, rijz_S); + f_iz_S = gmx_nmsub_pr(cik_S, rkjz_S, f_iz_S); + f_kx_S = gmx_mul_pr(ckk_S, rkjx_S); + f_kx_S = gmx_nmsub_pr(cik_S, rijx_S, f_kx_S); + f_ky_S = gmx_mul_pr(ckk_S, rkjy_S); + f_ky_S = gmx_nmsub_pr(cik_S, rijy_S, f_ky_S); + f_kz_S = gmx_mul_pr(ckk_S, rkjz_S); + f_kz_S = gmx_nmsub_pr(cik_S, rijz_S, f_kz_S); + + gmx_store_pr(f_buf + 0*UNROLL, f_ix_S); + gmx_store_pr(f_buf + 1*UNROLL, f_iy_S); + gmx_store_pr(f_buf + 2*UNROLL, f_iz_S); + gmx_store_pr(f_buf + 3*UNROLL, f_kx_S); + gmx_store_pr(f_buf + 4*UNROLL, f_ky_S); + gmx_store_pr(f_buf + 5*UNROLL, f_kz_S); + + iu = i; + s = 0; + do + { + for (m = 0; m < DIM; m++) + { + f[ai[s]][m] += f_buf[s + m*UNROLL]; + f[aj[s]][m] -= f_buf[s + m*UNROLL] + f_buf[s + (DIM+m)*UNROLL]; + f[ak[s]][m] += f_buf[s + (DIM+m)*UNROLL]; + } + s++; + iu += nfa1; + } + while (s < UNROLL && iu < nbonds); + } +#undef UNROLL +} + +#endif /* SIMD_BONDEDS */ + +real linear_angles(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, m, ai, aj, ak, t1, t2, type; + rvec f_i, f_j, f_k; + real L1, kA, kB, aA, aB, dr, dr2, va, vtot, a, b, klin; + ivec jt, dt_ij, dt_kj; + rvec r_ij, r_kj, r_ik, dx; + + L1 = 1-lambda; + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + + kA = forceparams[type].linangle.klinA; + kB = forceparams[type].linangle.klinB; + klin = L1*kA + lambda*kB; + + aA = forceparams[type].linangle.aA; + aB = forceparams[type].linangle.aB; + a = L1*aA+lambda*aB; + b = 1-a; + + t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij); + t2 = pbc_rvec_sub(pbc, x[ak], x[aj], 
r_kj); + rvec_sub(r_ij, r_kj, r_ik); + + dr2 = 0; + for (m = 0; (m < DIM); m++) + { + dr = -a * r_ij[m] - b * r_kj[m]; + dr2 += dr*dr; + dx[m] = dr; + f_i[m] = a*klin*dr; + f_k[m] = b*klin*dr; + f_j[m] = -(f_i[m]+f_k[m]); + f[ai][m] += f_i[m]; + f[aj][m] += f_j[m]; + f[ak][m] += f_k[m]; + } + va = 0.5*klin*dr2; + *dvdlambda += 0.5*(kB-kA)*dr2 + klin*(aB-aA)*iprod(dx, r_ik); + + vtot += va; + + if (g) + { + copy_ivec(SHIFT_IVEC(g, aj), jt); + + ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij); + ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj); + t1 = IVEC2IS(dt_ij); + t2 = IVEC2IS(dt_kj); + } + rvec_inc(fshift[t1], f_i); + rvec_inc(fshift[CENTRAL], f_j); + rvec_inc(fshift[t2], f_k); + } /* 57 TOTAL */ + return vtot; +} + +real urey_bradley(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, m, ai, aj, ak, t1, t2, type, ki; + rvec r_ij, r_kj, r_ik; + real cos_theta, cos_theta2, theta; + real dVdt, va, vtot, dr, dr2, vbond, fbond, fik; + real kthA, th0A, kUBA, r13A, kthB, th0B, kUBB, r13B; + ivec jt, dt_ij, dt_kj, dt_ik; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + th0A = forceparams[type].u_b.thetaA*DEG2RAD; + kthA = forceparams[type].u_b.kthetaA; + r13A = forceparams[type].u_b.r13A; + kUBA = forceparams[type].u_b.kUBA; + th0B = forceparams[type].u_b.thetaB*DEG2RAD; + kthB = forceparams[type].u_b.kthetaB; + r13B = forceparams[type].u_b.r13B; + kUBB = forceparams[type].u_b.kUBB; + + theta = bond_angle(x[ai], x[aj], x[ak], pbc, + r_ij, r_kj, &cos_theta, &t1, &t2); /* 41 */ + + *dvdlambda += harmonic(kthA, kthB, th0A, th0B, theta, lambda, &va, &dVdt); /* 21 */ + vtot += va; + + ki = pbc_rvec_sub(pbc, x[ai], x[ak], r_ik); /* 3 */ + dr2 = iprod(r_ik, r_ik); /* 5 */ + dr = dr2*gmx_invsqrt(dr2); /* 10 */ + + *dvdlambda += harmonic(kUBA, kUBB, r13A, r13B, dr, lambda, &vbond, &fbond); /* 19 */ + + cos_theta2 = sqr(cos_theta); /* 1 */ + if (cos_theta2 < 1) + { + real st, sth; + real cik, cii, ckk; + real nrkj2, nrij2; + rvec f_i, f_j, f_k; + + st = dVdt*gmx_invsqrt(1 - cos_theta2); /* 12 */ + sth = st*cos_theta; /* 1 */ +#ifdef DEBUG + if (debug) + { + fprintf(debug, "ANGLES: theta = %10g vth = %10g dV/dtheta = %10g\n", + theta*RAD2DEG, va, dVdt); + } +#endif + nrkj2 = iprod(r_kj, r_kj); /* 5 */ + nrij2 = iprod(r_ij, r_ij); + + cik = st*gmx_invsqrt(nrkj2*nrij2); /* 12 */ + cii = sth/nrij2; /* 10 */ + ckk = sth/nrkj2; /* 10 */ + + for (m = 0; (m < DIM); m++) /* 39 */ + { + f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]); + f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]); + f_j[m] = -f_i[m]-f_k[m]; + f[ai][m] += f_i[m]; + f[aj][m] += f_j[m]; + f[ak][m] += f_k[m]; + } + if (g) + { + copy_ivec(SHIFT_IVEC(g, aj), jt); + + ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij); + ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj); + t1 = IVEC2IS(dt_ij); + t2 = IVEC2IS(dt_kj); + } + rvec_inc(fshift[t1], f_i); + rvec_inc(fshift[CENTRAL], f_j); + rvec_inc(fshift[t2], f_k); + } /* 161 TOTAL */ + /* Time for the bond calculations */ + if (dr2 == 0.0) + { + continue; + } + + vtot += vbond; /* 1*/ + fbond *= gmx_invsqrt(dr2); /* 6 */ + + if (g) + { + ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), dt_ik); + ki = IVEC2IS(dt_ik); + } + for (m = 0; (m < DIM); m++) /* 15 */ + { + fik = fbond*r_ik[m]; + f[ai][m] += fik; + f[ak][m] -= fik; + 
fshift[ki][m] += fik; + fshift[CENTRAL][m] -= fik; + } + } + return vtot; +} + +real quartic_angles(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real gmx_unused lambda, real gmx_unused *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, j, ai, aj, ak, t1, t2, type; + rvec r_ij, r_kj; + real cos_theta, cos_theta2, theta, dt, dVdt, va, dtp, c, vtot; + ivec jt, dt_ij, dt_kj; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + + theta = bond_angle(x[ai], x[aj], x[ak], pbc, + r_ij, r_kj, &cos_theta, &t1, &t2); /* 41 */ + + dt = theta - forceparams[type].qangle.theta*DEG2RAD; /* 2 */ + + dVdt = 0; + va = forceparams[type].qangle.c[0]; + dtp = 1.0; + for (j = 1; j <= 4; j++) + { + c = forceparams[type].qangle.c[j]; + dVdt -= j*c*dtp; + dtp *= dt; + va += c*dtp; + } + /* 20 */ + + vtot += va; + + cos_theta2 = sqr(cos_theta); /* 1 */ + if (cos_theta2 < 1) + { + int m; + real st, sth; + real cik, cii, ckk; + real nrkj2, nrij2; + rvec f_i, f_j, f_k; + + st = dVdt*gmx_invsqrt(1 - cos_theta2); /* 12 */ + sth = st*cos_theta; /* 1 */ +#ifdef DEBUG + if (debug) + { + fprintf(debug, "ANGLES: theta = %10g vth = %10g dV/dtheta = %10g\n", + theta*RAD2DEG, va, dVdt); + } +#endif + nrkj2 = iprod(r_kj, r_kj); /* 5 */ + nrij2 = iprod(r_ij, r_ij); + + cik = st*gmx_invsqrt(nrkj2*nrij2); /* 12 */ + cii = sth/nrij2; /* 10 */ + ckk = sth/nrkj2; /* 10 */ + + for (m = 0; (m < DIM); m++) /* 39 */ + { + f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]); + f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]); + f_j[m] = -f_i[m]-f_k[m]; + f[ai][m] += f_i[m]; + f[aj][m] += f_j[m]; + f[ak][m] += f_k[m]; + } + if (g) + { + copy_ivec(SHIFT_IVEC(g, aj), jt); + + ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij); + ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj); + t1 = IVEC2IS(dt_ij); + t2 = IVEC2IS(dt_kj); + } + rvec_inc(fshift[t1], f_i); + rvec_inc(fshift[CENTRAL], f_j); + rvec_inc(fshift[t2], f_k); + } /* 153 TOTAL */ + } + return vtot; +} + +real dih_angle(const rvec xi, const rvec xj, const rvec xk, const rvec xl, + const t_pbc *pbc, + rvec r_ij, rvec r_kj, rvec r_kl, rvec m, rvec n, + real *sign, int *t1, int *t2, int *t3) +{ + real ipr, phi; + + *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /* 3 */ + *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /* 3 */ + *t3 = pbc_rvec_sub(pbc, xk, xl, r_kl); /* 3 */ + + cprod(r_ij, r_kj, m); /* 9 */ + cprod(r_kj, r_kl, n); /* 9 */ + phi = gmx_angle(m, n); /* 49 (assuming 25 for atan2) */ + ipr = iprod(r_ij, n); /* 5 */ + (*sign) = (ipr < 0.0) ? -1.0 : 1.0; + phi = (*sign)*phi; /* 1 */ + /* 82 TOTAL */ + return phi; +} + + +#ifdef SIMD_BONDEDS + +/* As dih_angle above, but calculates 4 dihedral angles at once using SIMD, + * also calculates the pre-factor required for the dihedral force update. + * Note that bv and buf should be register aligned. 
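+ * For reference, with m = r_ij x r_kj and n = r_kj x r_kl it obtains the
+ * angle as phi = atan2(|m x n|, m.n) and afterwards copies the sign of
+ * r_ij.n onto phi, equivalent to the gmx_angle()/sign logic of the scalar
+ * dih_angle() above. It also returns p = (r_ij.r_kj)/|r_kj|^2 and
+ * q = (r_kl.r_kj)/|r_kj|^2, the projections that the precalculated force
+ * update do_dih_fup_noshiftf_precalc() below takes as arguments.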
+ */ +static gmx_inline void +dih_angle_simd(const rvec *x, + const int *ai, const int *aj, const int *ak, const int *al, + const pbc_simd_t *pbc, + real *dr, + gmx_mm_pr *phi_S, + gmx_mm_pr *mx_S, gmx_mm_pr *my_S, gmx_mm_pr *mz_S, + gmx_mm_pr *nx_S, gmx_mm_pr *ny_S, gmx_mm_pr *nz_S, + gmx_mm_pr *nrkj_m2_S, + gmx_mm_pr *nrkj_n2_S, + real *p, + real *q) +{ +#define UNROLL GMX_SIMD_WIDTH_HERE + int s, m; + gmx_mm_pr rijx_S, rijy_S, rijz_S; + gmx_mm_pr rkjx_S, rkjy_S, rkjz_S; + gmx_mm_pr rklx_S, rkly_S, rklz_S; + gmx_mm_pr cx_S, cy_S, cz_S; + gmx_mm_pr cn_S; + gmx_mm_pr s_S; + gmx_mm_pr ipr_S; + gmx_mm_pr iprm_S, iprn_S; + gmx_mm_pr nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S; + gmx_mm_pr p_S, q_S; + gmx_mm_pr fmin_S = gmx_set1_pr(GMX_FLOAT_MIN); + + for (s = 0; s < UNROLL; s++) + { + /* If you can't use pbc_dx_simd below for PBC, e.g. because + * you can't round in SIMD, use pbc_rvec_sub here. + */ + for (m = 0; m < DIM; m++) + { + dr[s + (0*DIM + m)*UNROLL] = x[ai[s]][m] - x[aj[s]][m]; + dr[s + (1*DIM + m)*UNROLL] = x[ak[s]][m] - x[aj[s]][m]; + dr[s + (2*DIM + m)*UNROLL] = x[ak[s]][m] - x[al[s]][m]; + } + } + + rijx_S = gmx_load_pr(dr + 0*UNROLL); + rijy_S = gmx_load_pr(dr + 1*UNROLL); + rijz_S = gmx_load_pr(dr + 2*UNROLL); + rkjx_S = gmx_load_pr(dr + 3*UNROLL); + rkjy_S = gmx_load_pr(dr + 4*UNROLL); + rkjz_S = gmx_load_pr(dr + 5*UNROLL); + rklx_S = gmx_load_pr(dr + 6*UNROLL); + rkly_S = gmx_load_pr(dr + 7*UNROLL); + rklz_S = gmx_load_pr(dr + 8*UNROLL); + + pbc_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc); + pbc_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc); + pbc_dx_simd(&rklx_S, &rkly_S, &rklz_S, pbc); + + gmx_cprod_pr(rijx_S, rijy_S, rijz_S, + rkjx_S, rkjy_S, rkjz_S, + mx_S, my_S, mz_S); + + gmx_cprod_pr(rkjx_S, rkjy_S, rkjz_S, + rklx_S, rkly_S, rklz_S, + nx_S, ny_S, nz_S); + + gmx_cprod_pr(*mx_S, *my_S, *mz_S, + *nx_S, *ny_S, *nz_S, + &cx_S, &cy_S, &cz_S); + + cn_S = gmx_sqrt_pr(gmx_norm2_pr(cx_S, cy_S, cz_S)); + + s_S = gmx_iprod_pr(*mx_S, *my_S, *mz_S, *nx_S, *ny_S, *nz_S); + + /* Determine the dihedral angle, the sign might need correction */ + *phi_S = gmx_atan2_pr(cn_S, s_S); + + ipr_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S, + *nx_S, *ny_S, *nz_S); + + iprm_S = gmx_norm2_pr(*mx_S, *my_S, *mz_S); + iprn_S = gmx_norm2_pr(*nx_S, *ny_S, *nz_S); + + nrkj2_S = gmx_norm2_pr(rkjx_S, rkjy_S, rkjz_S); + + /* Avoid division by zero. When zero, the result is multiplied by 0 + * anyhow, so the 3 max below do not affect the final result. 
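+ * (The three clamps are |r_kj|^2, |m|^2 and |n|^2, each raised to at least
+ * GMX_FLOAT_MIN before a reciprocal or inverse square root is taken, so a
+ * degenerate dihedral with collinear atoms produces finite intermediates
+ * instead of Inf/NaN.)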
+ */ + nrkj2_S = gmx_max_pr(nrkj2_S, fmin_S); + nrkj_1_S = gmx_invsqrt_pr(nrkj2_S); + nrkj_2_S = gmx_mul_pr(nrkj_1_S, nrkj_1_S); + nrkj_S = gmx_mul_pr(nrkj2_S, nrkj_1_S); + + iprm_S = gmx_max_pr(iprm_S, fmin_S); + iprn_S = gmx_max_pr(iprn_S, fmin_S); + *nrkj_m2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprm_S)); + *nrkj_n2_S = gmx_mul_pr(nrkj_S, gmx_inv_pr(iprn_S)); + + /* Set sign of phi_S with the sign of ipr_S; phi_S is currently positive */ + *phi_S = gmx_cpsgn_nonneg_pr(ipr_S, *phi_S); + + p_S = gmx_iprod_pr(rijx_S, rijy_S, rijz_S, + rkjx_S, rkjy_S, rkjz_S); + p_S = gmx_mul_pr(p_S, nrkj_2_S); + + q_S = gmx_iprod_pr(rklx_S, rkly_S, rklz_S, + rkjx_S, rkjy_S, rkjz_S); + q_S = gmx_mul_pr(q_S, nrkj_2_S); + + gmx_store_pr(p, p_S); + gmx_store_pr(q, q_S); +#undef UNROLL +} + +#endif /* SIMD_BONDEDS */ + + +void do_dih_fup(int i, int j, int k, int l, real ddphi, + rvec r_ij, rvec r_kj, rvec r_kl, + rvec m, rvec n, rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + const rvec x[], int t1, int t2, int t3) +{ + /* 143 FLOPS */ + rvec f_i, f_j, f_k, f_l; + rvec uvec, vvec, svec, dx_jl; + real iprm, iprn, nrkj, nrkj2, nrkj_1, nrkj_2; + real a, b, p, q, toler; + ivec jt, dt_ij, dt_kj, dt_lj; + + iprm = iprod(m, m); /* 5 */ + iprn = iprod(n, n); /* 5 */ + nrkj2 = iprod(r_kj, r_kj); /* 5 */ + toler = nrkj2*GMX_REAL_EPS; + if ((iprm > toler) && (iprn > toler)) + { + nrkj_1 = gmx_invsqrt(nrkj2); /* 10 */ + nrkj_2 = nrkj_1*nrkj_1; /* 1 */ + nrkj = nrkj2*nrkj_1; /* 1 */ + a = -ddphi*nrkj/iprm; /* 11 */ + svmul(a, m, f_i); /* 3 */ + b = ddphi*nrkj/iprn; /* 11 */ + svmul(b, n, f_l); /* 3 */ + p = iprod(r_ij, r_kj); /* 5 */ + p *= nrkj_2; /* 1 */ + q = iprod(r_kl, r_kj); /* 5 */ + q *= nrkj_2; /* 1 */ + svmul(p, f_i, uvec); /* 3 */ + svmul(q, f_l, vvec); /* 3 */ + rvec_sub(uvec, vvec, svec); /* 3 */ + rvec_sub(f_i, svec, f_j); /* 3 */ + rvec_add(f_l, svec, f_k); /* 3 */ + rvec_inc(f[i], f_i); /* 3 */ + rvec_dec(f[j], f_j); /* 3 */ + rvec_dec(f[k], f_k); /* 3 */ + rvec_inc(f[l], f_l); /* 3 */ + + if (g) + { + copy_ivec(SHIFT_IVEC(g, j), jt); + ivec_sub(SHIFT_IVEC(g, i), jt, dt_ij); + ivec_sub(SHIFT_IVEC(g, k), jt, dt_kj); + ivec_sub(SHIFT_IVEC(g, l), jt, dt_lj); + t1 = IVEC2IS(dt_ij); + t2 = IVEC2IS(dt_kj); + t3 = IVEC2IS(dt_lj); + } + else if (pbc) + { + t3 = pbc_rvec_sub(pbc, x[l], x[j], dx_jl); + } + else + { + t3 = CENTRAL; + } + + rvec_inc(fshift[t1], f_i); + rvec_dec(fshift[CENTRAL], f_j); + rvec_dec(fshift[t2], f_k); + rvec_inc(fshift[t3], f_l); + } + /* 112 TOTAL */ +} + +/* As do_dih_fup above, but without shift forces */ +static void +do_dih_fup_noshiftf(int i, int j, int k, int l, real ddphi, + rvec r_ij, rvec r_kj, rvec r_kl, + rvec m, rvec n, rvec f[]) +{ + rvec f_i, f_j, f_k, f_l; + rvec uvec, vvec, svec, dx_jl; + real iprm, iprn, nrkj, nrkj2, nrkj_1, nrkj_2; + real a, b, p, q, toler; + ivec jt, dt_ij, dt_kj, dt_lj; + + iprm = iprod(m, m); /* 5 */ + iprn = iprod(n, n); /* 5 */ + nrkj2 = iprod(r_kj, r_kj); /* 5 */ + toler = nrkj2*GMX_REAL_EPS; + if ((iprm > toler) && (iprn > toler)) + { + nrkj_1 = gmx_invsqrt(nrkj2); /* 10 */ + nrkj_2 = nrkj_1*nrkj_1; /* 1 */ + nrkj = nrkj2*nrkj_1; /* 1 */ + a = -ddphi*nrkj/iprm; /* 11 */ + svmul(a, m, f_i); /* 3 */ + b = ddphi*nrkj/iprn; /* 11 */ + svmul(b, n, f_l); /* 3 */ + p = iprod(r_ij, r_kj); /* 5 */ + p *= nrkj_2; /* 1 */ + q = iprod(r_kl, r_kj); /* 5 */ + q *= nrkj_2; /* 1 */ + svmul(p, f_i, uvec); /* 3 */ + svmul(q, f_l, vvec); /* 3 */ + rvec_sub(uvec, vvec, svec); /* 3 */ + rvec_sub(f_i, svec, f_j); /* 3 */ + rvec_add(f_l, svec, f_k); /* 3 */ + 
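+ /* For reference: svec = p*f_i - q*f_l, so the updates below apply +f_i to
+  * atom i, (svec - f_i) to j, -(f_l + svec) to k and +f_l to l, which sum
+  * to zero; p and q are the projections of r_ij and r_kl onto r_kj,
+  * redistributing part of the outer-atom forces onto the inner atoms j
+  * and k. */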
rvec_inc(f[i], f_i); /* 3 */ + rvec_dec(f[j], f_j); /* 3 */ + rvec_dec(f[k], f_k); /* 3 */ + rvec_inc(f[l], f_l); /* 3 */ + } +} + +/* As do_dih_fup_noshiftf above, but with pre-calculated pre-factors */ +static gmx_inline void +do_dih_fup_noshiftf_precalc(int i, int j, int k, int l, + real p, real q, + real f_i_x, real f_i_y, real f_i_z, + real mf_l_x, real mf_l_y, real mf_l_z, + rvec f[]) +{ + rvec f_i, f_j, f_k, f_l; + rvec uvec, vvec, svec; + + f_i[XX] = f_i_x; + f_i[YY] = f_i_y; + f_i[ZZ] = f_i_z; + f_l[XX] = -mf_l_x; + f_l[YY] = -mf_l_y; + f_l[ZZ] = -mf_l_z; + svmul(p, f_i, uvec); + svmul(q, f_l, vvec); + rvec_sub(uvec, vvec, svec); + rvec_sub(f_i, svec, f_j); + rvec_add(f_l, svec, f_k); + rvec_inc(f[i], f_i); + rvec_dec(f[j], f_j); + rvec_dec(f[k], f_k); + rvec_inc(f[l], f_l); +} + + +real dopdihs(real cpA, real cpB, real phiA, real phiB, int mult, + real phi, real lambda, real *V, real *F) +{ + real v, dvdlambda, mdphi, v1, sdphi, ddphi; + real L1 = 1.0 - lambda; + real ph0 = (L1*phiA + lambda*phiB)*DEG2RAD; + real dph0 = (phiB - phiA)*DEG2RAD; + real cp = L1*cpA + lambda*cpB; + + mdphi = mult*phi - ph0; + sdphi = sin(mdphi); + ddphi = -cp*mult*sdphi; + v1 = 1.0 + cos(mdphi); + v = cp*v1; + + dvdlambda = (cpB - cpA)*v1 + cp*dph0*sdphi; + + *V = v; + *F = ddphi; + + return dvdlambda; + + /* That was 40 flops */ +} + +static void +dopdihs_noener(real cpA, real cpB, real phiA, real phiB, int mult, + real phi, real lambda, real *F) +{ + real mdphi, sdphi, ddphi; + real L1 = 1.0 - lambda; + real ph0 = (L1*phiA + lambda*phiB)*DEG2RAD; + real cp = L1*cpA + lambda*cpB; + + mdphi = mult*phi - ph0; + sdphi = sin(mdphi); + ddphi = -cp*mult*sdphi; + + *F = ddphi; + + /* That was 20 flops */ +} + +static void +dopdihs_mdphi(real cpA, real cpB, real phiA, real phiB, int mult, + real phi, real lambda, real *cp, real *mdphi) +{ + real L1 = 1.0 - lambda; + real ph0 = (L1*phiA + lambda*phiB)*DEG2RAD; + + *cp = L1*cpA + lambda*cpB; + + *mdphi = mult*phi - ph0; +} + +static real dopdihs_min(real cpA, real cpB, real phiA, real phiB, int mult, + real phi, real lambda, real *V, real *F) +/* similar to dopdihs, except for a minus sign * + * and a different treatment of mult/phi0 */ +{ + real v, dvdlambda, mdphi, v1, sdphi, ddphi; + real L1 = 1.0 - lambda; + real ph0 = (L1*phiA + lambda*phiB)*DEG2RAD; + real dph0 = (phiB - phiA)*DEG2RAD; + real cp = L1*cpA + lambda*cpB; + + mdphi = mult*(phi-ph0); + sdphi = sin(mdphi); + ddphi = cp*mult*sdphi; + v1 = 1.0-cos(mdphi); + v = cp*v1; + + dvdlambda = (cpB-cpA)*v1 + cp*dph0*sdphi; + + *V = v; + *F = ddphi; + + return dvdlambda; + + /* That was 40 flops */ +} + +real pdihs(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, type, ai, aj, ak, al; + int t1, t2, t3; + rvec r_ij, r_kj, r_kl, m, n; + real phi, sign, ddphi, vpd, vtot; + + vtot = 0.0; + + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + al = forceatoms[i++]; + + phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n, + &sign, &t1, &t2, &t3); /* 84 */ + *dvdlambda += dopdihs(forceparams[type].pdihs.cpA, + forceparams[type].pdihs.cpB, + forceparams[type].pdihs.phiA, + forceparams[type].pdihs.phiB, + forceparams[type].pdihs.mult, + phi, lambda, &vpd, &ddphi); + + vtot += 
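/*
 * Editorial sketch (not part of this patch): the proper-dihedral term that
 * dopdihs()/pdihs() above evaluate, with the same linear lambda mixing of the
 * A and B topology parameters.  The function name is hypothetical and the
 * reference angles are assumed to be in radians already (dopdihs() converts
 * from degrees with DEG2RAD).
 */
#include <math.h>

/* V(phi) = k*[1 + cos(n*phi - phi0)]; returns V and stores dV/dphi,
 * which is what pdihs() hands on to do_dih_fup(). */
double proper_dihedral(double kA, double kB, double phi0A, double phi0B,
                       int n, double phi, double lambda, double *dvdphi)
{
    double L1    = 1.0 - lambda;
    double k     = L1*kA    + lambda*kB;
    double phi0  = L1*phi0A + lambda*phi0B;
    double mdphi = n*phi - phi0;

    *dvdphi = -k*n*sin(mdphi);
    return k*(1.0 + cos(mdphi));
}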
vpd; + do_dih_fup(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, + f, fshift, pbc, g, x, t1, t2, t3); /* 112 */ + +#ifdef DEBUG + fprintf(debug, "pdih: (%d,%d,%d,%d) phi=%g\n", + ai, aj, ak, al, phi); +#endif + } /* 223 TOTAL */ + + return vtot; +} + +void make_dp_periodic(real *dp) /* 1 flop? */ +{ + /* dp cannot be outside (-pi,pi) */ + if (*dp >= M_PI) + { + *dp -= 2*M_PI; + } + else if (*dp < -M_PI) + { + *dp += 2*M_PI; + } + return; +} + +/* As pdihs above, but without calculating energies and shift forces */ +static void +pdihs_noener(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], + const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g, + real lambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, type, ai, aj, ak, al; + int t1, t2, t3; + rvec r_ij, r_kj, r_kl, m, n; + real phi, sign, ddphi_tot, ddphi; + + for (i = 0; (i < nbonds); ) + { + ai = forceatoms[i+1]; + aj = forceatoms[i+2]; + ak = forceatoms[i+3]; + al = forceatoms[i+4]; + + phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n, + &sign, &t1, &t2, &t3); + + ddphi_tot = 0; + + /* Loop over dihedrals working on the same atoms, + * so we avoid recalculating angles and force distributions. + */ + do + { + type = forceatoms[i]; + dopdihs_noener(forceparams[type].pdihs.cpA, + forceparams[type].pdihs.cpB, + forceparams[type].pdihs.phiA, + forceparams[type].pdihs.phiB, + forceparams[type].pdihs.mult, + phi, lambda, &ddphi); + ddphi_tot += ddphi; + + i += 5; + } + while (i < nbonds && + forceatoms[i+1] == ai && + forceatoms[i+2] == aj && + forceatoms[i+3] == ak && + forceatoms[i+4] == al); + + do_dih_fup_noshiftf(ai, aj, ak, al, ddphi_tot, r_ij, r_kj, r_kl, m, n, f); + } +} + + +#ifdef SIMD_BONDEDS + +/* As pdihs_noner above, but using SIMD to calculate many dihedrals at once */ +static void +pdihs_noener_simd(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], + const t_pbc *pbc, const t_graph gmx_unused *g, + real gmx_unused lambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ +#define UNROLL GMX_SIMD_WIDTH_HERE + const int nfa1 = 5; + int i, iu, s; + int type, ai[UNROLL], aj[UNROLL], ak[UNROLL], al[UNROLL]; + int t1[UNROLL], t2[UNROLL], t3[UNROLL]; + real ddphi; + real dr_array[3*DIM*UNROLL+UNROLL], *dr; + real buf_array[7*UNROLL+UNROLL], *buf; + real *cp, *phi0, *mult, *phi, *p, *q, *sf_i, *msf_l; + gmx_mm_pr phi0_S, phi_S; + gmx_mm_pr mx_S, my_S, mz_S; + gmx_mm_pr nx_S, ny_S, nz_S; + gmx_mm_pr nrkj_m2_S, nrkj_n2_S; + gmx_mm_pr cp_S, mdphi_S, mult_S; + gmx_mm_pr sin_S, cos_S; + gmx_mm_pr mddphi_S; + gmx_mm_pr sf_i_S, msf_l_S; + pbc_simd_t pbc_simd; + + /* Ensure SIMD register alignment */ + dr = gmx_simd_align_real(dr_array); + buf = gmx_simd_align_real(buf_array); + + /* Extract aligned pointer for parameters and variables */ + cp = buf + 0*UNROLL; + phi0 = buf + 1*UNROLL; + mult = buf + 2*UNROLL; + p = buf + 3*UNROLL; + q = buf + 4*UNROLL; + sf_i = buf + 5*UNROLL; + msf_l = buf + 6*UNROLL; + + set_pbc_simd(pbc, &pbc_simd); + + /* nbonds is the number of dihedrals times nfa1, here we step UNROLL dihs */ + for (i = 0; (i < nbonds); i += UNROLL*nfa1) + { + /* Collect atoms quadruplets for UNROLL dihedrals. + * iu indexes into forceatoms, we should not let iu go beyond nbonds. 
+ */ + iu = i; + for (s = 0; s < UNROLL; s++) + { + type = forceatoms[iu]; + ai[s] = forceatoms[iu+1]; + aj[s] = forceatoms[iu+2]; + ak[s] = forceatoms[iu+3]; + al[s] = forceatoms[iu+4]; + + cp[s] = forceparams[type].pdihs.cpA; + phi0[s] = forceparams[type].pdihs.phiA*DEG2RAD; + mult[s] = forceparams[type].pdihs.mult; + + /* At the end fill the arrays with identical entries */ + if (iu + nfa1 < nbonds) + { + iu += nfa1; + } + } + + /* Caclulate UNROLL dihedral angles at once */ + dih_angle_simd(x, ai, aj, ak, al, &pbc_simd, + dr, + &phi_S, + &mx_S, &my_S, &mz_S, + &nx_S, &ny_S, &nz_S, + &nrkj_m2_S, + &nrkj_n2_S, + p, q); + + cp_S = gmx_load_pr(cp); + phi0_S = gmx_load_pr(phi0); + mult_S = gmx_load_pr(mult); + + mdphi_S = gmx_sub_pr(gmx_mul_pr(mult_S, phi_S), phi0_S); + + /* Calculate UNROLL sines at once */ + gmx_sincos_pr(mdphi_S, &sin_S, &cos_S); + mddphi_S = gmx_mul_pr(gmx_mul_pr(cp_S, mult_S), sin_S); + sf_i_S = gmx_mul_pr(mddphi_S, nrkj_m2_S); + msf_l_S = gmx_mul_pr(mddphi_S, nrkj_n2_S); + + /* After this m?_S will contain f[i] */ + mx_S = gmx_mul_pr(sf_i_S, mx_S); + my_S = gmx_mul_pr(sf_i_S, my_S); + mz_S = gmx_mul_pr(sf_i_S, mz_S); + + /* After this m?_S will contain -f[l] */ + nx_S = gmx_mul_pr(msf_l_S, nx_S); + ny_S = gmx_mul_pr(msf_l_S, ny_S); + nz_S = gmx_mul_pr(msf_l_S, nz_S); + + gmx_store_pr(dr + 0*UNROLL, mx_S); + gmx_store_pr(dr + 1*UNROLL, my_S); + gmx_store_pr(dr + 2*UNROLL, mz_S); + gmx_store_pr(dr + 3*UNROLL, nx_S); + gmx_store_pr(dr + 4*UNROLL, ny_S); + gmx_store_pr(dr + 5*UNROLL, nz_S); + + iu = i; + s = 0; + do + { + do_dih_fup_noshiftf_precalc(ai[s], aj[s], ak[s], al[s], + p[s], q[s], + dr[ XX *UNROLL+s], + dr[ YY *UNROLL+s], + dr[ ZZ *UNROLL+s], + dr[(DIM+XX)*UNROLL+s], + dr[(DIM+YY)*UNROLL+s], + dr[(DIM+ZZ)*UNROLL+s], + f); + s++; + iu += nfa1; + } + while (s < UNROLL && iu < nbonds); + } +#undef UNROLL +} + +#endif /* SIMD_BONDEDS */ + + +real idihs(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, type, ai, aj, ak, al; + int t1, t2, t3; + real phi, phi0, dphi0, ddphi, sign, vtot; + rvec r_ij, r_kj, r_kl, m, n; + real L1, kk, dp, dp2, kA, kB, pA, pB, dvdl_term; + + L1 = 1.0-lambda; + dvdl_term = 0; + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + al = forceatoms[i++]; + + phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n, + &sign, &t1, &t2, &t3); /* 84 */ + + /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge + * force changes if we just apply a normal harmonic. + * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi). + * This means we will never have the periodicity problem, unless + * the dihedral is Pi away from phiO, which is very unlikely due to + * the potential. 
+ */ + kA = forceparams[type].harmonic.krA; + kB = forceparams[type].harmonic.krB; + pA = forceparams[type].harmonic.rA; + pB = forceparams[type].harmonic.rB; + + kk = L1*kA + lambda*kB; + phi0 = (L1*pA + lambda*pB)*DEG2RAD; + dphi0 = (pB - pA)*DEG2RAD; + + dp = phi-phi0; + + make_dp_periodic(&dp); + + dp2 = dp*dp; + + vtot += 0.5*kk*dp2; + ddphi = -kk*dp; + + dvdl_term += 0.5*(kB - kA)*dp2 - kk*dphi0*dp; + + do_dih_fup(ai, aj, ak, al, (real)(-ddphi), r_ij, r_kj, r_kl, m, n, + f, fshift, pbc, g, x, t1, t2, t3); /* 112 */ + /* 218 TOTAL */ +#ifdef DEBUG + if (debug) + { + fprintf(debug, "idih: (%d,%d,%d,%d) phi=%g\n", + ai, aj, ak, al, phi); + } +#endif + } + + *dvdlambda += dvdl_term; + return vtot; +} + + +/*! \brief returns dx, rdist, and dpdl for functions posres() and fbposres() + */ +static void posres_dx(const rvec x, const rvec pos0A, const rvec pos0B, + const rvec comA_sc, const rvec comB_sc, + real lambda, + t_pbc *pbc, int refcoord_scaling, int npbcdim, + rvec dx, rvec rdist, rvec dpdl) +{ + int m, d; + real posA, posB, L1, ref = 0.; + rvec pos; + + L1 = 1.0-lambda; + + for (m = 0; m < DIM; m++) + { + posA = pos0A[m]; + posB = pos0B[m]; + if (m < npbcdim) + { + switch (refcoord_scaling) + { + case erscNO: + ref = 0; + rdist[m] = L1*posA + lambda*posB; + dpdl[m] = posB - posA; + break; + case erscALL: + /* Box relative coordinates are stored for dimensions with pbc */ + posA *= pbc->box[m][m]; + posB *= pbc->box[m][m]; + for (d = m+1; d < npbcdim; d++) + { + posA += pos0A[d]*pbc->box[d][m]; + posB += pos0B[d]*pbc->box[d][m]; + } + ref = L1*posA + lambda*posB; + rdist[m] = 0; + dpdl[m] = posB - posA; + break; + case erscCOM: + ref = L1*comA_sc[m] + lambda*comB_sc[m]; + rdist[m] = L1*posA + lambda*posB; + dpdl[m] = comB_sc[m] - comA_sc[m] + posB - posA; + break; + default: + gmx_fatal(FARGS, "No such scaling method implemented"); + } + } + else + { + ref = L1*posA + lambda*posB; + rdist[m] = 0; + dpdl[m] = posB - posA; + } + + /* We do pbc_dx with ref+rdist, + * since with only ref we can be up to half a box vector wrong. + */ + pos[m] = ref + rdist[m]; + } + + if (pbc) + { + pbc_dx(pbc, x, pos, dx); + } + else + { + rvec_sub(x, pos, dx); + } +} + +/*! \brief Adds forces of flat-bottomed positions restraints to f[] + * and fixes vir_diag. Returns the flat-bottomed potential. 
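/*
 * Editorial sketch (not part of this patch): the harmonic improper term used
 * by idihs() above.  A single conditional wrap is enough because phi and phi0
 * both lie in (-pi, pi], so their difference is always inside (-2*pi, 2*pi).
 * The function name is hypothetical; lambda mixing is omitted for brevity.
 */
#include <math.h>

double improper_dihedral(double k, double phi0, double phi, double *dvdphi)
{
    double dp = phi - phi0;

    /* same wrap as make_dp_periodic(): bring dp into [-pi, pi) */
    if (dp >= M_PI)
    {
        dp -= 2*M_PI;
    }
    else if (dp < -M_PI)
    {
        dp += 2*M_PI;
    }

    *dvdphi = k*dp;        /* dV/dphi; idihs() passes this to do_dih_fup() */
    return 0.5*k*dp*dp;
}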
*/ +real fbposres(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec vir_diag, + t_pbc *pbc, + int refcoord_scaling, int ePBC, rvec com) +/* compute flat-bottomed positions restraints */ +{ + int i, ai, m, d, type, npbcdim = 0, fbdim; + const t_iparams *pr; + real vtot, kk, v; + real ref = 0, dr, dr2, rpot, rfb, rfb2, fact, invdr; + rvec com_sc, rdist, pos, dx, dpdl, fm; + gmx_bool bInvert; + + npbcdim = ePBC2npbcdim(ePBC); + + if (refcoord_scaling == erscCOM) + { + clear_rvec(com_sc); + for (m = 0; m < npbcdim; m++) + { + for (d = m; d < npbcdim; d++) + { + com_sc[m] += com[d]*pbc->box[d][m]; + } + } + } + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + pr = &forceparams[type]; + + /* same calculation as for normal posres, but with identical A and B states, and lambda==0 */ + posres_dx(x[ai], forceparams[type].fbposres.pos0, forceparams[type].fbposres.pos0, + com_sc, com_sc, 0.0, + pbc, refcoord_scaling, npbcdim, + dx, rdist, dpdl); + + clear_rvec(fm); + v = 0.0; + + kk = pr->fbposres.k; + rfb = pr->fbposres.r; + rfb2 = sqr(rfb); + + /* with rfb<0, push particle out of the sphere/cylinder/layer */ + bInvert = FALSE; + if (rfb < 0.) + { + bInvert = TRUE; + rfb = -rfb; + } + + switch (pr->fbposres.geom) + { + case efbposresSPHERE: + /* spherical flat-bottom posres */ + dr2 = norm2(dx); + if (dr2 > 0.0 && + ( (dr2 > rfb2 && bInvert == FALSE ) || (dr2 < rfb2 && bInvert == TRUE ) ) + ) + { + dr = sqrt(dr2); + v = 0.5*kk*sqr(dr - rfb); + fact = -kk*(dr-rfb)/dr; /* Force pointing to the center pos0 */ + svmul(fact, dx, fm); + } + break; + case efbposresCYLINDER: + /* cylidrical flat-bottom posres in x-y plane. fm[ZZ] = 0. */ + dr2 = sqr(dx[XX])+sqr(dx[YY]); + if (dr2 > 0.0 && + ( (dr2 > rfb2 && bInvert == FALSE ) || (dr2 < rfb2 && bInvert == TRUE ) ) + ) + { + dr = sqrt(dr2); + invdr = 1./dr; + v = 0.5*kk*sqr(dr - rfb); + fm[XX] = -kk*(dr-rfb)*dx[XX]*invdr; /* Force pointing to the center */ + fm[YY] = -kk*(dr-rfb)*dx[YY]*invdr; + } + break; + case efbposresX: /* fbdim=XX */ + case efbposresY: /* fbdim=YY */ + case efbposresZ: /* fbdim=ZZ */ + /* 1D flat-bottom potential */ + fbdim = pr->fbposres.geom - efbposresX; + dr = dx[fbdim]; + if ( ( dr > rfb && bInvert == FALSE ) || ( 0 < dr && dr < rfb && bInvert == TRUE ) ) + { + v = 0.5*kk*sqr(dr - rfb); + fm[fbdim] = -kk*(dr - rfb); + } + else if ( (dr < (-rfb) && bInvert == FALSE ) || ( (-rfb) < dr && dr < 0 && bInvert == TRUE )) + { + v = 0.5*kk*sqr(dr + rfb); + fm[fbdim] = -kk*(dr + rfb); + } + break; + } + + vtot += v; + + for (m = 0; (m < DIM); m++) + { + f[ai][m] += fm[m]; + /* Here we correct for the pbc_dx which included rdist */ + vir_diag[m] -= 0.5*(dx[m] + rdist[m])*fm[m]; + } + } + + return vtot; +} + + +real posres(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec vir_diag, + t_pbc *pbc, + real lambda, real *dvdlambda, + int refcoord_scaling, int ePBC, rvec comA, rvec comB) +{ + int i, ai, m, d, type, ki, npbcdim = 0; + const t_iparams *pr; + real L1; + real vtot, kk, fm; + real posA, posB, ref = 0; + rvec comA_sc, comB_sc, rdist, dpdl, pos, dx; + gmx_bool bForceValid = TRUE; + + if ((f == NULL) || (vir_diag == NULL)) /* should both be null together! 
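/*
 * Editorial sketch (not part of this patch): the spherical branch
 * (efbposresSPHERE) of fbposres() above.  Outside a sphere of radius rfb
 * around the reference point the potential is harmonic in (|dx| - rfb);
 * inside it is flat.  The inverted (rfb < 0), cylinder and 1D variants follow
 * the same pattern.  Names are hypothetical.
 */
#include <math.h>

/* dx is the (PBC-corrected) vector from the reference position to the atom.
 * Returns the energy and stores the force on the atom in f[3]. */
double fb_sphere(const double dx[3], double k, double rfb, double f[3])
{
    double dr2 = dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2];
    double dr, fact;
    int    d;

    f[0] = f[1] = f[2] = 0.0;
    if (dr2 == 0.0 || dr2 <= rfb*rfb)
    {
        return 0.0;              /* inside the flat bottom: no force */
    }
    dr   = sqrt(dr2);
    fact = -k*(dr - rfb)/dr;     /* force points back towards the reference */
    for (d = 0; d < 3; d++)
    {
        f[d] = fact*dx[d];
    }
    return 0.5*k*(dr - rfb)*(dr - rfb);
}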
*/ + { + bForceValid = FALSE; + } + + npbcdim = ePBC2npbcdim(ePBC); + + if (refcoord_scaling == erscCOM) + { + clear_rvec(comA_sc); + clear_rvec(comB_sc); + for (m = 0; m < npbcdim; m++) + { + for (d = m; d < npbcdim; d++) + { + comA_sc[m] += comA[d]*pbc->box[d][m]; + comB_sc[m] += comB[d]*pbc->box[d][m]; + } + } + } + + L1 = 1.0 - lambda; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + pr = &forceparams[type]; + + /* return dx, rdist, and dpdl */ + posres_dx(x[ai], forceparams[type].posres.pos0A, forceparams[type].posres.pos0B, + comA_sc, comB_sc, lambda, + pbc, refcoord_scaling, npbcdim, + dx, rdist, dpdl); + + for (m = 0; (m < DIM); m++) + { + kk = L1*pr->posres.fcA[m] + lambda*pr->posres.fcB[m]; + fm = -kk*dx[m]; + vtot += 0.5*kk*dx[m]*dx[m]; + *dvdlambda += + 0.5*(pr->posres.fcB[m] - pr->posres.fcA[m])*dx[m]*dx[m] + -fm*dpdl[m]; + + /* Here we correct for the pbc_dx which included rdist */ + if (bForceValid) + { + f[ai][m] += fm; + vir_diag[m] -= 0.5*(dx[m] + rdist[m])*fm; + } + } + } + + return vtot; +} + +static real low_angres(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + gmx_bool bZAxis) +{ + int i, m, type, ai, aj, ak, al; + int t1, t2; + real phi, cos_phi, cos_phi2, vid, vtot, dVdphi; + rvec r_ij, r_kl, f_i, f_k = {0, 0, 0}; + real st, sth, nrij2, nrkl2, c, cij, ckl; + + ivec dt; + t2 = 0; /* avoid warning with gcc-3.3. It is never used uninitialized */ + + vtot = 0.0; + ak = al = 0; /* to avoid warnings */ + for (i = 0; i < nbonds; ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + t1 = pbc_rvec_sub(pbc, x[aj], x[ai], r_ij); /* 3 */ + if (!bZAxis) + { + ak = forceatoms[i++]; + al = forceatoms[i++]; + t2 = pbc_rvec_sub(pbc, x[al], x[ak], r_kl); /* 3 */ + } + else + { + r_kl[XX] = 0; + r_kl[YY] = 0; + r_kl[ZZ] = 1; + } + + cos_phi = cos_angle(r_ij, r_kl); /* 25 */ + phi = acos(cos_phi); /* 10 */ + + *dvdlambda += dopdihs_min(forceparams[type].pdihs.cpA, + forceparams[type].pdihs.cpB, + forceparams[type].pdihs.phiA, + forceparams[type].pdihs.phiB, + forceparams[type].pdihs.mult, + phi, lambda, &vid, &dVdphi); /* 40 */ + + vtot += vid; + + cos_phi2 = sqr(cos_phi); /* 1 */ + if (cos_phi2 < 1) + { + st = -dVdphi*gmx_invsqrt(1 - cos_phi2); /* 12 */ + sth = st*cos_phi; /* 1 */ + nrij2 = iprod(r_ij, r_ij); /* 5 */ + nrkl2 = iprod(r_kl, r_kl); /* 5 */ + + c = st*gmx_invsqrt(nrij2*nrkl2); /* 11 */ + cij = sth/nrij2; /* 10 */ + ckl = sth/nrkl2; /* 10 */ + + for (m = 0; m < DIM; m++) /* 18+18 */ + { + f_i[m] = (c*r_kl[m]-cij*r_ij[m]); + f[ai][m] += f_i[m]; + f[aj][m] -= f_i[m]; + if (!bZAxis) + { + f_k[m] = (c*r_ij[m]-ckl*r_kl[m]); + f[ak][m] += f_k[m]; + f[al][m] -= f_k[m]; + } + } + + if (g) + { + ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt); + t1 = IVEC2IS(dt); + } + rvec_inc(fshift[t1], f_i); + rvec_dec(fshift[CENTRAL], f_i); + if (!bZAxis) + { + if (g) + { + ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, al), dt); + t2 = IVEC2IS(dt); + } + rvec_inc(fshift[t2], f_k); + rvec_dec(fshift[CENTRAL], f_k); + } + } + } + + return vtot; /* 184 / 157 (bZAxis) total */ +} + +real angres(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + 
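/*
 * Editorial sketch (not part of this patch): the per-dimension arithmetic of
 * posres() above, including the dV/dlambda accumulation.  dx is one component
 * of the displacement from the lambda-interpolated reference position and
 * dpdl its lambda derivative (simply pos0B - pos0A for refcoord_scaling ==
 * erscNO).  The function name is hypothetical.
 */
double posres_dim(double kA, double kB, double dx, double dpdl,
                  double lambda, double *f, double *dvdl)
{
    double L1 = 1.0 - lambda;
    double k  = L1*kA + lambda*kB;

    *f     = -k*dx;
    *dvdl += 0.5*(kB - kA)*dx*dx - (*f)*dpdl;
    return 0.5*k*dx*dx;
}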
return low_angres(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g, + lambda, dvdlambda, FALSE); +} + +real angresz(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + return low_angres(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g, + lambda, dvdlambda, TRUE); +} + +real dihres(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + real vtot = 0; + int ai, aj, ak, al, i, k, type, t1, t2, t3; + real phi0A, phi0B, dphiA, dphiB, kfacA, kfacB, phi0, dphi, kfac; + real phi, ddphi, ddp, ddp2, dp, sign, d2r, fc, L1; + rvec r_ij, r_kj, r_kl, m, n; + + L1 = 1.0-lambda; + + d2r = DEG2RAD; + k = 0; + + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + al = forceatoms[i++]; + + phi0A = forceparams[type].dihres.phiA*d2r; + dphiA = forceparams[type].dihres.dphiA*d2r; + kfacA = forceparams[type].dihres.kfacA; + + phi0B = forceparams[type].dihres.phiB*d2r; + dphiB = forceparams[type].dihres.dphiB*d2r; + kfacB = forceparams[type].dihres.kfacB; + + phi0 = L1*phi0A + lambda*phi0B; + dphi = L1*dphiA + lambda*dphiB; + kfac = L1*kfacA + lambda*kfacB; + + phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n, + &sign, &t1, &t2, &t3); + /* 84 flops */ + + if (debug) + { + fprintf(debug, "dihres[%d]: %d %d %d %d : phi=%f, dphi=%f, kfac=%f\n", + k++, ai, aj, ak, al, phi0, dphi, kfac); + } + /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge + * force changes if we just apply a normal harmonic. + * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi). + * This means we will never have the periodicity problem, unless + * the dihedral is Pi away from phiO, which is very unlikely due to + * the potential. 
+ */ + dp = phi-phi0; + make_dp_periodic(&dp); + + if (dp > dphi) + { + ddp = dp-dphi; + } + else if (dp < -dphi) + { + ddp = dp+dphi; + } + else + { + ddp = 0; + } + + if (ddp != 0.0) + { + ddp2 = ddp*ddp; + vtot += 0.5*kfac*ddp2; + ddphi = kfac*ddp; + + *dvdlambda += 0.5*(kfacB - kfacA)*ddp2; + /* lambda dependence from changing restraint distances */ + if (ddp > 0) + { + *dvdlambda -= kfac*ddp*((dphiB - dphiA)+(phi0B - phi0A)); + } + else if (ddp < 0) + { + *dvdlambda += kfac*ddp*((dphiB - dphiA)-(phi0B - phi0A)); + } + do_dih_fup(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, + f, fshift, pbc, g, x, t1, t2, t3); /* 112 */ + } + } + return vtot; +} + + +real unimplemented(int gmx_unused nbonds, + const t_iatom gmx_unused forceatoms[], const t_iparams gmx_unused forceparams[], + const rvec gmx_unused x[], rvec gmx_unused f[], rvec gmx_unused fshift[], + const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g, + real gmx_unused lambda, real gmx_unused *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + gmx_impl("*** you are using a not implemented function"); + + return 0.0; /* To make the compiler happy */ +} + +real rbdihs(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + const real c0 = 0.0, c1 = 1.0, c2 = 2.0, c3 = 3.0, c4 = 4.0, c5 = 5.0; + int type, ai, aj, ak, al, i, j; + int t1, t2, t3; + rvec r_ij, r_kj, r_kl, m, n; + real parmA[NR_RBDIHS]; + real parmB[NR_RBDIHS]; + real parm[NR_RBDIHS]; + real cos_phi, phi, rbp, rbpBA; + real v, sign, ddphi, sin_phi; + real cosfac, vtot; + real L1 = 1.0-lambda; + real dvdl_term = 0; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + al = forceatoms[i++]; + + phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n, + &sign, &t1, &t2, &t3); /* 84 */ + + /* Change to polymer convention */ + if (phi < c0) + { + phi += M_PI; + } + else + { + phi -= M_PI; /* 1 */ + + } + cos_phi = cos(phi); + /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! 
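/*
 * Editorial sketch (not part of this patch): the flat-bottomed dihedral
 * restraint evaluated by dihres() above.  Within +/- dphi of the reference
 * angle the restraint is flat; outside it is harmonic in the excess angle.
 * The lambda-dependent terms are omitted here; the name is hypothetical.
 */
#include <math.h>

double dihres_energy(double k, double phi0, double dphi, double phi,
                     double *dvdphi)
{
    double dp = phi - phi0;
    double ddp;

    /* wrap dp into [-pi, pi), as make_dp_periodic() does */
    if (dp >= M_PI)
    {
        dp -= 2*M_PI;
    }
    else if (dp < -M_PI)
    {
        dp += 2*M_PI;
    }

    if (dp > dphi)
    {
        ddp = dp - dphi;
    }
    else if (dp < -dphi)
    {
        ddp = dp + dphi;
    }
    else
    {
        ddp = 0.0;         /* inside the flat region: no force */
    }

    *dvdphi = k*ddp;
    return 0.5*k*ddp*ddp;
}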
*/ + sin_phi = sin(phi); + + for (j = 0; (j < NR_RBDIHS); j++) + { + parmA[j] = forceparams[type].rbdihs.rbcA[j]; + parmB[j] = forceparams[type].rbdihs.rbcB[j]; + parm[j] = L1*parmA[j]+lambda*parmB[j]; + } + /* Calculate cosine powers */ + /* Calculate the energy */ + /* Calculate the derivative */ + + v = parm[0]; + dvdl_term += (parmB[0]-parmA[0]); + ddphi = c0; + cosfac = c1; + + rbp = parm[1]; + rbpBA = parmB[1]-parmA[1]; + ddphi += rbp*cosfac; + cosfac *= cos_phi; + v += cosfac*rbp; + dvdl_term += cosfac*rbpBA; + rbp = parm[2]; + rbpBA = parmB[2]-parmA[2]; + ddphi += c2*rbp*cosfac; + cosfac *= cos_phi; + v += cosfac*rbp; + dvdl_term += cosfac*rbpBA; + rbp = parm[3]; + rbpBA = parmB[3]-parmA[3]; + ddphi += c3*rbp*cosfac; + cosfac *= cos_phi; + v += cosfac*rbp; + dvdl_term += cosfac*rbpBA; + rbp = parm[4]; + rbpBA = parmB[4]-parmA[4]; + ddphi += c4*rbp*cosfac; + cosfac *= cos_phi; + v += cosfac*rbp; + dvdl_term += cosfac*rbpBA; + rbp = parm[5]; + rbpBA = parmB[5]-parmA[5]; + ddphi += c5*rbp*cosfac; + cosfac *= cos_phi; + v += cosfac*rbp; + dvdl_term += cosfac*rbpBA; + + ddphi = -ddphi*sin_phi; /* 11 */ + + do_dih_fup(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, + f, fshift, pbc, g, x, t1, t2, t3); /* 112 */ + vtot += v; + } + *dvdlambda += dvdl_term; + + return vtot; +} + +int cmap_setup_grid_index(int ip, int grid_spacing, int *ipm1, int *ipp1, int *ipp2) +{ + int im1, ip1, ip2; + + if (ip < 0) + { + ip = ip + grid_spacing - 1; + } + else if (ip > grid_spacing) + { + ip = ip - grid_spacing - 1; + } + + im1 = ip - 1; + ip1 = ip + 1; + ip2 = ip + 2; + + if (ip == 0) + { + im1 = grid_spacing - 1; + } + else if (ip == grid_spacing-2) + { + ip2 = 0; + } + else if (ip == grid_spacing-1) + { + ip1 = 0; + ip2 = 1; + } + + *ipm1 = im1; + *ipp1 = ip1; + *ipp2 = ip2; + + return ip; + +} + +real cmap_dihs(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const gmx_cmap_t *cmap_grid, + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real gmx_unused lambda, real gmx_unused *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, j, k, n, idx; + int ai, aj, ak, al, am; + int a1i, a1j, a1k, a1l, a2i, a2j, a2k, a2l; + int type, cmapA; + int t11, t21, t31, t12, t22, t32; + int iphi1, ip1m1, ip1p1, ip1p2; + int iphi2, ip2m1, ip2p1, ip2p2; + int l1, l2, l3, l4; + int pos1, pos2, pos3, pos4, tmp; + + real ty[4], ty1[4], ty2[4], ty12[4], tc[16], tx[16]; + real phi1, psi1, cos_phi1, sin_phi1, sign1, xphi1; + real phi2, psi2, cos_phi2, sin_phi2, sign2, xphi2; + real dx, xx, tt, tu, e, df1, df2, ddf1, ddf2, ddf12, vtot; + real ra21, rb21, rg21, rg1, rgr1, ra2r1, rb2r1, rabr1; + real ra22, rb22, rg22, rg2, rgr2, ra2r2, rb2r2, rabr2; + real fg1, hg1, fga1, hgb1, gaa1, gbb1; + real fg2, hg2, fga2, hgb2, gaa2, gbb2; + real fac; + + rvec r1_ij, r1_kj, r1_kl, m1, n1; + rvec r2_ij, r2_kj, r2_kl, m2, n2; + rvec f1_i, f1_j, f1_k, f1_l; + rvec f2_i, f2_j, f2_k, f2_l; + rvec a1, b1, a2, b2; + rvec f1, g1, h1, f2, g2, h2; + rvec dtf1, dtg1, dth1, dtf2, dtg2, dth2; + ivec jt1, dt1_ij, dt1_kj, dt1_lj; + ivec jt2, dt2_ij, dt2_kj, dt2_lj; + + const real *cmapd; + + int loop_index[4][4] = { + {0, 4, 8, 12}, + {1, 5, 9, 13}, + {2, 6, 10, 14}, + {3, 7, 11, 15} + }; + + /* Total CMAP energy */ + vtot = 0; + + for (n = 0; n < nbonds; ) + { + /* Five atoms are involved in the two torsions */ + type = forceatoms[n++]; + ai = forceatoms[n++]; + aj = forceatoms[n++]; + ak = forceatoms[n++]; + al = 
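/*
 * Editorial sketch (not part of this patch): the Ryckaert-Bellemans sum that
 * rbdihs() above evaluates with an unrolled loop.  With psi = phi - pi
 * (polymer convention), V = sum_n C[n]*cos(psi)^n and
 * dV/dphi = -sin(psi) * sum_n n*C[n]*cos(psi)^(n-1).  Name is hypothetical.
 */
#include <math.h>

double rb_dihedral(const double C[6], double phi, double *dvdphi)
{
    double psi    = phi - M_PI;   /* change to polymer convention */
    double cospsi = cos(psi);
    double v      = C[0];
    double dsum   = 0.0;          /* accumulates n*C[n]*cos(psi)^(n-1) */
    double cosfac = 1.0;          /* cos(psi)^(n-1) at the top of the loop */
    int    n;

    for (n = 1; n < 6; n++)
    {
        dsum   += n*C[n]*cosfac;
        cosfac *= cospsi;         /* now cos(psi)^n */
        v      += C[n]*cosfac;
    }

    *dvdphi = -sin(psi)*dsum;
    return v;
}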
forceatoms[n++]; + am = forceatoms[n++]; + + /* Which CMAP type is this */ + cmapA = forceparams[type].cmap.cmapA; + cmapd = cmap_grid->cmapdata[cmapA].cmap; + + /* First torsion */ + a1i = ai; + a1j = aj; + a1k = ak; + a1l = al; + + phi1 = dih_angle(x[a1i], x[a1j], x[a1k], x[a1l], pbc, r1_ij, r1_kj, r1_kl, m1, n1, + &sign1, &t11, &t21, &t31); /* 84 */ + + cos_phi1 = cos(phi1); + + a1[0] = r1_ij[1]*r1_kj[2]-r1_ij[2]*r1_kj[1]; + a1[1] = r1_ij[2]*r1_kj[0]-r1_ij[0]*r1_kj[2]; + a1[2] = r1_ij[0]*r1_kj[1]-r1_ij[1]*r1_kj[0]; /* 9 */ + + b1[0] = r1_kl[1]*r1_kj[2]-r1_kl[2]*r1_kj[1]; + b1[1] = r1_kl[2]*r1_kj[0]-r1_kl[0]*r1_kj[2]; + b1[2] = r1_kl[0]*r1_kj[1]-r1_kl[1]*r1_kj[0]; /* 9 */ + + tmp = pbc_rvec_sub(pbc, x[a1l], x[a1k], h1); + + ra21 = iprod(a1, a1); /* 5 */ + rb21 = iprod(b1, b1); /* 5 */ + rg21 = iprod(r1_kj, r1_kj); /* 5 */ + rg1 = sqrt(rg21); + + rgr1 = 1.0/rg1; + ra2r1 = 1.0/ra21; + rb2r1 = 1.0/rb21; + rabr1 = sqrt(ra2r1*rb2r1); + + sin_phi1 = rg1 * rabr1 * iprod(a1, h1) * (-1); + + if (cos_phi1 < -0.5 || cos_phi1 > 0.5) + { + phi1 = asin(sin_phi1); + + if (cos_phi1 < 0) + { + if (phi1 > 0) + { + phi1 = M_PI - phi1; + } + else + { + phi1 = -M_PI - phi1; + } + } + } + else + { + phi1 = acos(cos_phi1); + + if (sin_phi1 < 0) + { + phi1 = -phi1; + } + } + + xphi1 = phi1 + M_PI; /* 1 */ + + /* Second torsion */ + a2i = aj; + a2j = ak; + a2k = al; + a2l = am; + + phi2 = dih_angle(x[a2i], x[a2j], x[a2k], x[a2l], pbc, r2_ij, r2_kj, r2_kl, m2, n2, + &sign2, &t12, &t22, &t32); /* 84 */ + + cos_phi2 = cos(phi2); + + a2[0] = r2_ij[1]*r2_kj[2]-r2_ij[2]*r2_kj[1]; + a2[1] = r2_ij[2]*r2_kj[0]-r2_ij[0]*r2_kj[2]; + a2[2] = r2_ij[0]*r2_kj[1]-r2_ij[1]*r2_kj[0]; /* 9 */ + + b2[0] = r2_kl[1]*r2_kj[2]-r2_kl[2]*r2_kj[1]; + b2[1] = r2_kl[2]*r2_kj[0]-r2_kl[0]*r2_kj[2]; + b2[2] = r2_kl[0]*r2_kj[1]-r2_kl[1]*r2_kj[0]; /* 9 */ + + tmp = pbc_rvec_sub(pbc, x[a2l], x[a2k], h2); + + ra22 = iprod(a2, a2); /* 5 */ + rb22 = iprod(b2, b2); /* 5 */ + rg22 = iprod(r2_kj, r2_kj); /* 5 */ + rg2 = sqrt(rg22); + + rgr2 = 1.0/rg2; + ra2r2 = 1.0/ra22; + rb2r2 = 1.0/rb22; + rabr2 = sqrt(ra2r2*rb2r2); + + sin_phi2 = rg2 * rabr2 * iprod(a2, h2) * (-1); + + if (cos_phi2 < -0.5 || cos_phi2 > 0.5) + { + phi2 = asin(sin_phi2); + + if (cos_phi2 < 0) + { + if (phi2 > 0) + { + phi2 = M_PI - phi2; + } + else + { + phi2 = -M_PI - phi2; + } + } + } + else + { + phi2 = acos(cos_phi2); + + if (sin_phi2 < 0) + { + phi2 = -phi2; + } + } + + xphi2 = phi2 + M_PI; /* 1 */ + + /* Range mangling */ + if (xphi1 < 0) + { + xphi1 = xphi1 + 2*M_PI; + } + else if (xphi1 >= 2*M_PI) + { + xphi1 = xphi1 - 2*M_PI; + } + + if (xphi2 < 0) + { + xphi2 = xphi2 + 2*M_PI; + } + else if (xphi2 >= 2*M_PI) + { + xphi2 = xphi2 - 2*M_PI; + } + + /* Number of grid points */ + dx = 2*M_PI / cmap_grid->grid_spacing; + + /* Where on the grid are we */ + iphi1 = (int)(xphi1/dx); + iphi2 = (int)(xphi2/dx); + + iphi1 = cmap_setup_grid_index(iphi1, cmap_grid->grid_spacing, &ip1m1, &ip1p1, &ip1p2); + iphi2 = cmap_setup_grid_index(iphi2, cmap_grid->grid_spacing, &ip2m1, &ip2p1, &ip2p2); + + pos1 = iphi1*cmap_grid->grid_spacing+iphi2; + pos2 = ip1p1*cmap_grid->grid_spacing+iphi2; + pos3 = ip1p1*cmap_grid->grid_spacing+ip2p1; + pos4 = iphi1*cmap_grid->grid_spacing+ip2p1; + + ty[0] = cmapd[pos1*4]; + ty[1] = cmapd[pos2*4]; + ty[2] = cmapd[pos3*4]; + ty[3] = cmapd[pos4*4]; + + ty1[0] = cmapd[pos1*4+1]; + ty1[1] = cmapd[pos2*4+1]; + ty1[2] = cmapd[pos3*4+1]; + ty1[3] = cmapd[pos4*4+1]; + + ty2[0] = cmapd[pos1*4+2]; + ty2[1] = cmapd[pos2*4+2]; + ty2[2] = cmapd[pos3*4+2]; + ty2[3] = 
cmapd[pos4*4+2]; + + ty12[0] = cmapd[pos1*4+3]; + ty12[1] = cmapd[pos2*4+3]; + ty12[2] = cmapd[pos3*4+3]; + ty12[3] = cmapd[pos4*4+3]; + + /* Switch to degrees */ + dx = 360.0 / cmap_grid->grid_spacing; + xphi1 = xphi1 * RAD2DEG; + xphi2 = xphi2 * RAD2DEG; + + for (i = 0; i < 4; i++) /* 16 */ + { + tx[i] = ty[i]; + tx[i+4] = ty1[i]*dx; + tx[i+8] = ty2[i]*dx; + tx[i+12] = ty12[i]*dx*dx; + } + + idx = 0; + for (i = 0; i < 4; i++) /* 1056 */ + { + for (j = 0; j < 4; j++) + { + xx = 0; + for (k = 0; k < 16; k++) + { + xx = xx + cmap_coeff_matrix[k*16+idx]*tx[k]; + } + + idx++; + tc[i*4+j] = xx; + } + } + + tt = (xphi1-iphi1*dx)/dx; + tu = (xphi2-iphi2*dx)/dx; + + e = 0; + df1 = 0; + df2 = 0; + ddf1 = 0; + ddf2 = 0; + ddf12 = 0; + + for (i = 3; i >= 0; i--) + { + l1 = loop_index[i][3]; + l2 = loop_index[i][2]; + l3 = loop_index[i][1]; + + e = tt * e + ((tc[i*4+3]*tu+tc[i*4+2])*tu + tc[i*4+1])*tu+tc[i*4]; + df1 = tu * df1 + (3.0*tc[l1]*tt+2.0*tc[l2])*tt+tc[l3]; + df2 = tt * df2 + (3.0*tc[i*4+3]*tu+2.0*tc[i*4+2])*tu+tc[i*4+1]; + ddf1 = tu * ddf1 + 2.0*3.0*tc[l1]*tt+2.0*tc[l2]; + ddf2 = tt * ddf2 + 2.0*3.0*tc[4*i+3]*tu+2.0*tc[4*i+2]; + } + + ddf12 = tc[5] + 2.0*tc[9]*tt + 3.0*tc[13]*tt*tt + 2.0*tu*(tc[6]+2.0*tc[10]*tt+3.0*tc[14]*tt*tt) + + 3.0*tu*tu*(tc[7]+2.0*tc[11]*tt+3.0*tc[15]*tt*tt); + + fac = RAD2DEG/dx; + df1 = df1 * fac; + df2 = df2 * fac; + ddf1 = ddf1 * fac * fac; + ddf2 = ddf2 * fac * fac; + ddf12 = ddf12 * fac * fac; + + /* CMAP energy */ + vtot += e; + + /* Do forces - first torsion */ + fg1 = iprod(r1_ij, r1_kj); + hg1 = iprod(r1_kl, r1_kj); + fga1 = fg1*ra2r1*rgr1; + hgb1 = hg1*rb2r1*rgr1; + gaa1 = -ra2r1*rg1; + gbb1 = rb2r1*rg1; + + for (i = 0; i < DIM; i++) + { + dtf1[i] = gaa1 * a1[i]; + dtg1[i] = fga1 * a1[i] - hgb1 * b1[i]; + dth1[i] = gbb1 * b1[i]; + + f1[i] = df1 * dtf1[i]; + g1[i] = df1 * dtg1[i]; + h1[i] = df1 * dth1[i]; + + f1_i[i] = f1[i]; + f1_j[i] = -f1[i] - g1[i]; + f1_k[i] = h1[i] + g1[i]; + f1_l[i] = -h1[i]; + + f[a1i][i] = f[a1i][i] + f1_i[i]; + f[a1j][i] = f[a1j][i] + f1_j[i]; /* - f1[i] - g1[i] */ + f[a1k][i] = f[a1k][i] + f1_k[i]; /* h1[i] + g1[i] */ + f[a1l][i] = f[a1l][i] + f1_l[i]; /* h1[i] */ + } + + /* Do forces - second torsion */ + fg2 = iprod(r2_ij, r2_kj); + hg2 = iprod(r2_kl, r2_kj); + fga2 = fg2*ra2r2*rgr2; + hgb2 = hg2*rb2r2*rgr2; + gaa2 = -ra2r2*rg2; + gbb2 = rb2r2*rg2; + + for (i = 0; i < DIM; i++) + { + dtf2[i] = gaa2 * a2[i]; + dtg2[i] = fga2 * a2[i] - hgb2 * b2[i]; + dth2[i] = gbb2 * b2[i]; + + f2[i] = df2 * dtf2[i]; + g2[i] = df2 * dtg2[i]; + h2[i] = df2 * dth2[i]; + + f2_i[i] = f2[i]; + f2_j[i] = -f2[i] - g2[i]; + f2_k[i] = h2[i] + g2[i]; + f2_l[i] = -h2[i]; + + f[a2i][i] = f[a2i][i] + f2_i[i]; /* f2[i] */ + f[a2j][i] = f[a2j][i] + f2_j[i]; /* - f2[i] - g2[i] */ + f[a2k][i] = f[a2k][i] + f2_k[i]; /* h2[i] + g2[i] */ + f[a2l][i] = f[a2l][i] + f2_l[i]; /* - h2[i] */ + } + + /* Shift forces */ + if (g) + { + copy_ivec(SHIFT_IVEC(g, a1j), jt1); + ivec_sub(SHIFT_IVEC(g, a1i), jt1, dt1_ij); + ivec_sub(SHIFT_IVEC(g, a1k), jt1, dt1_kj); + ivec_sub(SHIFT_IVEC(g, a1l), jt1, dt1_lj); + t11 = IVEC2IS(dt1_ij); + t21 = IVEC2IS(dt1_kj); + t31 = IVEC2IS(dt1_lj); + + copy_ivec(SHIFT_IVEC(g, a2j), jt2); + ivec_sub(SHIFT_IVEC(g, a2i), jt2, dt2_ij); + ivec_sub(SHIFT_IVEC(g, a2k), jt2, dt2_kj); + ivec_sub(SHIFT_IVEC(g, a2l), jt2, dt2_lj); + t12 = IVEC2IS(dt2_ij); + t22 = IVEC2IS(dt2_kj); + t32 = IVEC2IS(dt2_lj); + } + else if (pbc) + { + t31 = pbc_rvec_sub(pbc, x[a1l], x[a1j], h1); + t32 = pbc_rvec_sub(pbc, x[a2l], x[a2j], h2); + } + else + { + t31 = CENTRAL; + t32 
= CENTRAL; + } + + rvec_inc(fshift[t11], f1_i); + rvec_inc(fshift[CENTRAL], f1_j); + rvec_inc(fshift[t21], f1_k); + rvec_inc(fshift[t31], f1_l); + + rvec_inc(fshift[t21], f2_i); + rvec_inc(fshift[CENTRAL], f2_j); + rvec_inc(fshift[t22], f2_k); + rvec_inc(fshift[t32], f2_l); + } + return vtot; +} + + + +/*********************************************************** + * + * G R O M O S 9 6 F U N C T I O N S + * + ***********************************************************/ +real g96harmonic(real kA, real kB, real xA, real xB, real x, real lambda, + real *V, real *F) +{ + const real half = 0.5; + real L1, kk, x0, dx, dx2; + real v, f, dvdlambda; + + L1 = 1.0-lambda; + kk = L1*kA+lambda*kB; + x0 = L1*xA+lambda*xB; + + dx = x-x0; + dx2 = dx*dx; + + f = -kk*dx; + v = half*kk*dx2; + dvdlambda = half*(kB-kA)*dx2 + (xA-xB)*kk*dx; + + *F = f; + *V = v; + + return dvdlambda; + + /* That was 21 flops */ +} + +real g96bonds(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, m, ki, ai, aj, type; + real dr2, fbond, vbond, fij, vtot; + rvec dx; + ivec dt; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + + ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */ + dr2 = iprod(dx, dx); /* 5 */ + + *dvdlambda += g96harmonic(forceparams[type].harmonic.krA, + forceparams[type].harmonic.krB, + forceparams[type].harmonic.rA, + forceparams[type].harmonic.rB, + dr2, lambda, &vbond, &fbond); + + vtot += 0.5*vbond; /* 1*/ +#ifdef DEBUG + if (debug) + { + fprintf(debug, "G96-BONDS: dr = %10g vbond = %10g fbond = %10g\n", + sqrt(dr2), vbond, fbond); + } +#endif + + if (g) + { + ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt); + ki = IVEC2IS(dt); + } + for (m = 0; (m < DIM); m++) /* 15 */ + { + fij = fbond*dx[m]; + f[ai][m] += fij; + f[aj][m] -= fij; + fshift[ki][m] += fij; + fshift[CENTRAL][m] -= fij; + } + } /* 44 TOTAL */ + return vtot; +} + +real g96bond_angle(const rvec xi, const rvec xj, const rvec xk, const t_pbc *pbc, + rvec r_ij, rvec r_kj, + int *t1, int *t2) +/* Return value is the angle between the bonds i-j and j-k */ +{ + real costh; + + *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /* 3 */ + *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /* 3 */ + + costh = cos_angle(r_ij, r_kj); /* 25 */ + /* 41 TOTAL */ + return costh; +} + +real g96angles(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + int i, ai, aj, ak, type, m, t1, t2; + rvec r_ij, r_kj; + real cos_theta, dVdt, va, vtot; + real rij_1, rij_2, rkj_1, rkj_2, rijrkj_1; + rvec f_i, f_j, f_k; + ivec jt, dt_ij, dt_kj; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + + cos_theta = g96bond_angle(x[ai], x[aj], x[ak], pbc, r_ij, r_kj, &t1, &t2); + + *dvdlambda += g96harmonic(forceparams[type].harmonic.krA, + forceparams[type].harmonic.krB, + forceparams[type].harmonic.rA, + forceparams[type].harmonic.rB, + cos_theta, lambda, &va, &dVdt); + vtot += va; + + rij_1 = gmx_invsqrt(iprod(r_ij, r_ij)); + rkj_1 = gmx_invsqrt(iprod(r_kj, 
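/*
 * Editorial sketch (not part of this patch): the GROMOS-96 quartic bond as
 * accumulated by g96bonds()/g96harmonic() above,
 *     V(r) = 1/4 * kb * (r^2 - b0^2)^2,
 * which is why g96bonds() adds only half of the g96harmonic() value (the
 * stored reference is assumed to be the squared bond length b0^2).  The force
 * on atom i along dx = x_i - x_j is f = -kb*(r^2 - b0^2)*dx.
 */
double g96_bond(const double dx[3], double kb, double b0sq, double f[3])
{
    double r2 = dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2];
    double s  = r2 - b0sq;
    int    d;

    for (d = 0; d < 3; d++)
    {
        f[d] = -kb*s*dx[d];
    }
    return 0.25*kb*s*s;
}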
r_kj)); + rij_2 = rij_1*rij_1; + rkj_2 = rkj_1*rkj_1; + rijrkj_1 = rij_1*rkj_1; /* 23 */ + +#ifdef DEBUG + if (debug) + { + fprintf(debug, "G96ANGLES: costheta = %10g vth = %10g dV/dct = %10g\n", + cos_theta, va, dVdt); + } +#endif + for (m = 0; (m < DIM); m++) /* 42 */ + { + f_i[m] = dVdt*(r_kj[m]*rijrkj_1 - r_ij[m]*rij_2*cos_theta); + f_k[m] = dVdt*(r_ij[m]*rijrkj_1 - r_kj[m]*rkj_2*cos_theta); + f_j[m] = -f_i[m]-f_k[m]; + f[ai][m] += f_i[m]; + f[aj][m] += f_j[m]; + f[ak][m] += f_k[m]; + } + + if (g) + { + copy_ivec(SHIFT_IVEC(g, aj), jt); + + ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij); + ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj); + t1 = IVEC2IS(dt_ij); + t2 = IVEC2IS(dt_kj); + } + rvec_inc(fshift[t1], f_i); + rvec_inc(fshift[CENTRAL], f_j); + rvec_inc(fshift[t2], f_k); /* 9 */ + /* 163 TOTAL */ + } + return vtot; +} + +real cross_bond_bond(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real gmx_unused lambda, real gmx_unused *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003) + * pp. 842-847 + */ + int i, ai, aj, ak, type, m, t1, t2; + rvec r_ij, r_kj; + real vtot, vrr, s1, s2, r1, r2, r1e, r2e, krr; + rvec f_i, f_j, f_k; + ivec jt, dt_ij, dt_kj; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + r1e = forceparams[type].cross_bb.r1e; + r2e = forceparams[type].cross_bb.r2e; + krr = forceparams[type].cross_bb.krr; + + /* Compute distance vectors ... */ + t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij); + t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj); + + /* ... and their lengths */ + r1 = norm(r_ij); + r2 = norm(r_kj); + + /* Deviations from ideality */ + s1 = r1-r1e; + s2 = r2-r2e; + + /* Energy (can be negative!) */ + vrr = krr*s1*s2; + vtot += vrr; + + /* Forces */ + svmul(-krr*s2/r1, r_ij, f_i); + svmul(-krr*s1/r2, r_kj, f_k); + + for (m = 0; (m < DIM); m++) /* 12 */ + { + f_j[m] = -f_i[m] - f_k[m]; + f[ai][m] += f_i[m]; + f[aj][m] += f_j[m]; + f[ak][m] += f_k[m]; + } + + /* Virial stuff */ + if (g) + { + copy_ivec(SHIFT_IVEC(g, aj), jt); + + ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij); + ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj); + t1 = IVEC2IS(dt_ij); + t2 = IVEC2IS(dt_kj); + } + rvec_inc(fshift[t1], f_i); + rvec_inc(fshift[CENTRAL], f_j); + rvec_inc(fshift[t2], f_k); /* 9 */ + /* 163 TOTAL */ + } + return vtot; +} + +real cross_bond_angle(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real gmx_unused lambda, real gmx_unused *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003) + * pp. 842-847 + */ + int i, ai, aj, ak, type, m, t1, t2, t3; + rvec r_ij, r_kj, r_ik; + real vtot, vrt, s1, s2, s3, r1, r2, r3, r1e, r2e, r3e, krt, k1, k2, k3; + rvec f_i, f_j, f_k; + ivec jt, dt_ij, dt_kj; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + r1e = forceparams[type].cross_ba.r1e; + r2e = forceparams[type].cross_ba.r2e; + r3e = forceparams[type].cross_ba.r3e; + krt = forceparams[type].cross_ba.krt; + + /* Compute distance vectors ... 
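/*
 * Editorial sketch (not part of this patch): the bond-bond cross term of
 * cross_bond_bond() above (the reference cited there):
 *     V = krr*(r1 - r1e)*(r2 - r2e),
 * where r1 and r2 are the lengths of the two bonds sharing the central atom
 * j.  The name is hypothetical; shift-force bookkeeping is left out.
 */
#include <math.h>

double cross_bb(const double r_ij[3], const double r_kj[3],
                double r1e, double r2e, double krr,
                double f_i[3], double f_j[3], double f_k[3])
{
    double r1 = sqrt(r_ij[0]*r_ij[0] + r_ij[1]*r_ij[1] + r_ij[2]*r_ij[2]);
    double r2 = sqrt(r_kj[0]*r_kj[0] + r_kj[1]*r_kj[1] + r_kj[2]*r_kj[2]);
    double s1 = r1 - r1e;
    double s2 = r2 - r2e;
    int    m;

    for (m = 0; m < 3; m++)
    {
        f_i[m] = -krr*s2*r_ij[m]/r1;
        f_k[m] = -krr*s1*r_kj[m]/r2;
        f_j[m] = -f_i[m] - f_k[m];
    }
    return krr*s1*s2;      /* note: the energy can be negative */
}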
*/ + t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij); + t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj); + t3 = pbc_rvec_sub(pbc, x[ai], x[ak], r_ik); + + /* ... and their lengths */ + r1 = norm(r_ij); + r2 = norm(r_kj); + r3 = norm(r_ik); + + /* Deviations from ideality */ + s1 = r1-r1e; + s2 = r2-r2e; + s3 = r3-r3e; + + /* Energy (can be negative!) */ + vrt = krt*s3*(s1+s2); + vtot += vrt; + + /* Forces */ + k1 = -krt*(s3/r1); + k2 = -krt*(s3/r2); + k3 = -krt*(s1+s2)/r3; + for (m = 0; (m < DIM); m++) + { + f_i[m] = k1*r_ij[m] + k3*r_ik[m]; + f_k[m] = k2*r_kj[m] - k3*r_ik[m]; + f_j[m] = -f_i[m] - f_k[m]; + } + + for (m = 0; (m < DIM); m++) /* 12 */ + { + f[ai][m] += f_i[m]; + f[aj][m] += f_j[m]; + f[ak][m] += f_k[m]; + } + + /* Virial stuff */ + if (g) + { + copy_ivec(SHIFT_IVEC(g, aj), jt); + + ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij); + ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj); + t1 = IVEC2IS(dt_ij); + t2 = IVEC2IS(dt_kj); + } + rvec_inc(fshift[t1], f_i); + rvec_inc(fshift[CENTRAL], f_j); + rvec_inc(fshift[t2], f_k); /* 9 */ + /* 163 TOTAL */ + } + return vtot; +} + +static real bonded_tab(const char *type, int table_nr, + const bondedtable_t *table, real kA, real kB, real r, + real lambda, real *V, real *F) +{ + real k, tabscale, *VFtab, rt, eps, eps2, Yt, Ft, Geps, Heps2, Fp, VV, FF; + int n0, nnn; + real v, f, dvdlambda; + + k = (1.0 - lambda)*kA + lambda*kB; + + tabscale = table->scale; + VFtab = table->data; + + rt = r*tabscale; + n0 = rt; + if (n0 >= table->n) + { + gmx_fatal(FARGS, "A tabulated %s interaction table number %d is out of the table range: r %f, between table indices %d and %d, table length %d", + type, table_nr, r, n0, n0+1, table->n); + } + eps = rt - n0; + eps2 = eps*eps; + nnn = 4*n0; + Yt = VFtab[nnn]; + Ft = VFtab[nnn+1]; + Geps = VFtab[nnn+2]*eps; + Heps2 = VFtab[nnn+3]*eps2; + Fp = Ft + Geps + Heps2; + VV = Yt + Fp*eps; + FF = Fp + Geps + 2.0*Heps2; + + *F = -k*FF*tabscale; + *V = k*VV; + dvdlambda = (kB - kA)*VV; + + return dvdlambda; + + /* That was 22 flops */ +} + +real tab_bonds(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata *fcd, + int gmx_unused *global_atom_index) +{ + int i, m, ki, ai, aj, type, table; + real dr, dr2, fbond, vbond, fij, vtot; + rvec dx; + ivec dt; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + + ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */ + dr2 = iprod(dx, dx); /* 5 */ + dr = dr2*gmx_invsqrt(dr2); /* 10 */ + + table = forceparams[type].tab.table; + + *dvdlambda += bonded_tab("bond", table, + &fcd->bondtab[table], + forceparams[type].tab.kA, + forceparams[type].tab.kB, + dr, lambda, &vbond, &fbond); /* 22 */ + + if (dr2 == 0.0) + { + continue; + } + + + vtot += vbond; /* 1*/ + fbond *= gmx_invsqrt(dr2); /* 6 */ +#ifdef DEBUG + if (debug) + { + fprintf(debug, "TABBONDS: dr = %10g vbond = %10g fbond = %10g\n", + dr, vbond, fbond); + } +#endif + if (g) + { + ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt); + ki = IVEC2IS(dt); + } + for (m = 0; (m < DIM); m++) /* 15 */ + { + fij = fbond*dx[m]; + f[ai][m] += fij; + f[aj][m] -= fij; + fshift[ki][m] += fij; + fshift[CENTRAL][m] -= fij; + } + } /* 62 TOTAL */ + return vtot; +} + +real tab_angles(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph 
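/*
 * Editorial sketch (not part of this patch): the cubic-spline table lookup
 * used by bonded_tab() above.  Each table point stores four values
 * (Y, F, G, H); with eps the fractional position inside the interval,
 *     Fp = F + G*eps + H*eps^2,  V = Y + Fp*eps,  FF = Fp + G*eps + 2*H*eps^2.
 * The caller multiplies by the lambda-mixed force constant, as bonded_tab()
 * does.  The name is hypothetical.
 */
double table_lookup(const double *VFtab, int npoints, double tabscale,
                    double r, double *force)
{
    double        rt  = r*tabscale;
    int           n0  = (int)rt;             /* table interval index */
    double        eps = rt - n0;
    const double *p;
    double        Fp, VV, FF;

    if (n0 >= npoints)
    {
        return 0.0;                          /* out of range; bonded_tab() calls gmx_fatal() */
    }
    p  = VFtab + 4*n0;                       /* Y, F, G, H for this interval */
    Fp = p[1] + p[2]*eps + p[3]*eps*eps;
    VV = p[0] + Fp*eps;
    FF = Fp + p[2]*eps + 2.0*p[3]*eps*eps;

    *force = -FF*tabscale;                   /* -dV/dr before the k factor */
    return VV;
}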
*g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata *fcd, + int gmx_unused *global_atom_index) +{ + int i, ai, aj, ak, t1, t2, type, table; + rvec r_ij, r_kj; + real cos_theta, cos_theta2, theta, dVdt, va, vtot; + ivec jt, dt_ij, dt_kj; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + + theta = bond_angle(x[ai], x[aj], x[ak], pbc, + r_ij, r_kj, &cos_theta, &t1, &t2); /* 41 */ + + table = forceparams[type].tab.table; + + *dvdlambda += bonded_tab("angle", table, + &fcd->angletab[table], + forceparams[type].tab.kA, + forceparams[type].tab.kB, + theta, lambda, &va, &dVdt); /* 22 */ + vtot += va; + + cos_theta2 = sqr(cos_theta); /* 1 */ + if (cos_theta2 < 1) + { + int m; + real snt, st, sth; + real cik, cii, ckk; + real nrkj2, nrij2; + rvec f_i, f_j, f_k; + + st = dVdt*gmx_invsqrt(1 - cos_theta2); /* 12 */ + sth = st*cos_theta; /* 1 */ +#ifdef DEBUG + if (debug) + { + fprintf(debug, "ANGLES: theta = %10g vth = %10g dV/dtheta = %10g\n", + theta*RAD2DEG, va, dVdt); + } +#endif + nrkj2 = iprod(r_kj, r_kj); /* 5 */ + nrij2 = iprod(r_ij, r_ij); + + cik = st*gmx_invsqrt(nrkj2*nrij2); /* 12 */ + cii = sth/nrij2; /* 10 */ + ckk = sth/nrkj2; /* 10 */ + + for (m = 0; (m < DIM); m++) /* 39 */ + { + f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]); + f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]); + f_j[m] = -f_i[m]-f_k[m]; + f[ai][m] += f_i[m]; + f[aj][m] += f_j[m]; + f[ak][m] += f_k[m]; + } + if (g) + { + copy_ivec(SHIFT_IVEC(g, aj), jt); + + ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij); + ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj); + t1 = IVEC2IS(dt_ij); + t2 = IVEC2IS(dt_kj); + } + rvec_inc(fshift[t1], f_i); + rvec_inc(fshift[CENTRAL], f_j); + rvec_inc(fshift[t2], f_k); + } /* 169 TOTAL */ + } + return vtot; +} + +real tab_dihs(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + real lambda, real *dvdlambda, + const t_mdatoms gmx_unused *md, t_fcdata *fcd, + int gmx_unused *global_atom_index) +{ + int i, type, ai, aj, ak, al, table; + int t1, t2, t3; + rvec r_ij, r_kj, r_kl, m, n; + real phi, sign, ddphi, vpd, vtot; + + vtot = 0.0; + for (i = 0; (i < nbonds); ) + { + type = forceatoms[i++]; + ai = forceatoms[i++]; + aj = forceatoms[i++]; + ak = forceatoms[i++]; + al = forceatoms[i++]; + + phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n, + &sign, &t1, &t2, &t3); /* 84 */ + + table = forceparams[type].tab.table; + + /* Hopefully phi+M_PI never results in values < 0 */ + *dvdlambda += bonded_tab("dihedral", table, + &fcd->dihtab[table], + forceparams[type].tab.kA, + forceparams[type].tab.kB, + phi+M_PI, lambda, &vpd, &ddphi); + + vtot += vpd; + do_dih_fup(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, + f, fshift, pbc, g, x, t1, t2, t3); /* 112 */ + +#ifdef DEBUG + fprintf(debug, "pdih: (%d,%d,%d,%d) phi=%g\n", + ai, aj, ak, al, phi); +#endif + } /* 227 TOTAL */ + + return vtot; +} + ++/* Return if this is a potential calculated in bondfree.c, ++ * i.e. an interaction that actually calculates a potential and ++ * works on multiple atoms (not e.g. a connection or a position restraint). 
++ */ ++static gmx_inline gmx_bool ftype_is_bonded_potential(int ftype) ++{ ++ return ++ (interaction_function[ftype].flags & IF_BOND) && ++ !(ftype == F_CONNBONDS || ftype == F_POSRES) && ++ (ftype < F_GB12 || ftype > F_GB14); ++} ++ ++static void divide_bondeds_over_threads(t_idef *idef, int nthreads) ++{ ++ int ftype; ++ int nat1; ++ int t; ++ int il_nr_thread; ++ ++ idef->nthreads = nthreads; ++ ++ if (F_NRE*(nthreads+1) > idef->il_thread_division_nalloc) ++ { ++ idef->il_thread_division_nalloc = F_NRE*(nthreads+1); ++ snew(idef->il_thread_division, idef->il_thread_division_nalloc); ++ } ++ ++ for (ftype = 0; ftype < F_NRE; ftype++) ++ { ++ if (ftype_is_bonded_potential(ftype)) ++ { ++ nat1 = interaction_function[ftype].nratoms + 1; ++ ++ for (t = 0; t <= nthreads; t++) ++ { ++ /* Divide the interactions equally over the threads. ++ * When the different types of bonded interactions ++ * are distributed roughly equally over the threads, ++ * this should lead to well localized output into ++ * the force buffer on each thread. ++ * If this is not the case, a more advanced scheme ++ * (not implemented yet) will do better. ++ */ ++ il_nr_thread = (((idef->il[ftype].nr/nat1)*t)/nthreads)*nat1; ++ ++ /* Ensure that distance restraint pairs with the same label ++ * end up on the same thread. ++ * This is slighlty tricky code, since the next for iteration ++ * may have an initial il_nr_thread lower than the final value ++ * in the previous iteration, but this will anyhow be increased ++ * to the approriate value again by this while loop. ++ */ ++ while (ftype == F_DISRES && ++ il_nr_thread > 0 && ++ il_nr_thread < idef->il[ftype].nr && ++ idef->iparams[idef->il[ftype].iatoms[il_nr_thread]].disres.label == ++ idef->iparams[idef->il[ftype].iatoms[il_nr_thread-nat1]].disres.label) ++ { ++ il_nr_thread += nat1; ++ } ++ ++ idef->il_thread_division[ftype*(nthreads+1)+t] = il_nr_thread; ++ } ++ } ++ } ++} ++ +static unsigned +calc_bonded_reduction_mask(const t_idef *idef, + int shift, + int t, int nt) +{ + unsigned mask; + int ftype, nb, nat1, nb0, nb1, i, a; + + mask = 0; + + for (ftype = 0; ftype < F_NRE; ftype++) + { - if (interaction_function[ftype].flags & IF_BOND && - !(ftype == F_CONNBONDS || ftype == F_POSRES) && - (ftypeF_GB14)) ++ if (ftype_is_bonded_potential(ftype)) + { + nb = idef->il[ftype].nr; + if (nb > 0) + { + nat1 = interaction_function[ftype].nratoms + 1; + + /* Divide this interaction equally over the threads. + * This is not stored: should match division in calc_bonds. + */ - nb0 = (((nb/nat1)* t )/nt)*nat1; - nb1 = (((nb/nat1)*(t+1))/nt)*nat1; ++ nb0 = idef->il_thread_division[ftype*(nt+1)+t]; ++ nb1 = idef->il_thread_division[ftype*(nt+1)+t+1]; + + for (i = nb0; i < nb1; i += nat1) + { + for (a = 1; a < nat1; a++) + { + mask |= (1U << (idef->il[ftype].iatoms[i+a]>>shift)); + } + } + } + } + } + + return mask; +} + - void init_bonded_thread_force_reduction(t_forcerec *fr, - const t_idef *idef) ++void setup_bonded_threading(t_forcerec *fr, t_idef *idef) +{ +#define MAX_BLOCK_BITS 32 + int t; + int ctot, c, b; + - if (fr->nthreads <= 1) ++ assert(fr->nthreads >= 1); ++ ++ /* Divide the bonded interaction over the threads */ ++ divide_bondeds_over_threads(idef, fr->nthreads); ++ ++ if (fr->nthreads == 1) + { + fr->red_nblock = 0; + + return; + } + + /* We divide the force array in a maximum of 32 blocks. + * Minimum force block reduction size is 2^6=64. 
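/*
 * Editorial sketch (not part of this patch): the arithmetic behind
 * divide_bondeds_over_threads() above.  For an interaction type with
 * nat1 = nratoms + 1 integers per entry, thread t is given the half-open
 * iatoms index range [bound(t), bound(t+1)), with every boundary rounded to
 * a whole interaction.  The helper name is hypothetical.
 */
static int thread_bound(int nr, int nat1, int t, int nthreads)
{
    /* nr is idef->il[ftype].nr, the total iatoms length for this type */
    return (((nr/nat1)*t)/nthreads)*nat1;
}

/* Example: 10 proper dihedrals (nat1 = 5, nr = 50) on 4 threads give the
 * boundaries 0, 10, 25, 35, 50, i.e. 2, 3, 2, 3 dihedrals per thread. */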
+ */ + fr->red_ashift = 6; + while (fr->natoms_force > (int)(MAX_BLOCK_BITS*(1U<red_ashift))) + { + fr->red_ashift++; + } + if (debug) + { + fprintf(debug, "bonded force buffer block atom shift %d bits\n", + fr->red_ashift); + } + + /* Determine to which blocks each thread's bonded force calculation + * contributes. Store this is a mask for each thread. + */ +#pragma omp parallel for num_threads(fr->nthreads) schedule(static) + for (t = 1; t < fr->nthreads; t++) + { + fr->f_t[t].red_mask = + calc_bonded_reduction_mask(idef, fr->red_ashift, t, fr->nthreads); + } + + /* Determine the maximum number of blocks we need to reduce over */ + fr->red_nblock = 0; + ctot = 0; + for (t = 0; t < fr->nthreads; t++) + { + c = 0; + for (b = 0; b < MAX_BLOCK_BITS; b++) + { + if (fr->f_t[t].red_mask & (1U<red_nblock = max(fr->red_nblock, b+1); + c++; + } + } + if (debug) + { + fprintf(debug, "thread %d flags %x count %d\n", + t, fr->f_t[t].red_mask, c); + } + ctot += c; + } + if (debug) + { + fprintf(debug, "Number of blocks to reduce: %d of size %d\n", + fr->red_nblock, 1<red_ashift); + fprintf(debug, "Reduction density %.2f density/#thread %.2f\n", + ctot*(1<red_ashift)/(double)fr->natoms_force, + ctot*(1<red_ashift)/(double)(fr->natoms_force*fr->nthreads)); + } +} + +static void zero_thread_forces(f_thread_t *f_t, int n, + int nblock, int blocksize) +{ + int b, a0, a1, a, i, j; + + if (n > f_t->f_nalloc) + { + f_t->f_nalloc = over_alloc_large(n); + srenew(f_t->f, f_t->f_nalloc); + } + + if (f_t->red_mask != 0) + { + for (b = 0; b < nblock; b++) + { + if (f_t->red_mask && (1U<f[a]); + } + } + } + } + for (i = 0; i < SHIFTS; i++) + { + clear_rvec(f_t->fshift[i]); + } + for (i = 0; i < F_NRE; i++) + { + f_t->ener[i] = 0; + } + for (i = 0; i < egNR; i++) + { + for (j = 0; j < f_t->grpp.nener; j++) + { + f_t->grpp.ener[i][j] = 0; + } + } + for (i = 0; i < efptNR; i++) + { + f_t->dvdl[i] = 0; + } +} + +static void reduce_thread_force_buffer(int n, rvec *f, + int nthreads, f_thread_t *f_t, + int nblock, int block_size) +{ + /* The max thread number is arbitrary, + * we used a fixed number to avoid memory management. + * Using more than 16 threads is probably never useful performance wise. + */ +#define MAX_BONDED_THREADS 256 + int b; + + if (nthreads > MAX_BONDED_THREADS) + { + gmx_fatal(FARGS, "Can not reduce bonded forces on more than %d threads", + MAX_BONDED_THREADS); + } + + /* This reduction can run on any number of threads, + * independently of nthreads. 
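/*
 * Editorial sketch (not part of this patch): a simplified model of the
 * block-mask reduction that setup_bonded_threading() prepares above.  The
 * force array is split into at most 32 blocks of 2^ashift atoms; each thread
 * records in a bit mask which blocks its bonded work touches (bit index
 * atom >> ashift), and the reduction only visits blocks whose bit is set.
 * Thread 0 writes into the output array directly, so reduction starts at
 * thread 1, as in reduce_thread_force_buffer().  Types and names here are
 * hypothetical placeholders for f_thread_t and friends.
 */
typedef struct {
    unsigned      mask;   /* bit b set if this thread wrote into block b */
    const double *f;      /* this thread's private force buffer, 3*n reals */
} thread_forces_t;

static void reduce_blocks(double *f, int n, int ashift,
                          const thread_forces_t *tf, int nthreads)
{
    int blocksize = 1 << ashift;
    int nblock    = (n + blocksize - 1)/blocksize;
    int b, t, a, a0, a1, d;

    for (b = 0; b < nblock; b++)
    {
        for (t = 1; t < nthreads; t++)
        {
            if (tf[t].mask & (1U << b))
            {
                a0 = b*blocksize;
                a1 = (b+1)*blocksize < n ? (b+1)*blocksize : n;
                for (a = a0; a < a1; a++)
                {
                    for (d = 0; d < 3; d++)
                    {
                        f[3*a + d] += tf[t].f[3*a + d];
                    }
                }
            }
        }
    }
}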
+ */ +#pragma omp parallel for num_threads(nthreads) schedule(static) + for (b = 0; b < nblock; b++) + { + rvec *fp[MAX_BONDED_THREADS]; + int nfb, ft, fb; + int a0, a1, a; + + /* Determine which threads contribute to this block */ + nfb = 0; + for (ft = 1; ft < nthreads; ft++) + { + if (f_t[ft].red_mask & (1U< 0) + { + /* Reduce force buffers for threads that contribute */ + a0 = b *block_size; + a1 = (b+1)*block_size; + a1 = min(a1, n); + for (a = a0; a < a1; a++) + { + for (fb = 0; fb < nfb; fb++) + { + rvec_inc(f[a], fp[fb][a]); + } + } + } + } +} + +static void reduce_thread_forces(int n, rvec *f, rvec *fshift, + real *ener, gmx_grppairener_t *grpp, real *dvdl, + int nthreads, f_thread_t *f_t, + int nblock, int block_size, + gmx_bool bCalcEnerVir, + gmx_bool bDHDL) +{ + if (nblock > 0) + { + /* Reduce the bonded force buffer */ + reduce_thread_force_buffer(n, f, nthreads, f_t, nblock, block_size); + } + + /* When necessary, reduce energy and virial using one thread only */ + if (bCalcEnerVir) + { + int t, i, j; + + for (i = 0; i < SHIFTS; i++) + { + for (t = 1; t < nthreads; t++) + { + rvec_inc(fshift[i], f_t[t].fshift[i]); + } + } + for (i = 0; i < F_NRE; i++) + { + for (t = 1; t < nthreads; t++) + { + ener[i] += f_t[t].ener[i]; + } + } + for (i = 0; i < egNR; i++) + { + for (j = 0; j < f_t[1].grpp.nener; j++) + { + for (t = 1; t < nthreads; t++) + { + + grpp->ener[i][j] += f_t[t].grpp.ener[i][j]; + } + } + } + if (bDHDL) + { + for (i = 0; i < efptNR; i++) + { + + for (t = 1; t < nthreads; t++) + { + dvdl[i] += f_t[t].dvdl[i]; + } + } + } + } +} + +static real calc_one_bond(FILE *fplog, int thread, + int ftype, const t_idef *idef, + rvec x[], rvec f[], rvec fshift[], + t_forcerec *fr, + const t_pbc *pbc, const t_graph *g, - gmx_enerdata_t gmx_unused *enerd, gmx_grppairener_t *grpp, ++ gmx_grppairener_t *grpp, + t_nrnb *nrnb, + real *lambda, real *dvdl, + const t_mdatoms *md, t_fcdata *fcd, + gmx_bool bCalcEnerVir, + int *global_atom_index, gmx_bool bPrintSepPot) +{ - int ind, nat1, nbonds, efptFTYPE; ++ int nat1, nbonds, efptFTYPE; + real v = 0; + t_iatom *iatoms; + int nb0, nbn; + + if (IS_RESTRAINT_TYPE(ftype)) + { + efptFTYPE = efptRESTRAINT; + } + else + { + efptFTYPE = efptBONDED; + } + - if (interaction_function[ftype].flags & IF_BOND && - !(ftype == F_CONNBONDS || ftype == F_POSRES)) - { - ind = interaction_function[ftype].nrnb_ind; - nat1 = interaction_function[ftype].nratoms + 1; - nbonds = idef->il[ftype].nr/nat1; - iatoms = idef->il[ftype].iatoms; ++ nat1 = interaction_function[ftype].nratoms + 1; ++ nbonds = idef->il[ftype].nr/nat1; ++ iatoms = idef->il[ftype].iatoms; + - nb0 = ((nbonds* thread )/(fr->nthreads))*nat1; - nbn = ((nbonds*(thread+1))/(fr->nthreads))*nat1 - nb0; ++ nb0 = idef->il_thread_division[ftype*(idef->nthreads+1)+thread]; ++ nbn = idef->il_thread_division[ftype*(idef->nthreads+1)+thread+1] - nb0; + - if (!IS_LISTED_LJ_C(ftype)) ++ if (!IS_LISTED_LJ_C(ftype)) ++ { ++ if (ftype == F_CMAP) + { - if (ftype == F_CMAP) - { - v = cmap_dihs(nbn, iatoms+nb0, - idef->iparams, &idef->cmap_grid, - (const rvec*)x, f, fshift, - pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]), - md, fcd, global_atom_index); - } ++ v = cmap_dihs(nbn, iatoms+nb0, ++ idef->iparams, &idef->cmap_grid, ++ (const rvec*)x, f, fshift, ++ pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]), ++ md, fcd, global_atom_index); ++ } +#ifdef SIMD_BONDEDS - else if (ftype == F_ANGLES && - !bCalcEnerVir && fr->efep == efepNO) - { - /* No energies, shift forces, dvdl */ - angles_noener_simd(nbn, 
idef->il[ftype].iatoms+nb0, - idef->iparams, - (const rvec*)x, f, - pbc, g, lambda[efptFTYPE], md, fcd, - global_atom_index); - v = 0; - } ++ else if (ftype == F_ANGLES && ++ !bCalcEnerVir && fr->efep == efepNO) ++ { ++ /* No energies, shift forces, dvdl */ ++ angles_noener_simd(nbn, idef->il[ftype].iatoms+nb0, ++ idef->iparams, ++ (const rvec*)x, f, ++ pbc, g, lambda[efptFTYPE], md, fcd, ++ global_atom_index); ++ v = 0; ++ } +#endif - else if (ftype == F_PDIHS && - !bCalcEnerVir && fr->efep == efepNO) - { - /* No energies, shift forces, dvdl */ ++ else if (ftype == F_PDIHS && ++ !bCalcEnerVir && fr->efep == efepNO) ++ { ++ /* No energies, shift forces, dvdl */ +#ifndef SIMD_BONDEDS - pdihs_noener ++ pdihs_noener +#else - pdihs_noener_simd ++ pdihs_noener_simd +#endif - (nbn, idef->il[ftype].iatoms+nb0, - idef->iparams, - (const rvec*)x, f, - pbc, g, lambda[efptFTYPE], md, fcd, - global_atom_index); - v = 0; - } - else - { - v = interaction_function[ftype].ifunc(nbn, iatoms+nb0, - idef->iparams, - (const rvec*)x, f, fshift, - pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]), - md, fcd, global_atom_index); - } - if (bPrintSepPot) - { - fprintf(fplog, " %-23s #%4d V %12.5e dVdl %12.5e\n", - interaction_function[ftype].longname, - nbonds/nat1, v, lambda[efptFTYPE]); - } ++ (nbn, idef->il[ftype].iatoms+nb0, ++ idef->iparams, ++ (const rvec*)x, f, ++ pbc, g, lambda[efptFTYPE], md, fcd, ++ global_atom_index); ++ v = 0; + } + else + { - v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, (const rvec*)x, f, fshift, - pbc, g, lambda, dvdl, md, fr, grpp, global_atom_index); - - if (bPrintSepPot) - { - fprintf(fplog, " %-5s + %-15s #%4d dVdl %12.5e\n", - interaction_function[ftype].longname, - interaction_function[F_LJ14].longname, nbonds/nat1, dvdl[efptVDW]); - fprintf(fplog, " %-5s + %-15s #%4d dVdl %12.5e\n", - interaction_function[ftype].longname, - interaction_function[F_COUL14].longname, nbonds/nat1, dvdl[efptCOUL]); - } ++ v = interaction_function[ftype].ifunc(nbn, iatoms+nb0, ++ idef->iparams, ++ (const rvec*)x, f, fshift, ++ pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]), ++ md, fcd, global_atom_index); + } - if (ind != -1 && thread == 0) ++ if (bPrintSepPot) + { - inc_nrnb(nrnb, ind, nbonds); ++ fprintf(fplog, " %-23s #%4d V %12.5e dVdl %12.5e\n", ++ interaction_function[ftype].longname, ++ nbonds, v, lambda[efptFTYPE]); + } + } - - return v; - } - - /* WARNING! THIS FUNCTION MUST EXACTLY TRACK THE calc - function, or horrible things will happen when doing free energy - calculations! In a good coding world, this would not be a - different function, but for speed reasons, it needs to be made a - separate function. TODO for 5.0 - figure out a way to reorganize - to reduce duplication. 
- */ - - static real calc_one_bond_foreign(FILE gmx_unused *fplog, int ftype, const t_idef *idef, - rvec x[], rvec f[], t_forcerec *fr, - const t_pbc *pbc, const t_graph *g, - gmx_grppairener_t *grpp, t_nrnb *nrnb, - real *lambda, real *dvdl, - const t_mdatoms *md, t_fcdata *fcd, - int *global_atom_index, gmx_bool gmx_unused bPrintSepPot) - { - int ind, nat1, nbonds, efptFTYPE, nbonds_np; - real v = 0; - t_iatom *iatoms; - - if (IS_RESTRAINT_TYPE(ftype)) - { - efptFTYPE = efptRESTRAINT; - } + else + { - efptFTYPE = efptBONDED; ++ v = do_nonbonded_listed(ftype, nbn, iatoms+nb0, idef->iparams, (const rvec*)x, f, fshift, ++ pbc, g, lambda, dvdl, md, fr, grpp, global_atom_index); ++ ++ if (bPrintSepPot) ++ { ++ fprintf(fplog, " %-5s + %-15s #%4d dVdl %12.5e\n", ++ interaction_function[ftype].longname, ++ interaction_function[F_LJ14].longname, nbonds, dvdl[efptVDW]); ++ fprintf(fplog, " %-5s + %-15s #%4d dVdl %12.5e\n", ++ interaction_function[ftype].longname, ++ interaction_function[F_COUL14].longname, nbonds, dvdl[efptCOUL]); ++ } + } + - if (ftype < F_GB12 || ftype > F_GB14) ++ if (thread == 0) + { - if (interaction_function[ftype].flags & IF_BOND && - !(ftype == F_CONNBONDS || ftype == F_POSRES || ftype == F_FBPOSRES)) - { - ind = interaction_function[ftype].nrnb_ind; - nat1 = interaction_function[ftype].nratoms+1; - nbonds_np = idef->il[ftype].nr_nonperturbed; - nbonds = idef->il[ftype].nr - nbonds_np; - iatoms = idef->il[ftype].iatoms + nbonds_np; - if (nbonds > 0) - { - if (!IS_LISTED_LJ_C(ftype)) - { - if (ftype == F_CMAP) - { - v = cmap_dihs(nbonds, iatoms, - idef->iparams, &idef->cmap_grid, - (const rvec*)x, f, fr->fshift, - pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]), md, fcd, - global_atom_index); - } - else - { - v = interaction_function[ftype].ifunc(nbonds, iatoms, - idef->iparams, - (const rvec*)x, f, fr->fshift, - pbc, g, lambda[efptFTYPE], &dvdl[efptFTYPE], - md, fcd, global_atom_index); - } - } - else - { - v = do_nonbonded_listed(ftype, nbonds, iatoms, - idef->iparams, - (const rvec*)x, f, fr->fshift, - pbc, g, lambda, dvdl, - md, fr, grpp, global_atom_index); - } - if (ind != -1) - { - inc_nrnb(nrnb, ind, nbonds/nat1); - } - } - } ++ inc_nrnb(nrnb, interaction_function[ftype].nrnb_ind, nbonds); + } ++ + return v; +} + +void calc_bonds(FILE *fplog, const gmx_multisim_t *ms, + const t_idef *idef, + rvec x[], history_t *hist, + rvec f[], t_forcerec *fr, + const t_pbc *pbc, const t_graph *g, + gmx_enerdata_t *enerd, t_nrnb *nrnb, + real *lambda, + const t_mdatoms *md, + t_fcdata *fcd, int *global_atom_index, + t_atomtypes gmx_unused *atype, gmx_genborn_t gmx_unused *born, + int force_flags, + gmx_bool bPrintSepPot, gmx_large_int_t step) +{ + gmx_bool bCalcEnerVir; + int i; + real v, dvdl[efptNR], dvdl_dum[efptNR]; /* The dummy array is to have a place to store the dhdl at other values + of lambda, which will be thrown away in the end*/ + const t_pbc *pbc_null; + char buf[22]; + int thread; + ++ assert(fr->nthreads == idef->nthreads); ++ + bCalcEnerVir = (force_flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)); + + for (i = 0; i < efptNR; i++) + { + dvdl[i] = 0.0; + } + if (fr->bMolPBC) + { + pbc_null = pbc; + } + else + { + pbc_null = NULL; + } + if (bPrintSepPot) + { + fprintf(fplog, "Step %s: bonded V and dVdl for this node\n", + gmx_step_str(step, buf)); + } + +#ifdef DEBUG + if (g && debug) + { + p_graph(debug, "Bondage is fun", g); + } +#endif + + /* Do pre force calculation stuff which might require communication */ + if (idef->il[F_ORIRES].nr) + { + 
enerd->term[F_ORIRESDEV] = + calc_orires_dev(ms, idef->il[F_ORIRES].nr, + idef->il[F_ORIRES].iatoms, + idef->iparams, md, (const rvec*)x, + pbc_null, fcd, hist); + } + if (idef->il[F_DISRES].nr) + { + calc_disres_R_6(ms, idef->il[F_DISRES].nr, + idef->il[F_DISRES].iatoms, + idef->iparams, (const rvec*)x, pbc_null, + fcd, hist); + } + +#pragma omp parallel for num_threads(fr->nthreads) schedule(static) + for (thread = 0; thread < fr->nthreads; thread++) + { - int ftype, nbonds, ind, nat1; ++ int ftype; + real *epot, v; + /* thread stuff */ + rvec *ft, *fshift; + real *dvdlt; + gmx_grppairener_t *grpp; - int nb0, nbn; + + if (thread == 0) + { + ft = f; + fshift = fr->fshift; + epot = enerd->term; + grpp = &enerd->grpp; + dvdlt = dvdl; + } + else + { + zero_thread_forces(&fr->f_t[thread], fr->natoms_force, + fr->red_nblock, 1<red_ashift); + + ft = fr->f_t[thread].f; + fshift = fr->f_t[thread].fshift; + epot = fr->f_t[thread].ener; + grpp = &fr->f_t[thread].grpp; + dvdlt = fr->f_t[thread].dvdl; + } + /* Loop over all bonded force types to calculate the bonded forces */ + for (ftype = 0; (ftype < F_NRE); ftype++) + { - if (idef->il[ftype].nr > 0 && - (interaction_function[ftype].flags & IF_BOND) && - (ftype < F_GB12 || ftype > F_GB14) && - !(ftype == F_CONNBONDS || ftype == F_POSRES)) ++ if (idef->il[ftype].nr > 0 && ftype_is_bonded_potential(ftype)) + { + v = calc_one_bond(fplog, thread, ftype, idef, x, - ft, fshift, fr, pbc_null, g, enerd, grpp, ++ ft, fshift, fr, pbc_null, g, grpp, + nrnb, lambda, dvdlt, + md, fcd, bCalcEnerVir, + global_atom_index, bPrintSepPot); - epot[ftype] += v; ++ epot[ftype] += v; + } + } + } + if (fr->nthreads > 1) + { + reduce_thread_forces(fr->natoms_force, f, fr->fshift, + enerd->term, &enerd->grpp, dvdl, + fr->nthreads, fr->f_t, + fr->red_nblock, 1<red_ashift, + bCalcEnerVir, + force_flags & GMX_FORCE_DHDL); + } + if (force_flags & GMX_FORCE_DHDL) + { + for (i = 0; i < efptNR; i++) + { + enerd->dvdl_nonlin[i] += dvdl[i]; + } + } + + /* Copy the sum of violations for the distance restraints from fcd */ + if (fcd) + { + enerd->term[F_DISRESVIOL] = fcd->disres.sumviol; + + } +} + +void calc_bonds_lambda(FILE *fplog, + const t_idef *idef, + rvec x[], + t_forcerec *fr, + const t_pbc *pbc, const t_graph *g, + gmx_grppairener_t *grpp, real *epot, t_nrnb *nrnb, + real *lambda, + const t_mdatoms *md, + t_fcdata *fcd, + int *global_atom_index) +{ - int i, ftype, nbonds_np, nbonds, ind, nat; - real v, dr, dr2; ++ int i, ftype, nr_nonperturbed, nr; ++ real v; + real dvdl_dum[efptNR]; - rvec *f, *fshift_orig; ++ rvec *f, *fshift; + const t_pbc *pbc_null; - t_iatom *iatom_fe; ++ t_idef idef_fe; + + if (fr->bMolPBC) + { + pbc_null = pbc; + } + else + { + pbc_null = NULL; + } + ++ /* Copy the whole idef, so we can modify the contents locally */ ++ idef_fe = *idef; ++ idef_fe.nthreads = 1; ++ snew(idef_fe.il_thread_division, F_NRE*(idef_fe.nthreads+1)); ++ ++ /* We already have the forces, so we use temp buffers here */ + snew(f, fr->natoms_force); - /* We want to preserve the fshift array in forcerec */ - fshift_orig = fr->fshift; - snew(fr->fshift, SHIFTS); ++ snew(fshift, SHIFTS); + - /* Loop over all bonded force types to calculate the bonded forces */ ++ /* Loop over all bonded force types to calculate the bonded energies */ + for (ftype = 0; (ftype < F_NRE); ftype++) + { - v = calc_one_bond_foreign(fplog, ftype, idef, x, - f, fr, pbc_null, g, grpp, nrnb, lambda, dvdl_dum, - md, fcd, global_atom_index, FALSE); - epot[ftype] += v; ++ if (ftype_is_bonded_potential(ftype)) ++ { 
++ /* Set the work range of thread 0 to the perturbed bondeds only */ ++ nr_nonperturbed = idef->il[ftype].nr_nonperturbed; ++ nr = idef->il[ftype].nr; ++ idef_fe.il_thread_division[ftype*2+0] = nr_nonperturbed; ++ idef_fe.il_thread_division[ftype*2+1] = nr; ++ ++ /* This is only to get the flop count correct */ ++ idef_fe.il[ftype].nr = nr - nr_nonperturbed; ++ ++ if (nr - nr_nonperturbed > 0) ++ { ++ v = calc_one_bond(fplog, 0, ftype, &idef_fe, ++ x, f, fshift, fr, pbc_null, g, ++ grpp, nrnb, lambda, dvdl_dum, ++ md, fcd, TRUE, ++ global_atom_index, FALSE); ++ epot[ftype] += v; ++ } ++ } + } + - sfree(fr->fshift); - fr->fshift = fshift_orig; ++ sfree(fshift); + sfree(f); ++ ++ sfree(idef_fe.il_thread_division); +} diff --cc src/gromacs/legacyheaders/bondf.h index d70d6c150d,0000000000..6893ccbc83 mode 100644,000000..100644 --- a/src/gromacs/legacyheaders/bondf.h +++ b/src/gromacs/legacyheaders/bondf.h @@@ -1,168 -1,0 +1,168 @@@ +/* + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gromacs Runs On Most of All Computer Systems + */ + +#ifndef _bondf_h +#define _bondf_h + + +#include +#include "typedefs.h" +#include "nrnb.h" +#include "pbc.h" +#include "genborn.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int glatnr(int *global_atom_index, int i); +/* Returns the global topology atom number belonging to local atom index i. + * This function is intended for writing ascii output + * and returns atom numbers starting at 1. + * When global_atom_index=NULL returns i+1. + */ + +void calc_bonds(FILE *fplog, const gmx_multisim_t *ms, + const t_idef *idef, + rvec x[], history_t *hist, + rvec f[], t_forcerec *fr, + const t_pbc *pbc, const t_graph *g, + gmx_enerdata_t *enerd, t_nrnb *nrnb, real *lambda, + const t_mdatoms *md, + t_fcdata *fcd, int *ddgatindex, + t_atomtypes *atype, gmx_genborn_t *born, + int force_flags, + gmx_bool bPrintSepPot, gmx_large_int_t step); +/* + * The function calc_bonds() calculates all bonded force interactions. + * The "bonds" are specified as follows: + * int nbonds + * the total number of bonded interactions. + * t_iatom *forceatoms + * specifies which atoms are involved in a bond of a certain + * type, see also struct t_idef. 
+ * t_functype *functype + * defines for every bonded force type what type of function to + * use, see also struct t_idef. + * t_iparams *forceparams + * defines the parameters for every bond type, see also struct + * t_idef. + * real epot[NR_F] + * total potential energy split up over the function types. + * int *ddgatindex + * global atom number indices, should be NULL when not using DD. + * gmx_bool bPrintSepPot + * if TRUE print local potential and dVdlambda for each bonded type. + * int step + * used with bPrintSepPot + * return value: + * the total potential energy (sum over epot). + */ + +void calc_bonds_lambda(FILE *fplog, + const t_idef *idef, + rvec x[], + t_forcerec *fr, + const t_pbc *pbc, const t_graph *g, + gmx_grppairener_t *grpp, real *epot, t_nrnb *nrnb, + real *lambda, + const t_mdatoms *md, + t_fcdata *fcd, int *global_atom_index); +/* As calc_bonds, but only determines the potential energy + * for the perturbed interactions. + * The shift forces in fr are not affected. + */ + +real posres(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec vir_diag, + t_pbc *pbc, + real lambda, real *dvdlambda, + int refcoord_scaling, int ePBC, rvec comA, rvec comB); +/* Position restraints require a different pbc treatment from other bondeds */ + +real fbposres(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec f[], rvec vir_diag, + t_pbc *pbc, int refcoord_scaling, int ePBC, rvec com); +/* Flat-bottom posres. Same PBC treatment as in normal position restraints */ + +real bond_angle(const rvec xi, const rvec xj, const rvec xk, + const t_pbc *pbc, + rvec r_ij, rvec r_kj, real *costh, + int *t1, int *t2); /* out */ +/* Calculate bond-angle. No PBC is taken into account (use mol-shift) */ + +real dih_angle(const rvec xi, const rvec xj, const rvec xk, const rvec xl, + const t_pbc *pbc, + rvec r_ij, rvec r_kj, rvec r_kl, rvec m, rvec n, /* out */ + real *sign, + int *t1, int *t2, int *t3); +/* Calculate dihedral-angle. No PBC is taken into account (use mol-shift) */ + +void do_dih_fup(int i, int j, int k, int l, real ddphi, + rvec r_ij, rvec r_kj, rvec r_kl, + rvec m, rvec n, rvec f[], rvec fshift[], + const t_pbc *pbc, const t_graph *g, + const rvec *x, int t1, int t2, int t3); +/* Do an update of the forces for dihedral potentials */ + +void make_dp_periodic(real *dp); +/* make a dihedral fall in the range (-pi,pi) */ + +/************************************************************************* + * + * Bonded force functions + * + *************************************************************************/ +t_ifunc bonds, g96bonds, morse_bonds, cubic_bonds, FENE_bonds, restraint_bonds; +t_ifunc angles, g96angles, cross_bond_bond, cross_bond_angle, urey_bradley, quartic_angles, linear_angles; +t_ifunc pdihs, idihs, rbdihs; +t_ifunc tab_bonds, tab_angles, tab_dihs; +t_ifunc polarize, anharm_polarize, water_pol, thole_pol, angres, angresz, dihres, unimplemented; + + - /* Initialize the setup for the bonded force buffer reduction - * over threads. This should be called each time the bonded setup - * changes; i.e. at start-up without domain decomposition and at DD. ++/* Divided the bonded interactions over the threads, count=fr->nthreads ++ * and set up the bonded thread-force buffer reduction. ++ * This should be called each time the bonded setup changes; ++ * i.e. at start-up without domain decomposition and at DD. 
+ */ - void init_bonded_thread_force_reduction(t_forcerec *fr, - const t_idef *idef); ++void setup_bonded_threading(t_forcerec *fr, t_idef *idef); + +#ifdef __cplusplus +} +#endif + +#endif /* _bondf_h */ diff --cc src/gromacs/legacyheaders/gmx_simd_macros.h index 37e94880ef,0000000000..f403f13963 mode 100644,000000..100644 --- a/src/gromacs/legacyheaders/gmx_simd_macros.h +++ b/src/gromacs/legacyheaders/gmx_simd_macros.h @@@ -1,536 -1,0 +1,538 @@@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS Development Team + * Copyright (c) 2012, by the GROMACS development team, led by + * David van der Spoel, Berk Hess, Erik Lindahl, and including many + * others, as listed in the AUTHORS file in the top-level source + * directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/* The macros in this file are intended to be used for writing + * architecture-independent SIMD intrinsics code. + * To support a new architecture, adding macros here should be (nearly) + * all that is needed. + */ + +#ifdef _gmx_simd_macros_h_ +#error "gmx_simd_macros.h included twice" +#else +#define _gmx_simd_macros_h_ + +/* NOTE: SSE2 acceleration does not include floor or blendv */ + + +/* Uncomment the next line, without other SIMD active, for testing plain-C */ +/* #define GMX_SIMD_REFERENCE_PLAIN_C */ +#ifdef GMX_SIMD_REFERENCE_PLAIN_C +/* Plain C SIMD reference implementation, also serves as documentation */ +#define GMX_HAVE_SIMD_MACROS + +/* In general the reference SIMD supports any SIMD width, including 1. + * For the nbnxn 4xn kernels all widths (2, 4 and 8) are supported. + * The nbnxn 2xnn kernels are currently not supported. 
+ */ +#define GMX_SIMD_REF_WIDTH 4 + +/* Include plain-C reference implementation, also serves as documentation */ +#include "gmx_simd_ref.h" + +#define GMX_SIMD_WIDTH_HERE GMX_SIMD_REF_WIDTH + +/* float/double SIMD register type */ +#define gmx_mm_pr gmx_simd_ref_pr + +/* boolean SIMD register type */ +#define gmx_mm_pb gmx_simd_ref_pb + +/* integer SIMD register type, only for table indexing and exclusion masks */ +#define gmx_epi32 gmx_simd_ref_epi32 +#define GMX_SIMD_EPI32_WIDTH GMX_SIMD_REF_EPI32_WIDTH + +/* Load GMX_SIMD_WIDTH_HERE reals for memory starting at r */ +#define gmx_load_pr gmx_simd_ref_load_pr +/* Set all SIMD register elements to *r */ +#define gmx_load1_pr gmx_simd_ref_load1_pr +#define gmx_set1_pr gmx_simd_ref_set1_pr +#define gmx_setzero_pr gmx_simd_ref_setzero_pr +#define gmx_store_pr gmx_simd_ref_store_pr + +#define gmx_add_pr gmx_simd_ref_add_pr +#define gmx_sub_pr gmx_simd_ref_sub_pr +#define gmx_mul_pr gmx_simd_ref_mul_pr +/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */ +#define gmx_madd_pr gmx_simd_ref_madd_pr +#define gmx_nmsub_pr gmx_simd_ref_nmsub_pr + +#define gmx_max_pr gmx_simd_ref_max_pr +#define gmx_blendzero_pr gmx_simd_ref_blendzero_pr + +#define gmx_round_pr gmx_simd_ref_round_pr + +/* Not required, only used to speed up the nbnxn tabulated PME kernels */ +#define GMX_SIMD_HAVE_FLOOR +#ifdef GMX_SIMD_HAVE_FLOOR +#define gmx_floor_pr gmx_simd_ref_floor_pr +#endif + +/* Not required, only used when blendv is faster than comparison */ +#define GMX_SIMD_HAVE_BLENDV +#ifdef GMX_SIMD_HAVE_BLENDV +#define gmx_blendv_pr gmx_simd_ref_blendv_pr +#endif + +/* Copy the sign of a to b, assumes b >= 0 for efficiency */ +#define gmx_cpsgn_nonneg_pr gmx_simd_ref_cpsgn_nonneg_pr + +/* Very specific operation required in the non-bonded kernels */ +#define gmx_masknot_add_pr gmx_simd_ref_masknot_add_pr + +/* Comparison */ +#define gmx_cmplt_pr gmx_simd_ref_cmplt_pr + +/* Logical operations on SIMD booleans */ +#define gmx_and_pb gmx_simd_ref_and_pb +#define gmx_or_pb gmx_simd_ref_or_pb + +/* Not required, gmx_anytrue_pb(x) returns if any of the boolean is x is True. + * If this is not present, define GMX_SIMD_IS_TRUE(real x), + * which should return x==True, where True is True as defined in SIMD. + */ +#define GMX_SIMD_HAVE_ANYTRUE +#ifdef GMX_SIMD_HAVE_ANYTRUE +#define gmx_anytrue_pb gmx_simd_ref_anytrue_pb +#else +/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */ +#define gmx_store_pb gmx_simd_ref_store_pb +#endif + +/* Conversions only used for PME table lookup */ +#define gmx_cvttpr_epi32 gmx_simd_ref_cvttpr_epi32 +#define gmx_cvtepi32_pr gmx_simd_ref_cvtepi32_pr + +/* These two function only need to be approximate, Newton-Raphson iteration + * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr. + */ +#define gmx_rsqrt_pr gmx_simd_ref_rsqrt_pr +#define gmx_rcp_pr gmx_simd_ref_rcp_pr + +/* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */ +#define GMX_SIMD_HAVE_EXP +#ifdef GMX_SIMD_HAVE_EXP +#define gmx_exp_pr gmx_simd_ref_exp_pr +#endif +#define GMX_SIMD_HAVE_TRIGONOMETRIC +#ifdef GMX_SIMD_HAVE_TRIGONOMETRIC +#define gmx_sqrt_pr gmx_simd_ref_sqrt_pr +#define gmx_sincos_pr gmx_simd_ref_sincos_pr +#define gmx_acos_pr gmx_simd_ref_acos_pr +#define gmx_atan2_pr gmx_simd_ref_atan2_pr +#endif + +#endif /* GMX_SIMD_REFERENCE_PLAIN_C */ + + +/* The same SIMD macros can be translated to SIMD intrinsics (and compiled + * to instructions for) different SIMD width and float precision. 
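A minimal sketch of how these portable macros are intended to be used: a width-agnostic y[i] += a*x[i] kernel written purely in terms of the gmx_*_pr macros. It assumes this header has been included with some SIMD flavour selected and that the arrays are SIMD-aligned and padded to a multiple of GMX_SIMD_WIDTH_HERE; it is illustrative, not code taken from GROMACS.

/* Width-agnostic y[i] += a*x[i]; x and y must be SIMD-aligned and padded
 * to a multiple of GMX_SIMD_WIDTH_HERE elements.
 */
static void axpy_simd_sketch(int n, real a, const real *x, real *y)
{
    gmx_mm_pr a_S = gmx_set1_pr(a);
    int       i;

    for (i = 0; i < n; i += GMX_SIMD_WIDTH_HERE)
    {
        gmx_mm_pr x_S = gmx_load_pr(x + i);
        gmx_mm_pr y_S = gmx_load_pr(y + i);

        /* a*x + y; maps to a single FMA instruction where the hardware has one */
        y_S = gmx_madd_pr(a_S, x_S, y_S);

        gmx_store_pr(y + i, y_S);
    }
}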
+ * + * On x86: The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX). + * The _pr suffix is replaced by _ps or _pd (for single or double precision). + * Compiler settings will decide if 128-bit intrinsics will + * be translated into SSE or AVX instructions. + */ + + +#ifdef GMX_USE_HALF_WIDTH_SIMD_HERE +#if defined GMX_X86_AVX_256 +/* We have half SIMD width support, continue */ +#else +#error "half SIMD width intrinsics are not supported" +#endif +#endif + + +#ifdef GMX_X86_SSE2 +/* This is for general x86 SIMD instruction sets that also support SSE2 */ +#define GMX_HAVE_SIMD_MACROS + +/* Include the highest supported x86 SIMD intrisics + math functions */ +#ifdef GMX_X86_AVX_256 +#include "gmx_x86_avx_256.h" +#ifdef GMX_DOUBLE +#include "gmx_math_x86_avx_256_double.h" +#else +#include "gmx_math_x86_avx_256_single.h" +#endif +#else +#ifdef GMX_X86_AVX_128_FMA +#include "gmx_x86_avx_128_fma.h" +#ifdef GMX_DOUBLE +#include "gmx_math_x86_avx_128_fma_double.h" +#else +#include "gmx_math_x86_avx_128_fma_single.h" +#endif +#else +#ifdef GMX_X86_SSE4_1 +#include "gmx_x86_sse4_1.h" +#ifdef GMX_DOUBLE +#include "gmx_math_x86_sse4_1_double.h" +#else +#include "gmx_math_x86_sse4_1_single.h" +#endif +#else +#ifdef GMX_X86_SSE2 +#include "gmx_x86_sse2.h" +#ifdef GMX_DOUBLE +#include "gmx_math_x86_sse2_double.h" +#else +#include "gmx_math_x86_sse2_single.h" +#endif +#else +#error No x86 acceleration defined +#endif +#endif +#endif +#endif +/* exp and trigonometric functions are included above */ +#define GMX_SIMD_HAVE_EXP +#define GMX_SIMD_HAVE_TRIGONOMETRIC + +#if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE + +#ifndef GMX_DOUBLE + +#define GMX_SIMD_WIDTH_HERE 4 + +#define gmx_mm_pr __m128 + +#define gmx_mm_pb __m128 + +#define gmx_epi32 __m128i +#define GMX_SIMD_EPI32_WIDTH 4 + +#define gmx_load_pr _mm_load_ps +#define gmx_load1_pr _mm_load1_ps +#define gmx_set1_pr _mm_set1_ps +#define gmx_setzero_pr _mm_setzero_ps +#define gmx_store_pr _mm_store_ps + +#define gmx_add_pr _mm_add_ps +#define gmx_sub_pr _mm_sub_ps +#define gmx_mul_pr _mm_mul_ps +#ifdef GMX_X86_AVX_128_FMA ++#define GMX_SIMD_HAVE_FMA +#define gmx_madd_pr(a, b, c) _mm_macc_ps(a, b, c) +#define gmx_nmsub_pr(a, b, c) _mm_nmacc_ps(a, b, c) +#else +#define gmx_madd_pr(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b)) +#define gmx_nmsub_pr(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b)) +#endif +#define gmx_max_pr _mm_max_ps +#define gmx_blendzero_pr _mm_and_ps + +#define gmx_cmplt_pr _mm_cmplt_ps +#define gmx_and_pb _mm_and_ps +#define gmx_or_pb _mm_or_ps + +#ifdef GMX_X86_SSE4_1 +#define gmx_round_pr(x) _mm_round_ps(x, 0x0) +#define GMX_SIMD_HAVE_FLOOR +#define gmx_floor_pr _mm_floor_ps +#else +#define gmx_round_pr(x) _mm_cvtepi32_ps(_mm_cvtps_epi32(x)) +#endif + +#ifdef GMX_X86_SSE4_1 +#define GMX_SIMD_HAVE_BLENDV +#define gmx_blendv_pr _mm_blendv_ps +#endif + +static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) +{ + /* The value -0.0 has only the sign-bit set */ + gmx_mm_pr sign_mask = _mm_set1_ps(-0.0); + return _mm_or_ps(_mm_and_ps(a, sign_mask), b); +}; + +static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); }; + +#define GMX_SIMD_HAVE_ANYTRUE +#define gmx_anytrue_pb _mm_movemask_ps + +#define gmx_cvttpr_epi32 _mm_cvttps_epi32 +#define gmx_cvtepi32_pr _mm_cvtepi32_ps + +#define gmx_rsqrt_pr _mm_rsqrt_ps +#define gmx_rcp_pr _mm_rcp_ps + +#define gmx_exp_pr gmx_mm_exp_ps +#define gmx_sqrt_pr gmx_mm_sqrt_ps +#define gmx_sincos_pr 
gmx_mm_sincos_ps +#define gmx_acos_pr gmx_mm_acos_ps +#define gmx_atan2_pr gmx_mm_atan2_ps + +#else /* ifndef GMX_DOUBLE */ + +#define GMX_SIMD_WIDTH_HERE 2 + +#define gmx_mm_pr __m128d + +#define gmx_mm_pb __m128d + +#define gmx_epi32 __m128i +#define GMX_SIMD_EPI32_WIDTH 4 + +#define gmx_load_pr _mm_load_pd +#define gmx_load1_pr _mm_load1_pd +#define gmx_set1_pr _mm_set1_pd +#define gmx_setzero_pr _mm_setzero_pd +#define gmx_store_pr _mm_store_pd + +#define gmx_add_pr _mm_add_pd +#define gmx_sub_pr _mm_sub_pd +#define gmx_mul_pr _mm_mul_pd +#ifdef GMX_X86_AVX_128_FMA ++#define GMX_SIMD_HAVE_FMA +#define gmx_madd_pr(a, b, c) _mm_macc_pd(a, b, c) +#define gmx_nmsub_pr(a, b, c) _mm_nmacc_pd(a, b, c) +#else +#define gmx_madd_pr(a, b, c) _mm_add_pd(c, _mm_mul_pd(a, b)) +#define gmx_nmsub_pr(a, b, c) _mm_sub_pd(c, _mm_mul_pd(a, b)) +#endif +#define gmx_max_pr _mm_max_pd +#define gmx_blendzero_pr _mm_and_pd + +#ifdef GMX_X86_SSE4_1 +#define gmx_round_pr(x) _mm_round_pd(x, 0x0) +#define GMX_SIMD_HAVE_FLOOR +#define gmx_floor_pr _mm_floor_pd +#else +#define gmx_round_pr(x) _mm_cvtepi32_pd(_mm_cvtpd_epi32(x)) +/* gmx_floor_pr is not used in code for pre-SSE4_1 hardware */ +#endif + +#ifdef GMX_X86_SSE4_1 +#define GMX_SIMD_HAVE_BLENDV +#define gmx_blendv_pr _mm_blendv_pd +#endif + +static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) +{ + gmx_mm_pr sign_mask = _mm_set1_pd(-0.0); + return _mm_or_pd(_mm_and_pd(a, sign_mask), b); +}; + +static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_pd(b, _mm_andnot_pd(a, c)); }; + +#define gmx_cmplt_pr _mm_cmplt_pd + +#define gmx_and_pb _mm_and_pd +#define gmx_or_pb _mm_or_pd + +#define GMX_SIMD_HAVE_ANYTRUE +#define gmx_anytrue_pb _mm_movemask_pd + +#define gmx_cvttpr_epi32 _mm_cvttpd_epi32 +#define gmx_cvtepi32_pr _mm_cvtepi32_pd + +#define gmx_rsqrt_pr(r) _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r))) +#define gmx_rcp_pr(r) _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r))) + +#define gmx_exp_pr gmx_mm_exp_pd +#define gmx_sqrt_pr gmx_mm_sqrt_pd +#define gmx_sincos_pr gmx_mm_sincos_pd +#define gmx_acos_pr gmx_mm_acos_pd +#define gmx_atan2_pr gmx_mm_atan2_pd + +#endif /* ifndef GMX_DOUBLE */ + +#else +/* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE, + * so we use 256-bit SIMD. 
+ */ + +#ifndef GMX_DOUBLE + +#define GMX_SIMD_WIDTH_HERE 8 + +#define gmx_mm_pr __m256 + +#define gmx_mm_pb __m256 + +#define gmx_epi32 __m256i +#define GMX_SIMD_EPI32_WIDTH 8 + +#define gmx_load_pr _mm256_load_ps +#define gmx_load1_pr(x) _mm256_set1_ps((x)[0]) +#define gmx_set1_pr _mm256_set1_ps +#define gmx_setzero_pr _mm256_setzero_ps +#define gmx_store_pr _mm256_store_ps + +#define gmx_add_pr _mm256_add_ps +#define gmx_sub_pr _mm256_sub_ps +#define gmx_mul_pr _mm256_mul_ps +#define gmx_madd_pr(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) +#define gmx_nmsub_pr(a, b, c) _mm256_sub_ps(c, _mm256_mul_ps(a, b)) +#define gmx_max_pr _mm256_max_ps +#define gmx_blendzero_pr _mm256_and_ps + +#define gmx_round_pr(x) _mm256_round_ps(x, 0x0) +#define GMX_SIMD_HAVE_FLOOR +#define gmx_floor_pr _mm256_floor_ps + +#define GMX_SIMD_HAVE_BLENDV +#define gmx_blendv_pr _mm256_blendv_ps + +static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) +{ + gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0); + return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b); +}; + +static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_ps(b, _mm256_andnot_ps(a, c)); }; + +/* Less-than (we use ordered, non-signaling, but that's not required) */ +#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11) +#define gmx_and_pb _mm256_and_ps +#define gmx_or_pb _mm256_or_ps + +#define GMX_SIMD_HAVE_ANYTRUE +#define gmx_anytrue_pb _mm256_movemask_ps + +#define gmx_cvttpr_epi32 _mm256_cvttps_epi32 + +#define gmx_rsqrt_pr _mm256_rsqrt_ps +#define gmx_rcp_pr _mm256_rcp_ps + +#define gmx_exp_pr gmx_mm256_exp_ps +#define gmx_sqrt_pr gmx_mm256_sqrt_ps +#define gmx_sincos_pr gmx_mm256_sincos_ps +#define gmx_acos_pr gmx_mm256_acos_ps +#define gmx_atan2_pr gmx_mm256_atan2_ps + +#else /* ifndef GMX_DOUBLE */ + +#define GMX_SIMD_WIDTH_HERE 4 + +#define gmx_mm_pr __m256d + +#define gmx_mm_pb __m256d + +/* We use 128-bit integer registers because of missing 256-bit operations */ +#define gmx_epi32 __m128i +#define GMX_SIMD_EPI32_WIDTH 4 + +#define gmx_load_pr _mm256_load_pd +#define gmx_load1_pr(x) _mm256_set1_pd((x)[0]) +#define gmx_set1_pr _mm256_set1_pd +#define gmx_setzero_pr _mm256_setzero_pd +#define gmx_store_pr _mm256_store_pd + +#define gmx_add_pr _mm256_add_pd +#define gmx_sub_pr _mm256_sub_pd +#define gmx_mul_pr _mm256_mul_pd +#define gmx_madd_pr(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b)) +#define gmx_nmsub_pr(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b)) +#define gmx_max_pr _mm256_max_pd +#define gmx_blendzero_pr _mm256_and_pd + +#define gmx_round_pr(x) _mm256_round_pd(x, 0x0) +#define GMX_SIMD_HAVE_FLOOR +#define gmx_floor_pr _mm256_floor_pd + +#define GMX_SIMD_HAVE_BLENDV +#define gmx_blendv_pr _mm256_blendv_pd + +static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) +{ + gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0); + return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b); +}; + +static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_pd(b, _mm256_andnot_pd(a, c)); }; + +/* Less-than (we use ordered, non-signaling, but that's not required) */ +#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11) + +#define gmx_and_pb _mm256_and_pd +#define gmx_or_pb _mm256_or_pd + +#define GMX_SIMD_HAVE_ANYTRUE +#define gmx_anytrue_pb _mm256_movemask_pd + +#define gmx_cvttpr_epi32 _mm256_cvttpd_epi32 + +#define gmx_rsqrt_pr(r) _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r))) +#define gmx_rcp_pr(r) 
_mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r))) + +#define gmx_exp_pr gmx_mm256_exp_pd +#define gmx_sqrt_pr gmx_mm256_sqrt_pd +#define gmx_sincos_pr gmx_mm256_sincos_pd +#define gmx_acos_pr gmx_mm256_acos_pd +#define gmx_atan2_pr gmx_mm256_atan2_pd + +#endif /* ifndef GMX_DOUBLE */ + +#endif /* 128- or 256-bit x86 SIMD */ + +#endif /* GMX_X86_SSE2 */ + + +#ifdef GMX_HAVE_SIMD_MACROS +/* Generic functions to extract a SIMD aligned pointer from a pointer x. + * x should have at least GMX_SIMD_WIDTH_HERE elements extra compared + * to how many you want to use, to avoid indexing outside the aligned region. + */ + +static gmx_inline real * +gmx_simd_align_real(const real *x) +{ + return (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1)))); +} + +static gmx_inline int * +gmx_simd_align_int(const int *x) +{ + return (int *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1)))); +} + + +/* Include the math functions which only need the above macros, + * generally these are the ones that don't need masking operations. + */ +#ifdef GMX_DOUBLE +#include "gmx_simd_math_double.h" +#else +#include "gmx_simd_math_single.h" +#endif + +#endif /* GMX_HAVE_SIMD_MACROS */ + +#endif /* _gmx_simd_macros_h_ */ diff --cc src/gromacs/legacyheaders/pme.h index 42b8428dd1,0000000000..e405926981 mode 100644,000000..100644 --- a/src/gromacs/legacyheaders/pme.h +++ b/src/gromacs/legacyheaders/pme.h @@@ -1,198 -1,0 +1,200 @@@ +/* + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gromacs Runs On Most of All Computer Systems + */ + +#ifndef _pme_h +#define _pme_h + +#include +#include "typedefs.h" +#include "gmxcomplex.h" +#include "gmx_wallcycle.h" ++#include "sim_util.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef real *splinevec[DIM]; + +enum { + GMX_SUM_QGRID_FORWARD, GMX_SUM_QGRID_BACKWARD +}; + +int gmx_pme_init(gmx_pme_t *pmedata, t_commrec *cr, + int nnodes_major, int nnodes_minor, + t_inputrec *ir, int homenr, + gmx_bool bFreeEnergy, gmx_bool bReproducible, int nthread); +/* Initialize the pme data structures resepectively. + * Return value 0 indicates all well, non zero is an error code. 
+ */ + +int gmx_pme_reinit(gmx_pme_t * pmedata, + t_commrec * cr, + gmx_pme_t pme_src, + const t_inputrec * ir, + ivec grid_size); +/* As gmx_pme_init, but takes most settings, except the grid, from pme_src */ + +int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata); +/* Destroy the pme data structures resepectively. + * Return value 0 indicates all well, non zero is an error code. + */ + +#define GMX_PME_SPREAD_Q (1<<0) +#define GMX_PME_SOLVE (1<<1) +#define GMX_PME_CALC_F (1<<2) +#define GMX_PME_CALC_ENER_VIR (1<<3) +/* This forces the grid to be backtransformed even without GMX_PME_CALC_F */ +#define GMX_PME_CALC_POT (1<<4) +#define GMX_PME_DO_ALL_F (GMX_PME_SPREAD_Q | GMX_PME_SOLVE | GMX_PME_CALC_F) + +int gmx_pme_do(gmx_pme_t pme, + int start, int homenr, + rvec x[], rvec f[], + real chargeA[], real chargeB[], + matrix box, t_commrec *cr, + int maxshift_x, int maxshift_y, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + matrix lrvir, real ewaldcoeff, + real *energy, real lambda, + real *dvdlambda, int flags); +/* Do a PME calculation for the long range electrostatics. + * flags, defined above, determine which parts of the calculation are performed. + * Return value 0 indicates all well, non zero is an error code. + */ + +int gmx_pmeonly(gmx_pme_t pme, + t_commrec *cr, t_nrnb *mynrnb, + gmx_wallcycle_t wcycle, ++ gmx_runtime_t *runtime, + real ewaldcoeff, + t_inputrec *ir); +/* Called on the nodes that do PME exclusively (as slaves) + */ + +void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V); +/* Calculate the PME grid energy V for n charges with a potential + * in the pme struct determined before with a call to gmx_pme_do + * with at least GMX_PME_SPREAD_Q and GMX_PME_SOLVE specified. + * Note that the charges are not spread on the grid in the pme struct. + * Currently does not work in parallel or with free energy. + */ + +/* The following three routines are for PME/PP node splitting in pme_pp.c */ + +/* Abstract type for PME <-> PP communication */ +typedef struct gmx_pme_pp *gmx_pme_pp_t; + +void gmx_pme_check_restrictions(int pme_order, + int nkx, int nky, int nkz, + int nnodes_major, + int nnodes_minor, + gmx_bool bUseThreads, + gmx_bool bFatal, + gmx_bool *bValidSettings); +/* Check restrictions on pme_order and the PME grid nkx,nky,nkz. + * With bFatal=TRUE, a fatal error is generated on violation, + * bValidSettings=NULL can be passed. + * With bFatal=FALSE, *bValidSettings reports the validity of the settings. + * bUseThreads tells if any MPI rank doing PME uses more than 1 threads. + * If at calling you bUseThreads is unknown, pass TRUE for conservative + * checking. + */ + +gmx_pme_pp_t gmx_pme_pp_init(t_commrec *cr); +/* Initialize the PME-only side of the PME <-> PP communication */ + +void gmx_pme_send_q(t_commrec *cr, + gmx_bool bFreeEnergy, real *chargeA, real *chargeB, + int maxshift_x, int maxshift_y); +/* Send the charges and maxshift to out PME-only node. 
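The GMX_PME_* defines above are independent bit flags that are OR-ed together into the flags argument of gmx_pme_do(). A hypothetical composition might look like the following; bCalcEnerVir is an assumed local variable, only the GMX_PME_* bits come from this header.

int flags = GMX_PME_DO_ALL_F;        /* spread charges, solve, calculate forces */

if (bCalcEnerVir)
{
    flags |= GMX_PME_CALC_ENER_VIR;  /* also accumulate mesh energy and virial  */
}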
*/ + +void gmx_pme_send_x(t_commrec *cr, matrix box, rvec *x, + gmx_bool bFreeEnergy, real lambda, + gmx_bool bEnerVir, + gmx_large_int_t step); +/* Send the coordinates to our PME-only node and request a PME calculation */ + +void gmx_pme_send_finish(t_commrec *cr); +/* Tell our PME-only node to finish */ + +void gmx_pme_send_switchgrid(t_commrec *cr, ivec grid_size, real ewaldcoeff); +/* Tell our PME-only node to switch to a new grid size */ + +void gmx_pme_send_resetcounters(t_commrec *cr, gmx_large_int_t step); +/* Tell our PME-only node to reset all cycle and flop counters */ + +void gmx_pme_receive_f(t_commrec *cr, + rvec f[], matrix vir, + real *energy, real *dvdlambda, + float *pme_cycles); +/* PP nodes receive the long range forces from the PME nodes */ + +/* Return values for gmx_pme_recv_q_x */ +enum { + pmerecvqxX, /* calculate PME mesh interactions for new x */ + pmerecvqxFINISH, /* the simulation should finish, we should quit */ + pmerecvqxSWITCHGRID, /* change the PME grid size */ + pmerecvqxRESETCOUNTERS /* reset the cycle and flop counters */ +}; + +int gmx_pme_recv_q_x(gmx_pme_pp_t pme_pp, + int *natoms, + real **chargeA, real **chargeB, + matrix box, rvec **x, rvec **f, + int *maxshift_x, int *maxshift_y, + gmx_bool *bFreeEnergy, real *lambda, + gmx_bool *bEnerVir, + gmx_large_int_t *step, + ivec grid_size, real *ewaldcoeff); +; +/* With return value: + * pmerecvqxX: all parameters set, chargeA and chargeB can be NULL + * pmerecvqxFINISH: no parameters set + * pmerecvqxSWITCHGRID: only grid_size and *ewaldcoeff are set + * pmerecvqxRESETCOUNTERS: *step is set + */ + +void gmx_pme_send_force_vir_ener(gmx_pme_pp_t pme_pp, + rvec *f, matrix vir, + real energy, real dvdlambda, + float cycles); +/* Send the PME mesh force, virial and energy to the PP-only nodes */ + +#ifdef __cplusplus +} +#endif + +#endif diff --cc src/gromacs/legacyheaders/types/idef.h index e5fb5c73da,0000000000..c8011a3be5 mode 100644,000000..100644 --- a/src/gromacs/legacyheaders/types/idef.h +++ b/src/gromacs/legacyheaders/types/idef.h @@@ -1,399 -1,0 +1,413 @@@ +/* + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. 
+ * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * GRoups of Organic Molecules in ACtion for Science + */ + + +#ifndef _idef_h +#define _idef_h + +#include "simple.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +/* check kernel/toppush.c when you change these numbers */ +#define MAXATOMLIST 6 +#define MAXFORCEPARAM 12 +#define NR_RBDIHS 6 +#define NR_FOURDIHS 4 + +typedef atom_id t_iatom; + +/* this MUST correspond to the + t_interaction_function[F_NRE] in gmxlib/ifunc.c */ +enum { + F_BONDS, + F_G96BONDS, + F_MORSE, + F_CUBICBONDS, + F_CONNBONDS, + F_HARMONIC, + F_FENEBONDS, + F_TABBONDS, + F_TABBONDSNC, + F_RESTRBONDS, + F_ANGLES, + F_G96ANGLES, + F_LINEAR_ANGLES, + F_CROSS_BOND_BONDS, + F_CROSS_BOND_ANGLES, + F_UREY_BRADLEY, + F_QUARTIC_ANGLES, + F_TABANGLES, + F_PDIHS, + F_RBDIHS, + F_FOURDIHS, + F_IDIHS, + F_PIDIHS, + F_TABDIHS, + F_CMAP, + F_GB12, + F_GB13, + F_GB14, + F_GBPOL, + F_NPSOLVATION, + F_LJ14, + F_COUL14, + F_LJC14_Q, + F_LJC_PAIRS_NB, + F_LJ, + F_BHAM, + F_LJ_LR, + F_BHAM_LR, + F_DISPCORR, + F_COUL_SR, + F_COUL_LR, + F_RF_EXCL, + F_COUL_RECIP, + F_DPD, + F_POLARIZATION, + F_WATER_POL, + F_THOLE_POL, + F_ANHARM_POL, + F_POSRES, + F_FBPOSRES, + F_DISRES, + F_DISRESVIOL, + F_ORIRES, + F_ORIRESDEV, + F_ANGRES, + F_ANGRESZ, + F_DIHRES, + F_DIHRESVIOL, + F_CONSTR, + F_CONSTRNC, + F_SETTLE, + F_VSITE2, + F_VSITE3, + F_VSITE3FD, + F_VSITE3FAD, + F_VSITE3OUT, + F_VSITE4FD, + F_VSITE4FDN, + F_VSITEN, + F_COM_PULL, + F_EQM, + F_EPOT, + F_EKIN, + F_ETOT, + F_ECONSERVED, + F_TEMP, + F_VTEMP_NOLONGERUSED, + F_PDISPCORR, + F_PRES, + F_DVDL_CONSTR, + F_DVDL, + F_DKDL, + F_DVDL_COUL, + F_DVDL_VDW, + F_DVDL_BONDED, + F_DVDL_RESTRAINT, + F_DVDL_TEMPERATURE, /* not calculated for now, but should just be the energy (NVT) or enthalpy (NPT), or 0 (NVE) */ + F_NRE /* This number is for the total number of energies */ +}; + +#define IS_RESTRAINT_TYPE(ifunc) (((ifunc == F_POSRES) || (ifunc == F_DISRES) || (ifunc == F_RESTRBONDS) || (ifunc == F_DISRESVIOL) || (ifunc == F_ORIRES) || (ifunc == F_ORIRESDEV) || (ifunc == F_ANGRES) || (ifunc == F_ANGRESZ) || (ifunc == F_DIHRES))) + +/* A macro for checking if ftype is an explicit pair-listed LJ or COULOMB + * interaction type: + * bonded LJ (usually 1-4), or special listed non-bonded for FEP. + */ +#define IS_LISTED_LJ_C(ftype) ((ftype) >= F_LJ14 && (ftype) <= F_LJC_PAIRS_NB) + +typedef union +{ + /* Some parameters have A and B values for free energy calculations. + * The B values are not used for regular simulations of course. + * Free Energy for nonbondeds can be computed by changing the atom type. 
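The A/B parameter pairs exist so that bonded parameters can be interpolated along the free-energy coupling parameter lambda. A sketch of that interpolation for the harmonic type described below; illustrative only (real is the GROMACS floating-point typedef), not the GROMACS implementation.

/* Combine the A- and B-state parameters of a harmonic term at coupling
 * parameter lambda.
 */
static void interpolate_harmonic_sketch(real lambda,
                                        real rA, real krA, real rB, real krB,
                                        real *r0, real *kr)
{
    const real L1 = 1.0 - lambda;

    *r0 = L1*rA  + lambda*rB;   /* interpolated reference length */
    *kr = L1*krA + lambda*krB;  /* interpolated force constant   */
}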
+ * The harmonic type is used for all harmonic potentials: + * bonds, angles and improper dihedrals + */ + struct { + real a, b, c; + } bham; + struct { + real rA, krA, rB, krB; + } harmonic; + struct { + real klinA, aA, klinB, aB; + } linangle; + struct { + real lowA, up1A, up2A, kA, lowB, up1B, up2B, kB; + } restraint; + /* No free energy supported for cubic bonds, FENE, WPOL or cross terms */ + struct { + real b0, kb, kcub; + } cubic; + struct { + real bm, kb; + } fene; + struct { + real r1e, r2e, krr; + } cross_bb; + struct { + real r1e, r2e, r3e, krt; + } cross_ba; + struct { + real thetaA, kthetaA, r13A, kUBA, thetaB, kthetaB, r13B, kUBB; + } u_b; + struct { + real theta, c[5]; + } qangle; + struct { + real alpha; + } polarize; + struct { + real alpha, drcut, khyp; + } anharm_polarize; + struct { + real al_x, al_y, al_z, rOH, rHH, rOD; + } wpol; + struct { + real a, alpha1, alpha2, rfac; + } thole; + struct { + real c6, c12; + } lj; + struct { + real c6A, c12A, c6B, c12B; + } lj14; + struct { + real fqq, qi, qj, c6, c12; + } ljc14; + struct { + real qi, qj, c6, c12; + } ljcnb; + /* Proper dihedrals can not have different multiplicity when + * doing free energy calculations, because the potential would not + * be periodic anymore. + */ + struct { + real phiA, cpA; int mult; real phiB, cpB; + } pdihs; + struct { + real dA, dB; + } constr; + /* Settle can not be used for Free energy calculations of water bond geometry. + * Use shake (or lincs) instead if you have to change the water bonds. + */ + struct { + real doh, dhh; + } settle; + struct { + real b0A, cbA, betaA, b0B, cbB, betaB; + } morse; + struct { + real pos0A[DIM], fcA[DIM], pos0B[DIM], fcB[DIM]; + } posres; + struct { + real pos0[DIM], r, k; int geom; + } fbposres; + struct { + real rbcA[NR_RBDIHS], rbcB[NR_RBDIHS]; + } rbdihs; + struct { + real a, b, c, d, e, f; + } vsite; + struct { + int n; real a; + } vsiten; + /* NOTE: npair is only set after reading the tpx file */ + struct { + real low, up1, up2, kfac; int type, label, npair; + } disres; + struct { + real phiA, dphiA, kfacA, phiB, dphiB, kfacB; + } dihres; + struct { + int ex, power, label; real c, obs, kfac; + } orires; + struct { + int table; real kA; real kB; + } tab; + struct { + real sar, st, pi, gbr, bmlt; + } gb; + struct { + int cmapA, cmapB; + } cmap; + struct { + real buf[MAXFORCEPARAM]; + } generic; /* Conversion */ +} t_iparams; + +typedef int t_functype; + +/* + * The nonperturbed/perturbed interactions are now separated (sorted) in the + * ilist, such that the first 0..(nr_nonperturbed-1) ones are exactly that, and + * the remaining ones from nr_nonperturbed..(nr-1) are perturbed bonded + * interactions. + */ +typedef struct +{ + int nr; + int nr_nonperturbed; + t_iatom *iatoms; + int nalloc; +} t_ilist; + +/* + * The struct t_ilist defines a list of atoms with their interactions. + * General field description: + * int nr + * the size (nr elements) of the interactions array (iatoms[]). + * t_iatom *iatoms + * specifies which atoms are involved in an interaction of a certain + * type. The layout of this array is as follows: + * + * +-----+---+---+---+-----+---+---+-----+---+---+---+-----+---+---+... + * |type1|at1|at2|at3|type2|at1|at2|type1|at1|at2|at3|type3|at1|at2| + * +-----+---+---+---+-----+---+---+-----+---+---+---+-----+---+---+... + * + * So for interaction type type1 3 atoms are needed, and for type2 and + * type3 only 2. The type identifier is used to select the function to + * calculate the interaction and its actual parameters. 
This type + * identifier is an index in a params[] and functype[] array. + */ + +typedef struct +{ + real *cmap; /* Has length 4*grid_spacing*grid_spacing, */ + /* there are 4 entries for each cmap type (V,dVdx,dVdy,d2dVdxdy) */ +} cmapdata_t; + +typedef struct +{ + int ngrid; /* Number of allocated cmap (cmapdata_t ) grids */ + int grid_spacing; /* Grid spacing */ + cmapdata_t *cmapdata; /* Pointer to grid with actual, pre-interpolated data */ +} gmx_cmap_t; + + +typedef struct +{ + int ntypes; + int atnr; + t_functype *functype; + t_iparams *iparams; + double reppow; /* The repulsion power for VdW: C12*r^-reppow */ + real fudgeQQ; /* The scaling factor for Coulomb 1-4: f*q1*q2 */ + gmx_cmap_t cmap_grid; /* The dihedral correction maps */ +} gmx_ffparams_t; + +enum { + ilsortUNKNOWN, ilsortNO_FE, ilsortFE_UNSORTED, ilsortFE_SORTED +}; + +typedef struct +{ + int ntypes; + int atnr; + t_functype *functype; + t_iparams *iparams; + real fudgeQQ; + gmx_cmap_t cmap_grid; + t_iparams *iparams_posres, *iparams_fbposres; + int iparams_posres_nalloc, iparams_fbposres_nalloc; + + t_ilist il[F_NRE]; + int ilsort; ++ int nthreads; ++ int *il_thread_division; ++ int il_thread_division_nalloc; +} t_idef; + +/* + * The struct t_idef defines all the interactions for the complete + * simulation. The structure is setup in such a way that the multinode + * version of the program can use it as easy as the single node version. + * General field description: + * int ntypes + * defines the number of elements in functype[] and param[]. + * int nodeid + * the node id (if parallel machines) + * int atnr + * the number of atomtypes + * t_functype *functype + * array of length ntypes, defines for every force type what type of + * function to use. Every "bond" with the same function but different + * force parameters is a different force type. The type identifier in the + * forceatoms[] array is an index in this array. + * t_iparams *iparams + * array of length ntypes, defines the parameters for every interaction + * type. The type identifier in the actual interaction list + * (ilist[ftype].iatoms[]) is an index in this array. + * gmx_cmap_t cmap_grid + * the grid for the dihedral pair correction maps. + * t_iparams *iparams_posres, *iparams_fbposres + * defines the parameters for position restraints only. + * Position restraints are the only interactions that have different + * parameters (reference positions) for different molecules + * of the same type. ilist[F_POSRES].iatoms[] is an index in this array. + * t_ilist il[F_NRE] + * The list of interactions for each type. Note that some, + * such as LJ and COUL will have 0 entries. ++ * int ilsort ++ * The state of the sorting of il, values are provided above. ++ * int nthreads ++ * The number of threads used to set il_thread_division. ++ * int *il_thread_division ++ * The division of the normal bonded interactions of threads. ++ * il_thread_division[ftype*(nthreads+1)+t] contains an index ++ * into il[ftype].iatoms; thread th operates on t=th to t=th+1. ++ * int il_thread_division_nalloc ++ * The allocated size of il_thread_division, ++ * should be at least F_NRE*(nthreads+1). 
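To make the il_thread_division convention concrete, here is a small sketch of how one thread walks its slice of il[ftype].iatoms. nat1 is the number of atoms per interaction plus one for the parameter-type index; the function is illustrative only, not GROMACS code.

/* Count the interactions of type ftype assigned to one thread, following
 * the convention il_thread_division[ftype*(nthreads+1) + t] documented above.
 */
static int count_thread_interactions(const t_idef *idef, int ftype,
                                     int thread, int nat1)
{
    const int *division = idef->il_thread_division + ftype*(idef->nthreads + 1);
    int        i, n     = 0;

    /* Thread "thread" owns iatoms indices [division[thread], division[thread+1]);
     * each interaction occupies nat1 consecutive entries of il[ftype].iatoms.
     */
    for (i = division[thread]; i < division[thread + 1]; i += nat1)
    {
        n++;
    }

    return n;
}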
+ */ + +typedef struct { + int n; /* n+1 is the number of points */ + real scale; /* distance between two points */ + real *data; /* the actual table data, per point there are 4 numbers */ +} bondedtable_t; + +#ifdef __cplusplus +} +#endif + + +#endif diff --cc src/gromacs/mdlib/domdec.c index e2329ddc15,0000000000..846d85e114 mode 100644,000000..100644 --- a/src/gromacs/mdlib/domdec.c +++ b/src/gromacs/mdlib/domdec.c @@@ -1,9724 -1,0 +1,9724 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of Gromacs Copyright (c) 1991-2008 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org + * + * And Hey: + * Gnomes, ROck Monsters And Chili Sauce + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include +#include "typedefs.h" +#include "smalloc.h" +#include "gmx_fatal.h" +#include "gmx_fatal_collective.h" +#include "vec.h" +#include "domdec.h" +#include "domdec_network.h" +#include "nrnb.h" +#include "pbc.h" +#include "chargegroup.h" +#include "constr.h" +#include "mdatoms.h" +#include "names.h" +#include "pdbio.h" +#include "futil.h" +#include "force.h" +#include "pme.h" +#include "pull.h" +#include "pull_rotation.h" +#include "gmx_wallcycle.h" +#include "mdrun.h" +#include "nsgrid.h" +#include "shellfc.h" +#include "mtop_util.h" +#include "gmxfio.h" +#include "gmx_ga2la.h" +#include "gmx_sort.h" +#include "macros.h" +#include "nbnxn_search.h" +#include "bondf.h" +#include "gmx_omp_nthreads.h" + +#include "gromacs/utility/gmxmpi.h" + +#define DDRANK(dd, rank) (rank) +#define DDMASTERRANK(dd) (dd->masterrank) + +typedef struct gmx_domdec_master +{ + /* The cell boundaries */ + real **cell_x; + /* The global charge group division */ + int *ncg; /* Number of home charge groups for each node */ + int *index; /* Index of nnodes+1 into cg */ + int *cg; /* Global charge group index */ + int *nat; /* Number of home atoms for each node. */ + int *ibuf; /* Buffer for communication */ + rvec *vbuf; /* Buffer for state scattering and gathering */ +} gmx_domdec_master_t; + +typedef struct +{ + /* The numbers of charge groups to send and receive for each cell + * that requires communication, the last entry contains the total + * number of atoms that needs to be communicated. + */ + int nsend[DD_MAXIZONE+2]; + int nrecv[DD_MAXIZONE+2]; + /* The charge groups to send */ + int *index; + int nalloc; + /* The atom range for non-in-place communication */ + int cell2at0[DD_MAXIZONE]; + int cell2at1[DD_MAXIZONE]; +} gmx_domdec_ind_t; + +typedef struct +{ + int np; /* Number of grid pulses in this dimension */ + int np_dlb; /* For dlb, for use with edlbAUTO */ + gmx_domdec_ind_t *ind; /* The indices to communicate, size np */ + int np_nalloc; + gmx_bool bInPlace; /* Can we communicate in place? */ +} gmx_domdec_comm_dim_t; + +typedef struct +{ + gmx_bool *bCellMin; /* Temp. var.: is this cell size at the limit */ + real *cell_f; /* State var.: cell boundaries, box relative */ + real *old_cell_f; /* Temp. 
var.: old cell size */ + real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */ + real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */ + real *bound_min; /* Temp. var.: lower limit for cell boundary */ + real *bound_max; /* Temp. var.: upper limit for cell boundary */ + gmx_bool bLimited; /* State var.: is DLB limited in this dim and row */ + real *buf_ncd; /* Temp. var. */ +} gmx_domdec_root_t; + +#define DD_NLOAD_MAX 9 + +/* Here floats are accurate enough, since these variables + * only influence the load balancing, not the actual MD results. + */ +typedef struct +{ + int nload; + float *load; + float sum; + float max; + float sum_m; + float cvol_min; + float mdf; + float pme; + int flags; +} gmx_domdec_load_t; + +typedef struct +{ + int nsc; + int ind_gl; + int ind; +} gmx_cgsort_t; + +typedef struct +{ + gmx_cgsort_t *sort; + gmx_cgsort_t *sort2; + int sort_nalloc; + gmx_cgsort_t *sort_new; + int sort_new_nalloc; + int *ibuf; + int ibuf_nalloc; +} gmx_domdec_sort_t; + +typedef struct +{ + rvec *v; + int nalloc; +} vec_rvec_t; + +/* This enum determines the order of the coordinates. + * ddnatHOME and ddnatZONE should be first and second, + * the others can be ordered as wanted. + */ +enum { + ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR +}; + +enum { + edlbAUTO, edlbNO, edlbYES, edlbNR +}; +const char *edlb_names[edlbNR] = { "auto", "no", "yes" }; + +typedef struct +{ + int dim; /* The dimension */ + gmx_bool dim_match; /* Tells if DD and PME dims match */ + int nslab; /* The number of PME slabs in this dimension */ + real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB */ + int *pp_min; /* The minimum pp node location, size nslab */ + int *pp_max; /* The maximum pp node location,size nslab */ + int maxshift; /* The maximum shift for coordinate redistribution in PME */ +} gmx_ddpme_t; + +typedef struct +{ + real min0; /* The minimum bottom of this zone */ + real max1; /* The maximum top of this zone */ + real min1; /* The minimum top of this zone */ + real mch0; /* The maximum bottom communicaton height for this zone */ + real mch1; /* The maximum top communicaton height for this zone */ + real p1_0; /* The bottom value of the first cell in this zone */ + real p1_1; /* The top value of the first cell in this zone */ +} gmx_ddzone_t; + +typedef struct +{ + gmx_domdec_ind_t ind; + int *ibuf; + int ibuf_nalloc; + vec_rvec_t vbuf; + int nsend; + int nat; + int nsend_zone; +} dd_comm_setup_work_t; + +typedef struct gmx_domdec_comm +{ + /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing), + * unless stated otherwise. + */ + + /* The number of decomposition dimensions for PME, 0: no PME */ + int npmedecompdim; + /* The number of nodes doing PME (PP/PME or only PME) */ + int npmenodes; + int npmenodes_x; + int npmenodes_y; + /* The communication setup including the PME only nodes */ + gmx_bool bCartesianPP_PME; + ivec ntot; + int cartpmedim; + int *pmenodes; /* size npmenodes */ + int *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP + * but with bCartesianPP_PME */ + gmx_ddpme_t ddpme[2]; + + /* The DD particle-particle nodes only */ + gmx_bool bCartesianPP; + int *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */ + + /* The global charge groups */ + t_block cgs_gl; + + /* Should we sort the cgs */ + int nstSortCG; + gmx_domdec_sort_t *sort; + + /* Are there charge groups? */ + gmx_bool bCGs; + + /* Are there bonded and multi-body interactions between charge groups? 
*/ + gmx_bool bInterCGBondeds; + gmx_bool bInterCGMultiBody; + + /* Data for the optional bonded interaction atom communication range */ + gmx_bool bBondComm; + t_blocka *cglink; + char *bLocalCG; + + /* The DLB option */ + int eDLB; + /* Are we actually using DLB? */ + gmx_bool bDynLoadBal; + + /* Cell sizes for static load balancing, first index cartesian */ + real **slb_frac; + + /* The width of the communicated boundaries */ + real cutoff_mbody; + real cutoff; + /* The minimum cell size (including triclinic correction) */ + rvec cellsize_min; + /* For dlb, for use with edlbAUTO */ + rvec cellsize_min_dlb; + /* The lower limit for the DD cell size with DLB */ + real cellsize_limit; + /* Effectively no NB cut-off limit with DLB for systems without PBC? */ + gmx_bool bVacDLBNoLimit; + + /* With PME load balancing we set limits on DLB */ + gmx_bool bPMELoadBalDLBLimits; + /* DLB needs to take into account that we want to allow this maximum + * cut-off (for PME load balancing), this could limit cell boundaries. + */ + real PMELoadBal_max_cutoff; + + /* tric_dir is only stored here because dd_get_ns_ranges needs it */ + ivec tric_dir; + /* box0 and box_size are required with dim's without pbc and -gcom */ + rvec box0; + rvec box_size; + + /* The cell boundaries */ + rvec cell_x0; + rvec cell_x1; + + /* The old location of the cell boundaries, to check cg displacements */ + rvec old_cell_x0; + rvec old_cell_x1; + + /* The communication setup and charge group boundaries for the zones */ + gmx_domdec_zones_t zones; + + /* The zone limits for DD dimensions 1 and 2 (not 0), determined from + * cell boundaries of neighboring cells for dynamic load balancing. + */ + gmx_ddzone_t zone_d1[2]; + gmx_ddzone_t zone_d2[2][2]; + + /* The coordinate/force communication setup and indices */ + gmx_domdec_comm_dim_t cd[DIM]; + /* The maximum number of cells to communicate with in one dimension */ + int maxpulse; + + /* Which cg distribution is stored on the master node */ + int master_cg_ddp_count; + + /* The number of cg's received from the direct neighbors */ + int zone_ncg1[DD_MAXZONE]; + + /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */ + int nat[ddnatNR]; + + /* Array for signalling if atoms have moved to another domain */ + int *moved; + int moved_nalloc; + + /* Communication buffer for general use */ + int *buf_int; + int nalloc_int; + + /* Communication buffer for general use */ + vec_rvec_t vbuf; + + /* Temporary storage for thread parallel communication setup */ + int nth; + dd_comm_setup_work_t *dth; + + /* Communication buffers only used with multiple grid pulses */ + int *buf_int2; + int nalloc_int2; + vec_rvec_t vbuf2; + + /* Communication buffers for local redistribution */ + int **cggl_flag; + int cggl_flag_nalloc[DIM*2]; + rvec **cgcm_state; + int cgcm_state_nalloc[DIM*2]; + + /* Cell sizes for dynamic load balancing */ + gmx_domdec_root_t **root; + real *cell_f_row; + real cell_f0[DIM]; + real cell_f1[DIM]; + real cell_f_max0[DIM]; + real cell_f_min1[DIM]; + + /* Stuff for load communication */ + gmx_bool bRecordLoad; + gmx_domdec_load_t *load; +#ifdef GMX_MPI + MPI_Comm *mpi_comm_load; +#endif + + /* Maximum DLB scaling per load balancing step in percent */ + int dlb_scale_lim; + + /* Cycle counters */ + float cycl[ddCyclNr]; + int cycl_n[ddCyclNr]; + float cycl_max[ddCyclNr]; + /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */ + int eFlop; + double flop; + int flop_n; + /* Have often have did we have load measurements */ + int n_load_have; + /* Have often 
have we collected the load measurements */ + int n_load_collect; + + /* Statistics */ + double sum_nat[ddnatNR-ddnatZONE]; + int ndecomp; + int nload; + double load_step; + double load_sum; + double load_max; + ivec load_lim; + double load_mdf; + double load_pme; + + /* The last partition step */ + gmx_large_int_t partition_step; + + /* Debugging */ + int nstDDDump; + int nstDDDumpGrid; + int DD_debug; +} gmx_domdec_comm_t; + +/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */ +#define DD_CGIBS 2 + +/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */ +#define DD_FLAG_NRCG 65535 +#define DD_FLAG_FW(d) (1<<(16+(d)*2)) +#define DD_FLAG_BW(d) (1<<(16+(d)*2+1)) + +/* Zone permutation required to obtain consecutive charge groups + * for neighbor searching. + */ +static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} }; + +/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero + * components see only j zones with that component 0. + */ + +/* The DD zone order */ +static const ivec dd_zo[DD_MAXZONE] = +{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}}; + +/* The 3D setup */ +#define dd_z3n 8 +#define dd_zp3n 4 +static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}}; + +/* The 2D setup */ +#define dd_z2n 4 +#define dd_zp2n 2 +static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}}; + +/* The 1D setup */ +#define dd_z1n 2 +#define dd_zp1n 1 +static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}}; + +/* Factors used to avoid problems due to rounding issues */ +#define DD_CELL_MARGIN 1.0001 +#define DD_CELL_MARGIN2 1.00005 +/* Factor to account for pressure scaling during nstlist steps */ +#define DD_PRES_SCALE_MARGIN 1.02 + +/* Allowed performance loss before we DLB or warn */ +#define DD_PERF_LOSS 0.05 + +#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di)) + +/* Use separate MPI send and receive commands + * when nnodes <= GMX_DD_NNODES_SENDRECV. + * This saves memory (and some copying for small nnodes). + * For high parallelization scatter and gather calls are used. + */ +#define GMX_DD_NNODES_SENDRECV 4 + + +/* + #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX]) + + static void index2xyz(ivec nc,int ind,ivec xyz) + { + xyz[XX] = ind % nc[XX]; + xyz[YY] = (ind / nc[XX]) % nc[YY]; + xyz[ZZ] = ind / (nc[YY]*nc[XX]); + } + */ + +/* This order is required to minimize the coordinate communication in PME + * which uses decomposition in the x direction. 
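+ * As a purely illustrative example (hypothetical grid, not from any real
+ * setup): with nc = {2, 3, 4} the macro below maps cell (1, 2, 3) to
+ * ((1*3 + 2)*4) + 3 = 23, so z varies fastest and x slowest, and
+ * ddindex2xyz() inverts it: 23/(3*4) = 1, (23/4) % 3 = 2, 23 % 4 = 3.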
+ */ +#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ]) + +static void ddindex2xyz(ivec nc, int ind, ivec xyz) +{ + xyz[XX] = ind / (nc[YY]*nc[ZZ]); + xyz[YY] = (ind / nc[ZZ]) % nc[YY]; + xyz[ZZ] = ind % nc[ZZ]; +} + +static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c) +{ + int ddindex; + int ddnodeid = -1; + + ddindex = dd_index(dd->nc, c); + if (dd->comm->bCartesianPP_PME) + { + ddnodeid = dd->comm->ddindex2ddnodeid[ddindex]; + } + else if (dd->comm->bCartesianPP) + { +#ifdef GMX_MPI + MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid); +#endif + } + else + { + ddnodeid = ddindex; + } + + return ddnodeid; +} + +static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir) +{ + return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir)); +} + +int ddglatnr(gmx_domdec_t *dd, int i) +{ + int atnr; + + if (dd == NULL) + { + atnr = i + 1; + } + else + { + if (i >= dd->comm->nat[ddnatNR-1]) + { + gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]); + } + atnr = dd->gatindex[i] + 1; + } + + return atnr; +} + +t_block *dd_charge_groups_global(gmx_domdec_t *dd) +{ + return &dd->comm->cgs_gl; +} + +static void vec_rvec_init(vec_rvec_t *v) +{ + v->nalloc = 0; + v->v = NULL; +} + +static void vec_rvec_check_alloc(vec_rvec_t *v, int n) +{ + if (n > v->nalloc) + { + v->nalloc = over_alloc_dd(n); + srenew(v->v, v->nalloc); + } +} + +void dd_store_state(gmx_domdec_t *dd, t_state *state) +{ + int i; + + if (state->ddp_count != dd->ddp_count) + { + gmx_incons("The state does not the domain decomposition state"); + } + + state->ncg_gl = dd->ncg_home; + if (state->ncg_gl > state->cg_gl_nalloc) + { + state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl); + srenew(state->cg_gl, state->cg_gl_nalloc); + } + for (i = 0; i < state->ncg_gl; i++) + { + state->cg_gl[i] = dd->index_gl[i]; + } + + state->ddp_count_cg_gl = dd->ddp_count; +} + +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd) +{ + return &dd->comm->zones; +} + +void dd_get_ns_ranges(gmx_domdec_t *dd, int icg, + int *jcg0, int *jcg1, ivec shift0, ivec shift1) +{ + gmx_domdec_zones_t *zones; + int izone, d, dim; + + zones = &dd->comm->zones; + + izone = 0; + while (icg >= zones->izone[izone].cg1) + { + izone++; + } + + if (izone == 0) + { + *jcg0 = icg; + } + else if (izone < zones->nizone) + { + *jcg0 = zones->izone[izone].jcg0; + } + else + { + gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)", + icg, izone, zones->nizone); + } + + *jcg1 = zones->izone[izone].jcg1; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + shift0[dim] = zones->izone[izone].shift0[dim]; + shift1[dim] = zones->izone[izone].shift1[dim]; + if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0)) + { + /* A conservative approach, this can be optimized */ + shift0[dim] -= 1; + shift1[dim] += 1; + } + } +} + +int dd_natoms_vsite(gmx_domdec_t *dd) +{ + return dd->comm->nat[ddnatVSITE]; +} + +void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end) +{ + *at_start = dd->comm->nat[ddnatCON-1]; + *at_end = dd->comm->nat[ddnatCON]; +} + +void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[]) +{ + int nzone, nat_tot, n, d, p, i, j, at0, at1, zone; + int *index, *cgindex; + gmx_domdec_comm_t *comm; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + rvec shift = {0, 0, 0}, *buf, *rbuf; + gmx_bool bPBC, bScrew; + + comm = dd->comm; + + cgindex = dd->cgindex; + + buf = comm->vbuf.v; + + nzone = 1; + nat_tot = dd->nat_home; + for (d = 0; d < dd->ndim; d++) 
+ { + bPBC = (dd->ci[dd->dim[d]] == 0); + bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX); + if (bPBC) + { + copy_rvec(box[dd->dim[d]], shift); + } + cd = &comm->cd[d]; + for (p = 0; p < cd->np; p++) + { + ind = &cd->ind[p]; + index = ind->index; + n = 0; + if (!bPBC) + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + copy_rvec(x[j], buf[n]); + n++; + } + } + } + else if (!bScrew) + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + /* We need to shift the coordinates */ + rvec_add(x[j], shift, buf[n]); + n++; + } + } + } + else + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + /* Shift x */ + buf[n][XX] = x[j][XX] + shift[XX]; + /* Rotate y and z. + * This operation requires a special shift force + * treatment, which is performed in calc_vir. + */ + buf[n][YY] = box[YY][YY] - x[j][YY]; + buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ]; + n++; + } + } + } + + if (cd->bInPlace) + { + rbuf = x + nat_tot; + } + else + { + rbuf = comm->vbuf2.v; + } + /* Send and receive the coordinates */ + dd_sendrecv_rvec(dd, d, dddirBackward, + buf, ind->nsend[nzone+1], + rbuf, ind->nrecv[nzone+1]); + if (!cd->bInPlace) + { + j = 0; + for (zone = 0; zone < nzone; zone++) + { + for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++) + { + copy_rvec(rbuf[j], x[i]); + j++; + } + } + } + nat_tot += ind->nrecv[nzone+1]; + } + nzone += nzone; + } +} + +void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift) +{ + int nzone, nat_tot, n, d, p, i, j, at0, at1, zone; + int *index, *cgindex; + gmx_domdec_comm_t *comm; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + rvec *buf, *sbuf; + ivec vis; + int is; + gmx_bool bPBC, bScrew; + + comm = dd->comm; + + cgindex = dd->cgindex; + + buf = comm->vbuf.v; + + n = 0; + nzone = comm->zones.n/2; + nat_tot = dd->nat_tot; + for (d = dd->ndim-1; d >= 0; d--) + { + bPBC = (dd->ci[dd->dim[d]] == 0); + bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX); + if (fshift == NULL && !bScrew) + { + bPBC = FALSE; + } + /* Determine which shift vector we need */ + clear_ivec(vis); + vis[dd->dim[d]] = 1; + is = IVEC2IS(vis); + + cd = &comm->cd[d]; + for (p = cd->np-1; p >= 0; p--) + { + ind = &cd->ind[p]; + nat_tot -= ind->nrecv[nzone+1]; + if (cd->bInPlace) + { + sbuf = f + nat_tot; + } + else + { + sbuf = comm->vbuf2.v; + j = 0; + for (zone = 0; zone < nzone; zone++) + { + for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++) + { + copy_rvec(f[i], sbuf[j]); + j++; + } + } + } + /* Communicate the forces */ + dd_sendrecv_rvec(dd, d, dddirForward, + sbuf, ind->nrecv[nzone+1], + buf, ind->nsend[nzone+1]); + index = ind->index; + /* Add the received forces */ + n = 0; + if (!bPBC) + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + rvec_inc(f[j], buf[n]); + n++; + } + } + } + else if (!bScrew) + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + rvec_inc(f[j], buf[n]); + /* Add this force to the shift force */ + rvec_inc(fshift[is], buf[n]); + n++; + } + } + } + else + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + /* Rotate the force */ + f[j][XX] 
+= buf[n][XX]; + f[j][YY] -= buf[n][YY]; + f[j][ZZ] -= buf[n][ZZ]; + if (fshift) + { + /* Add this force to the shift force */ + rvec_inc(fshift[is], buf[n]); + } + n++; + } + } + } + } + nzone /= 2; + } +} + +void dd_atom_spread_real(gmx_domdec_t *dd, real v[]) +{ + int nzone, nat_tot, n, d, p, i, j, at0, at1, zone; + int *index, *cgindex; + gmx_domdec_comm_t *comm; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + real *buf, *rbuf; + + comm = dd->comm; + + cgindex = dd->cgindex; + + buf = &comm->vbuf.v[0][0]; + + nzone = 1; + nat_tot = dd->nat_home; + for (d = 0; d < dd->ndim; d++) + { + cd = &comm->cd[d]; + for (p = 0; p < cd->np; p++) + { + ind = &cd->ind[p]; + index = ind->index; + n = 0; + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + buf[n] = v[j]; + n++; + } + } + + if (cd->bInPlace) + { + rbuf = v + nat_tot; + } + else + { + rbuf = &comm->vbuf2.v[0][0]; + } + /* Send and receive the coordinates */ + dd_sendrecv_real(dd, d, dddirBackward, + buf, ind->nsend[nzone+1], + rbuf, ind->nrecv[nzone+1]); + if (!cd->bInPlace) + { + j = 0; + for (zone = 0; zone < nzone; zone++) + { + for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++) + { + v[i] = rbuf[j]; + j++; + } + } + } + nat_tot += ind->nrecv[nzone+1]; + } + nzone += nzone; + } +} + +void dd_atom_sum_real(gmx_domdec_t *dd, real v[]) +{ + int nzone, nat_tot, n, d, p, i, j, at0, at1, zone; + int *index, *cgindex; + gmx_domdec_comm_t *comm; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + real *buf, *sbuf; + + comm = dd->comm; + + cgindex = dd->cgindex; + + buf = &comm->vbuf.v[0][0]; + + n = 0; + nzone = comm->zones.n/2; + nat_tot = dd->nat_tot; + for (d = dd->ndim-1; d >= 0; d--) + { + cd = &comm->cd[d]; + for (p = cd->np-1; p >= 0; p--) + { + ind = &cd->ind[p]; + nat_tot -= ind->nrecv[nzone+1]; + if (cd->bInPlace) + { + sbuf = v + nat_tot; + } + else + { + sbuf = &comm->vbuf2.v[0][0]; + j = 0; + for (zone = 0; zone < nzone; zone++) + { + for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++) + { + sbuf[j] = v[i]; + j++; + } + } + } + /* Communicate the forces */ + dd_sendrecv_real(dd, d, dddirForward, + sbuf, ind->nrecv[nzone+1], + buf, ind->nsend[nzone+1]); + index = ind->index; + /* Add the received forces */ + n = 0; + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + v[j] += buf[n]; + n++; + } + } + } + nzone /= 2; + } +} + +static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone) +{ + fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n", + d, i, j, + zone->min0, zone->max1, + zone->mch0, zone->mch0, + zone->p1_0, zone->p1_1); +} + + +#define DDZONECOMM_MAXZONE 5 +#define DDZONECOMM_BUFSIZE 3 + +static void dd_sendrecv_ddzone(const gmx_domdec_t *dd, + int ddimind, int direction, + gmx_ddzone_t *buf_s, int n_s, + gmx_ddzone_t *buf_r, int n_r) +{ +#define ZBS DDZONECOMM_BUFSIZE + rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS]; + rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS]; + int i; + + for (i = 0; i < n_s; i++) + { + vbuf_s[i*ZBS ][0] = buf_s[i].min0; + vbuf_s[i*ZBS ][1] = buf_s[i].max1; + vbuf_s[i*ZBS ][2] = buf_s[i].min1; + vbuf_s[i*ZBS+1][0] = buf_s[i].mch0; + vbuf_s[i*ZBS+1][1] = buf_s[i].mch1; + vbuf_s[i*ZBS+1][2] = 0; + vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0; + vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1; + vbuf_s[i*ZBS+2][2] = 0; + } + + dd_sendrecv_rvec(dd, ddimind, direction, + vbuf_s, 
n_s*ZBS, + vbuf_r, n_r*ZBS); + + for (i = 0; i < n_r; i++) + { + buf_r[i].min0 = vbuf_r[i*ZBS ][0]; + buf_r[i].max1 = vbuf_r[i*ZBS ][1]; + buf_r[i].min1 = vbuf_r[i*ZBS ][2]; + buf_r[i].mch0 = vbuf_r[i*ZBS+1][0]; + buf_r[i].mch1 = vbuf_r[i*ZBS+1][1]; + buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0]; + buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1]; + } + +#undef ZBS +} + +static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox, + rvec cell_ns_x0, rvec cell_ns_x1) +{ + int d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min; + gmx_ddzone_t *zp; + gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE]; + gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE]; + gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE]; + rvec extr_s[2], extr_r[2]; + rvec dh; + real dist_d, c = 0, det; + gmx_domdec_comm_t *comm; + gmx_bool bPBC, bUse; + + comm = dd->comm; + + for (d = 1; d < dd->ndim; d++) + { + dim = dd->dim[d]; + zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0]; + zp->min0 = cell_ns_x0[dim]; + zp->max1 = cell_ns_x1[dim]; + zp->min1 = cell_ns_x1[dim]; + zp->mch0 = cell_ns_x0[dim]; + zp->mch1 = cell_ns_x1[dim]; + zp->p1_0 = cell_ns_x0[dim]; + zp->p1_1 = cell_ns_x1[dim]; + } + + for (d = dd->ndim-2; d >= 0; d--) + { + dim = dd->dim[d]; + bPBC = (dim < ddbox->npbcdim); + + /* Use an rvec to store two reals */ + extr_s[d][0] = comm->cell_f0[d+1]; + extr_s[d][1] = comm->cell_f1[d+1]; + extr_s[d][2] = comm->cell_f1[d+1]; + + pos = 0; + /* Store the extremes in the backward sending buffer, + * so the get updated separately from the forward communication. + */ + for (d1 = d; d1 < dd->ndim-1; d1++) + { + /* We invert the order to be able to use the same loop for buf_e */ + buf_s[pos].min0 = extr_s[d1][1]; + buf_s[pos].max1 = extr_s[d1][0]; + buf_s[pos].min1 = extr_s[d1][2]; + buf_s[pos].mch0 = 0; + buf_s[pos].mch1 = 0; + /* Store the cell corner of the dimension we communicate along */ + buf_s[pos].p1_0 = comm->cell_x0[dim]; + buf_s[pos].p1_1 = 0; + pos++; + } + + buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0]; + pos++; + + if (dd->ndim == 3 && d == 0) + { + buf_s[pos] = comm->zone_d2[0][1]; + pos++; + buf_s[pos] = comm->zone_d1[0]; + pos++; + } + + /* We only need to communicate the extremes + * in the forward direction + */ + npulse = comm->cd[d].np; + if (bPBC) + { + /* Take the minimum to avoid double communication */ + npulse_min = min(npulse, dd->nc[dim]-1-npulse); + } + else + { + /* Without PBC we should really not communicate over + * the boundaries, but implementing that complicates + * the communication setup and therefore we simply + * do all communication, but ignore some data. 
+ */ + npulse_min = npulse; + } + for (p = 0; p < npulse_min; p++) + { + /* Communicate the extremes forward */ + bUse = (bPBC || dd->ci[dim] > 0); + + dd_sendrecv_rvec(dd, d, dddirForward, + extr_s+d, dd->ndim-d-1, + extr_r+d, dd->ndim-d-1); + + if (bUse) + { + for (d1 = d; d1 < dd->ndim-1; d1++) + { + extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]); + extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]); + extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]); + } + } + } + + buf_size = pos; + for (p = 0; p < npulse; p++) + { + /* Communicate all the zone information backward */ + bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1); + + dd_sendrecv_ddzone(dd, d, dddirBackward, + buf_s, buf_size, + buf_r, buf_size); + + clear_rvec(dh); + if (p > 0) + { + for (d1 = d+1; d1 < dd->ndim; d1++) + { + /* Determine the decrease of maximum required + * communication height along d1 due to the distance along d, + * this avoids a lot of useless atom communication. + */ + dist_d = comm->cell_x1[dim] - buf_r[0].p1_0; + + if (ddbox->tric_dir[dim]) + { + /* c is the off-diagonal coupling between the cell planes + * along directions d and d1. + */ + c = ddbox->v[dim][dd->dim[d1]][dim]; + } + else + { + c = 0; + } + det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d; + if (det > 0) + { + dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c); + } + else + { + /* A negative value signals out of range */ + dh[d1] = -1; + } + } + } + + /* Accumulate the extremes over all pulses */ + for (i = 0; i < buf_size; i++) + { + if (p == 0) + { + buf_e[i] = buf_r[i]; + } + else + { + if (bUse) + { + buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0); + buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1); + buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1); + } + + if (dd->ndim == 3 && d == 0 && i == buf_size - 1) + { + d1 = 1; + } + else + { + d1 = d + 1; + } + if (bUse && dh[d1] >= 0) + { + buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]); + buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]); + } + } + /* Copy the received buffer to the send buffer, + * to pass the data through with the next pulse. 
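+                 * For illustration: with, say, three pulses along this
+                 * dimension, the zone limits of the cell three away only
+                 * arrive on the third pulse, because each pulse re-sends
+                 * what the previous pulse received.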
+ */ + buf_s[i] = buf_r[i]; + } + if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) || + (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1)) + { + /* Store the extremes */ + pos = 0; + + for (d1 = d; d1 < dd->ndim-1; d1++) + { + extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0); + extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1); + extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1); + pos++; + } + + if (d == 1 || (d == 0 && dd->ndim == 3)) + { + for (i = d; i < 2; i++) + { + comm->zone_d2[1-d][i] = buf_e[pos]; + pos++; + } + } + if (d == 0) + { + comm->zone_d1[1] = buf_e[pos]; + pos++; + } + } + } + } + + if (dd->ndim >= 2) + { + dim = dd->dim[1]; + for (i = 0; i < 2; i++) + { + if (debug) + { + print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]); + } + cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0); + cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1); + } + } + if (dd->ndim >= 3) + { + dim = dd->dim[2]; + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + if (debug) + { + print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]); + } + cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0); + cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1); + } + } + } + for (d = 1; d < dd->ndim; d++) + { + comm->cell_f_max0[d] = extr_s[d-1][0]; + comm->cell_f_min1[d] = extr_s[d-1][1]; + if (debug) + { + fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n", + d, comm->cell_f_max0[d], comm->cell_f_min1[d]); + } + } +} + +static void dd_collect_cg(gmx_domdec_t *dd, + t_state *state_local) +{ + gmx_domdec_master_t *ma = NULL; + int buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0; + t_block *cgs_gl; + + if (state_local->ddp_count == dd->comm->master_cg_ddp_count) + { + /* The master has the correct distribution */ + return; + } + + if (state_local->ddp_count == dd->ddp_count) + { + ncg_home = dd->ncg_home; + cg = dd->index_gl; + nat_home = dd->nat_home; + } + else if (state_local->ddp_count_cg_gl == state_local->ddp_count) + { + cgs_gl = &dd->comm->cgs_gl; + + ncg_home = state_local->ncg_gl; + cg = state_local->cg_gl; + nat_home = 0; + for (i = 0; i < ncg_home; i++) + { + nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]]; + } + } + else + { + gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown"); + } + + buf2[0] = dd->ncg_home; + buf2[1] = dd->nat_home; + if (DDMASTER(dd)) + { + ma = dd->ma; + ibuf = ma->ibuf; + } + else + { + ibuf = NULL; + } + /* Collect the charge group and atom counts on the master */ + dd_gather(dd, 2*sizeof(int), buf2, ibuf); + + if (DDMASTER(dd)) + { + ma->index[0] = 0; + for (i = 0; i < dd->nnodes; i++) + { + ma->ncg[i] = ma->ibuf[2*i]; + ma->nat[i] = ma->ibuf[2*i+1]; + ma->index[i+1] = ma->index[i] + ma->ncg[i]; + + } + /* Make byte counts and indices */ + for (i = 0; i < dd->nnodes; i++) + { + ma->ibuf[i] = ma->ncg[i]*sizeof(int); + ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int); + } + if (debug) + { + fprintf(debug, "Initial charge group distribution: "); + for (i = 0; i < dd->nnodes; i++) + { + fprintf(debug, " %d", ma->ncg[i]); + } + fprintf(debug, "\n"); + } + } + + /* Collect the charge group indices on the master */ + dd_gatherv(dd, + dd->ncg_home*sizeof(int), dd->index_gl, + DDMASTER(dd) ? ma->ibuf : NULL, + DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL, + DDMASTER(dd) ? 
ma->cg : NULL); + + dd->comm->master_cg_ddp_count = state_local->ddp_count; +} + +static void dd_collect_vec_sendrecv(gmx_domdec_t *dd, + rvec *lv, rvec *v) +{ + gmx_domdec_master_t *ma; + int n, i, c, a, nalloc = 0; + rvec *buf = NULL; + t_block *cgs_gl; + + ma = dd->ma; + + if (!DDMASTER(dd)) + { +#ifdef GMX_MPI + MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd), + dd->rank, dd->mpi_comm_all); +#endif + } + else + { + /* Copy the master coordinates to the global array */ + cgs_gl = &dd->comm->cgs_gl; + + n = DDMASTERRANK(dd); + a = 0; + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++) + { + copy_rvec(lv[a++], v[c]); + } + } + + for (n = 0; n < dd->nnodes; n++) + { + if (n != dd->rank) + { + if (ma->nat[n] > nalloc) + { + nalloc = over_alloc_dd(ma->nat[n]); + srenew(buf, nalloc); + } +#ifdef GMX_MPI + MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n), + n, dd->mpi_comm_all, MPI_STATUS_IGNORE); +#endif + a = 0; + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++) + { + copy_rvec(buf[a++], v[c]); + } + } + } + } + sfree(buf); + } +} + +static void get_commbuffer_counts(gmx_domdec_t *dd, + int **counts, int **disps) +{ + gmx_domdec_master_t *ma; + int n; + + ma = dd->ma; + + /* Make the rvec count and displacment arrays */ + *counts = ma->ibuf; + *disps = ma->ibuf + dd->nnodes; + for (n = 0; n < dd->nnodes; n++) + { + (*counts)[n] = ma->nat[n]*sizeof(rvec); + (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]); + } +} + +static void dd_collect_vec_gatherv(gmx_domdec_t *dd, + rvec *lv, rvec *v) +{ + gmx_domdec_master_t *ma; + int *rcounts = NULL, *disps = NULL; + int n, i, c, a; + rvec *buf = NULL; + t_block *cgs_gl; + + ma = dd->ma; + + if (DDMASTER(dd)) + { + get_commbuffer_counts(dd, &rcounts, &disps); + + buf = ma->vbuf; + } + + dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf); + + if (DDMASTER(dd)) + { + cgs_gl = &dd->comm->cgs_gl; + + a = 0; + for (n = 0; n < dd->nnodes; n++) + { + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++) + { + copy_rvec(buf[a++], v[c]); + } + } + } + } +} + +void dd_collect_vec(gmx_domdec_t *dd, + t_state *state_local, rvec *lv, rvec *v) +{ + gmx_domdec_master_t *ma; + int n, i, c, a, nalloc = 0; + rvec *buf = NULL; + + dd_collect_cg(dd, state_local); + + if (dd->nnodes <= GMX_DD_NNODES_SENDRECV) + { + dd_collect_vec_sendrecv(dd, lv, v); + } + else + { + dd_collect_vec_gatherv(dd, lv, v); + } +} + + +void dd_collect_state(gmx_domdec_t *dd, + t_state *state_local, t_state *state) +{ + int est, i, j, nh; + + nh = state->nhchainlength; + + if (DDMASTER(dd)) + { + for (i = 0; i < efptNR; i++) + { + state->lambda[i] = state_local->lambda[i]; + } + state->fep_state = state_local->fep_state; + state->veta = state_local->veta; + state->vol0 = state_local->vol0; + copy_mat(state_local->box, state->box); + copy_mat(state_local->boxv, state->boxv); + copy_mat(state_local->svir_prev, state->svir_prev); + copy_mat(state_local->fvir_prev, state->fvir_prev); + copy_mat(state_local->pres_prev, state->pres_prev); + + + for (i = 0; i < state_local->ngtc; i++) + { + for (j = 0; j < nh; j++) + { + state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j]; + state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j]; + } + state->therm_integral[i] = state_local->therm_integral[i]; + } 
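+        /* Illustrative aside (hypothetical sizes, not from any real run):
+         * the Nose-Hoover arrays copied above are flattened per coupling
+         * group, so entry j of the chain of group i sits at index i*nh+j.
+         * With ngtc = 2 groups and chain length nh = 3, the middle entry
+         * of group 1 is nosehoover_xi[1*3+1] = nosehoover_xi[4].
+         */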
+ for (i = 0; i < state_local->nnhpres; i++) + { + for (j = 0; j < nh; j++) + { + state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j]; + state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j]; + } + } + } + for (est = 0; est < estNR; est++) + { + if (EST_DISTR(est) && (state_local->flags & (1<x, state->x); + break; + case estV: + dd_collect_vec(dd, state_local, state_local->v, state->v); + break; + case estSDX: + dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X); + break; + case estCGP: + dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p); + break; + case estLD_RNG: + if (state->nrngi == 1) + { + if (DDMASTER(dd)) + { + for (i = 0; i < state_local->nrng; i++) + { + state->ld_rng[i] = state_local->ld_rng[i]; + } + } + } + else + { + dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]), + state_local->ld_rng, state->ld_rng); + } + break; + case estLD_RNGI: + if (state->nrngi == 1) + { + if (DDMASTER(dd)) + { + state->ld_rngi[0] = state_local->ld_rngi[0]; + } + } + else + { + dd_gather(dd, sizeof(state->ld_rngi[0]), + state_local->ld_rngi, state->ld_rngi); + } + break; + case estDISRE_INITF: + case estDISRE_RM3TAV: + case estORIRE_INITF: + case estORIRE_DTAV: + break; + default: + gmx_incons("Unknown state entry encountered in dd_collect_state"); + } + } + } +} + +static void dd_realloc_state(t_state *state, rvec **f, int nalloc) +{ + int est; + + if (debug) + { + fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc)); + } + + state->nalloc = over_alloc_dd(nalloc); + + for (est = 0; est < estNR; est++) + { + if (EST_DISTR(est) && (state->flags & (1<x, state->nalloc); + break; + case estV: + srenew(state->v, state->nalloc); + break; + case estSDX: + srenew(state->sd_X, state->nalloc); + break; + case estCGP: + srenew(state->cg_p, state->nalloc); + break; + case estLD_RNG: + case estLD_RNGI: + case estDISRE_INITF: + case estDISRE_RM3TAV: + case estORIRE_INITF: + case estORIRE_DTAV: + /* No reallocation required */ + break; + default: + gmx_incons("Unknown state entry encountered in dd_realloc_state"); + } + } + } + + if (f != NULL) + { + srenew(*f, state->nalloc); + } +} + +static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f, + int nalloc) +{ + if (nalloc > fr->cg_nalloc) + { + if (debug) + { + fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc)); + } + fr->cg_nalloc = over_alloc_dd(nalloc); + srenew(fr->cginfo, fr->cg_nalloc); + if (fr->cutoff_scheme == ecutsGROUP) + { + srenew(fr->cg_cm, fr->cg_nalloc); + } + } + if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc) + { + /* We don't use charge groups, we use x in state to set up + * the atom communication. 
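+         * In other words, with the Verlet scheme the communicated coordinates
+         * live in state->x rather than in fr->cg_cm, so it is the state
+         * arrays that have to be grown, which dd_realloc_state() below does.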
+ */ + dd_realloc_state(state, f, nalloc); + } +} + +static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs, + rvec *v, rvec *lv) +{ + gmx_domdec_master_t *ma; + int n, i, c, a, nalloc = 0; + rvec *buf = NULL; + + if (DDMASTER(dd)) + { + ma = dd->ma; + + for (n = 0; n < dd->nnodes; n++) + { + if (n != dd->rank) + { + if (ma->nat[n] > nalloc) + { + nalloc = over_alloc_dd(ma->nat[n]); + srenew(buf, nalloc); + } + /* Use lv as a temporary buffer */ + a = 0; + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++) + { + copy_rvec(v[c], buf[a++]); + } + } + if (a != ma->nat[n]) + { + gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)", + a, ma->nat[n]); + } + +#ifdef GMX_MPI + MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, + DDRANK(dd, n), n, dd->mpi_comm_all); +#endif + } + } + sfree(buf); + n = DDMASTERRANK(dd); + a = 0; + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++) + { + copy_rvec(v[c], lv[a++]); + } + } + } + else + { +#ifdef GMX_MPI + MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd), + MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE); +#endif + } +} + +static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs, + rvec *v, rvec *lv) +{ + gmx_domdec_master_t *ma; + int *scounts = NULL, *disps = NULL; + int n, i, c, a, nalloc = 0; + rvec *buf = NULL; + + if (DDMASTER(dd)) + { + ma = dd->ma; + + get_commbuffer_counts(dd, &scounts, &disps); + + buf = ma->vbuf; + a = 0; + for (n = 0; n < dd->nnodes; n++) + { + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++) + { + copy_rvec(v[c], buf[a++]); + } + } + } + } + + dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv); +} + +static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv) +{ + if (dd->nnodes <= GMX_DD_NNODES_SENDRECV) + { + dd_distribute_vec_sendrecv(dd, cgs, v, lv); + } + else + { + dd_distribute_vec_scatterv(dd, cgs, v, lv); + } +} + +static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs, + t_state *state, t_state *state_local, + rvec **f) +{ + int i, j, nh; + + nh = state->nhchainlength; + + if (DDMASTER(dd)) + { + for (i = 0; i < efptNR; i++) + { + state_local->lambda[i] = state->lambda[i]; + } + state_local->fep_state = state->fep_state; + state_local->veta = state->veta; + state_local->vol0 = state->vol0; + copy_mat(state->box, state_local->box); + copy_mat(state->box_rel, state_local->box_rel); + copy_mat(state->boxv, state_local->boxv); + copy_mat(state->svir_prev, state_local->svir_prev); + copy_mat(state->fvir_prev, state_local->fvir_prev); + for (i = 0; i < state_local->ngtc; i++) + { + for (j = 0; j < nh; j++) + { + state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j]; + state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j]; + } + state_local->therm_integral[i] = state->therm_integral[i]; + } + for (i = 0; i < state_local->nnhpres; i++) + { + for (j = 0; j < nh; j++) + { + state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j]; + state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j]; + } + } + } + dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda); + dd_bcast(dd, sizeof(int), &state_local->fep_state); + dd_bcast(dd, sizeof(real), &state_local->veta); + dd_bcast(dd, sizeof(real), &state_local->vol0); + dd_bcast(dd, sizeof(state_local->box), state_local->box); + dd_bcast(dd, 
sizeof(state_local->box_rel), state_local->box_rel); + dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv); + dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev); + dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev); + dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi); + dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi); + dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral); + dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi); + dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi); + + if (dd->nat_home > state_local->nalloc) + { + dd_realloc_state(state_local, f, dd->nat_home); + } + for (i = 0; i < estNR; i++) + { + if (EST_DISTR(i) && (state_local->flags & (1<x, state_local->x); + break; + case estV: + dd_distribute_vec(dd, cgs, state->v, state_local->v); + break; + case estSDX: + dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X); + break; + case estCGP: + dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p); + break; + case estLD_RNG: + if (state->nrngi == 1) + { + dd_bcastc(dd, + state_local->nrng*sizeof(state_local->ld_rng[0]), + state->ld_rng, state_local->ld_rng); + } + else + { + dd_scatter(dd, + state_local->nrng*sizeof(state_local->ld_rng[0]), + state->ld_rng, state_local->ld_rng); + } + break; + case estLD_RNGI: + if (state->nrngi == 1) + { + dd_bcastc(dd, sizeof(state_local->ld_rngi[0]), + state->ld_rngi, state_local->ld_rngi); + } + else + { + dd_scatter(dd, sizeof(state_local->ld_rngi[0]), + state->ld_rngi, state_local->ld_rngi); + } + break; + case estDISRE_INITF: + case estDISRE_RM3TAV: + case estORIRE_INITF: + case estORIRE_DTAV: + /* Not implemented yet */ + break; + default: + gmx_incons("Unknown state entry encountered in dd_distribute_state"); + } + } + } +} + +static char dim2char(int dim) +{ + char c = '?'; + + switch (dim) + { + case XX: c = 'X'; break; + case YY: c = 'Y'; break; + case ZZ: c = 'Z'; break; + default: gmx_fatal(FARGS, "Unknown dim %d", dim); + } + + return c; +} + +static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step, + gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox) +{ + rvec grid_s[2], *grid_r = NULL, cx, r; + char fname[STRLEN], format[STRLEN], buf[22]; + FILE *out; + int a, i, d, z, y, x; + matrix tric; + real vol; + + copy_rvec(dd->comm->cell_x0, grid_s[0]); + copy_rvec(dd->comm->cell_x1, grid_s[1]); + + if (DDMASTER(dd)) + { + snew(grid_r, 2*dd->nnodes); + } + + dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL); + + if (DDMASTER(dd)) + { + for (d = 0; d < DIM; d++) + { + for (i = 0; i < DIM; i++) + { + if (d == i) + { + tric[d][i] = 1; + } + else + { + if (d < ddbox->npbcdim && dd->nc[d] > 1) + { + tric[d][i] = box[i][d]/box[i][i]; + } + else + { + tric[d][i] = 0; + } + } + } + } + sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf)); + sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f"); + out = gmx_fio_fopen(fname, "w"); + gmx_write_pdb_box(out, dd->bScrewPBC ? 
epbcSCREW : epbcXYZ, box); + a = 1; + for (i = 0; i < dd->nnodes; i++) + { + vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]); + for (d = 0; d < DIM; d++) + { + vol *= grid_r[i*2+1][d] - grid_r[i*2][d]; + } + for (z = 0; z < 2; z++) + { + for (y = 0; y < 2; y++) + { + for (x = 0; x < 2; x++) + { + cx[XX] = grid_r[i*2+x][XX]; + cx[YY] = grid_r[i*2+y][YY]; + cx[ZZ] = grid_r[i*2+z][ZZ]; + mvmul(tric, cx, r); + fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i, + ' ', 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol); + } + } + } + for (d = 0; d < DIM; d++) + { + for (x = 0; x < 4; x++) + { + switch (d) + { + case 0: y = 1 + i*8 + 2*x; break; + case 1: y = 1 + i*8 + 2*x - (x % 2); break; + case 2: y = 1 + i*8 + x; break; + } + fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<dd; + if (natoms == -1) + { + natoms = dd->comm->nat[ddnatVSITE]; + } + + sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid); + + sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f"); + sprintf(format4, "%s%s\n", get_pdbformat4(), "%6.2f%6.2f"); + + out = gmx_fio_fopen(fname, "w"); + + fprintf(out, "TITLE %s\n", title); + gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box); + for (i = 0; i < natoms; i++) + { + ii = dd->gatindex[i]; + gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname); + if (i < dd->comm->nat[ddnatZONE]) + { + c = 0; + while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]]) + { + c++; + } + b = c; + } + else if (i < dd->comm->nat[ddnatVSITE]) + { + b = dd->comm->zones.n; + } + else + { + b = dd->comm->zones.n + 1; + } + fprintf(out, strlen(atomname) < 4 ? format : format4, + "ATOM", (ii+1)%100000, + atomname, resname, ' ', resnr%10000, ' ', + 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b); + } + fprintf(out, "TER\n"); + + gmx_fio_fclose(out); +} + +real dd_cutoff_mbody(gmx_domdec_t *dd) +{ + gmx_domdec_comm_t *comm; + int di; + real r; + + comm = dd->comm; + + r = -1; + if (comm->bInterCGBondeds) + { + if (comm->cutoff_mbody > 0) + { + r = comm->cutoff_mbody; + } + else + { + /* cutoff_mbody=0 means we do not have DLB */ + r = comm->cellsize_min[dd->dim[0]]; + for (di = 1; di < dd->ndim; di++) + { + r = min(r, comm->cellsize_min[dd->dim[di]]); + } + if (comm->bBondComm) + { + r = max(r, comm->cutoff_mbody); + } + else + { + r = min(r, comm->cutoff); + } + } + } + + return r; +} + +real dd_cutoff_twobody(gmx_domdec_t *dd) +{ + real r_mb; + + r_mb = dd_cutoff_mbody(dd); + + return max(dd->comm->cutoff, r_mb); +} + + +static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme) +{ + int nc, ntot; + + nc = dd->nc[dd->comm->cartpmedim]; + ntot = dd->comm->ntot[dd->comm->cartpmedim]; + copy_ivec(coord, coord_pme); + coord_pme[dd->comm->cartpmedim] = + nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc; +} + +static int low_ddindex2pmeindex(int ndd, int npme, int ddindex) +{ + /* Here we assign a PME node to communicate with this DD node + * by assuming that the major index of both is x. + * We add cr->npmenodes/2 to obtain an even distribution. 
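+     * Worked example (hypothetical node counts): with ndd = 6 PP nodes and
+     * npme = 2 PME nodes the formula below gives (0*2+1)/6 = (1*2+1)/6 =
+     * (2*2+1)/6 = 0 and (3*2+1)/6 = (4*2+1)/6 = (5*2+1)/6 = 1, i.e. the
+     * first three DD nodes communicate with PME node 0 and the last three
+     * with PME node 1.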
+ */ + return (ddindex*npme + npme/2)/ndd; +} + +static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex) +{ + return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex); +} + +static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex) +{ + return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex); +} + +static int *dd_pmenodes(t_commrec *cr) +{ + int *pmenodes; + int n, i, p0, p1; + + snew(pmenodes, cr->npmenodes); + n = 0; + for (i = 0; i < cr->dd->nnodes; i++) + { + p0 = cr_ddindex2pmeindex(cr, i); + p1 = cr_ddindex2pmeindex(cr, i+1); + if (i+1 == cr->dd->nnodes || p1 > p0) + { + if (debug) + { + fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n); + } + pmenodes[n] = i + 1 + n; + n++; + } + } + + return pmenodes; +} + +static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z) +{ + gmx_domdec_t *dd; + ivec coords, coords_pme, nc; + int slab; + + dd = cr->dd; + /* + if (dd->comm->bCartesian) { + gmx_ddindex2xyz(dd->nc,ddindex,coords); + dd_coords2pmecoords(dd,coords,coords_pme); + copy_ivec(dd->ntot,nc); + nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim]; + coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim]; + + slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ]; + } else { + slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes; + } + */ + coords[XX] = x; + coords[YY] = y; + coords[ZZ] = z; + slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords)); + + return slab; +} + +static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z) +{ + gmx_domdec_comm_t *comm; + ivec coords; + int ddindex, nodeid = -1; + + comm = cr->dd->comm; + + coords[XX] = x; + coords[YY] = y; + coords[ZZ] = z; + if (comm->bCartesianPP_PME) + { +#ifdef GMX_MPI + MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid); +#endif + } + else + { + ddindex = dd_index(cr->dd->nc, coords); + if (comm->bCartesianPP) + { + nodeid = comm->ddindex2simnodeid[ddindex]; + } + else + { + if (comm->pmenodes) + { + nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z); + } + else + { + nodeid = ddindex; + } + } + } + + return nodeid; +} + +static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + ivec coord, coord_pme; + int i; + int pmenode = -1; + + dd = cr->dd; + comm = dd->comm; + + /* This assumes a uniform x domain decomposition grid cell size */ + if (comm->bCartesianPP_PME) + { +#ifdef GMX_MPI + MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord); + if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim]) + { + /* This is a PP node */ + dd_cart_coord2pmecoord(dd, coord, coord_pme); + MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode); + } +#endif + } + else if (comm->bCartesianPP) + { + if (sim_nodeid < dd->nnodes) + { + pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid); + } + } + else + { + /* This assumes DD cells with identical x coordinates + * are numbered sequentially. 
+ */ + if (dd->comm->pmenodes == NULL) + { + if (sim_nodeid < dd->nnodes) + { + /* The DD index equals the nodeid */ + pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid); + } + } + else + { + i = 0; + while (sim_nodeid > dd->comm->pmenodes[i]) + { + i++; + } + if (sim_nodeid < dd->comm->pmenodes[i]) + { + pmenode = dd->comm->pmenodes[i]; + } + } + } + + return pmenode; +} + +void get_pme_nnodes(const gmx_domdec_t *dd, + int *npmenodes_x, int *npmenodes_y) +{ + if (dd != NULL) + { + *npmenodes_x = dd->comm->npmenodes_x; + *npmenodes_y = dd->comm->npmenodes_y; + } + else + { + *npmenodes_x = 1; + *npmenodes_y = 1; + } +} + +gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid) +{ + gmx_bool bPMEOnlyNode; + + if (DOMAINDECOMP(cr)) + { + bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1); + } + else + { + bPMEOnlyNode = FALSE; + } + + return bPMEOnlyNode; +} + +void get_pme_ddnodes(t_commrec *cr, int pmenodeid, + int *nmy_ddnodes, int **my_ddnodes, int *node_peer) +{ + gmx_domdec_t *dd; + int x, y, z; + ivec coord, coord_pme; + + dd = cr->dd; + + snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes); + + *nmy_ddnodes = 0; + for (x = 0; x < dd->nc[XX]; x++) + { + for (y = 0; y < dd->nc[YY]; y++) + { + for (z = 0; z < dd->nc[ZZ]; z++) + { + if (dd->comm->bCartesianPP_PME) + { + coord[XX] = x; + coord[YY] = y; + coord[ZZ] = z; + dd_cart_coord2pmecoord(dd, coord, coord_pme); + if (dd->ci[XX] == coord_pme[XX] && + dd->ci[YY] == coord_pme[YY] && + dd->ci[ZZ] == coord_pme[ZZ]) + { + (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z); + } + } + else + { + /* The slab corresponds to the nodeid in the PME group */ + if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid) + { + (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z); + } + } + } + } + } + + /* The last PP-only node is the peer node */ + *node_peer = (*my_ddnodes)[*nmy_ddnodes-1]; + + if (debug) + { + fprintf(debug, "Receive coordinates from PP nodes:"); + for (x = 0; x < *nmy_ddnodes; x++) + { + fprintf(debug, " %d", (*my_ddnodes)[x]); + } + fprintf(debug, "\n"); + } +} + +static gmx_bool receive_vir_ener(t_commrec *cr) +{ + gmx_domdec_comm_t *comm; + int pmenode, coords[DIM], rank; + gmx_bool bReceive; + + bReceive = TRUE; + if (cr->npmenodes < cr->dd->nnodes) + { + comm = cr->dd->comm; + if (comm->bCartesianPP_PME) + { + pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid); +#ifdef GMX_MPI + MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords); + coords[comm->cartpmedim]++; + if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim]) + { + MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank); + if (dd_simnode2pmenode(cr, rank) == pmenode) + { + /* This is not the last PP node for pmenode */ + bReceive = FALSE; + } + } +#endif + } + else + { + pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid); + if (cr->sim_nodeid+1 < cr->nnodes && + dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode) + { + /* This is not the last PP node for pmenode */ + bReceive = FALSE; + } + } + } + + return bReceive; +} + +static void set_zones_ncg_home(gmx_domdec_t *dd) +{ + gmx_domdec_zones_t *zones; + int i; + + zones = &dd->comm->zones; + + zones->cg_range[0] = 0; + for (i = 1; i < zones->n+1; i++) + { + zones->cg_range[i] = dd->ncg_home; + } + /* zone_ncg1[0] should always be equal to ncg_home */ + dd->comm->zone_ncg1[0] = dd->ncg_home; +} + +static void rebuild_cgindex(gmx_domdec_t *dd, + const int *gcgs_index, t_state *state) +{ + int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl; + + ind = state->cg_gl; + dd_cg_gl = 
dd->index_gl; + cgindex = dd->cgindex; + nat = 0; + cgindex[0] = nat; + for (i = 0; i < state->ncg_gl; i++) + { + cgindex[i] = nat; + cg_gl = ind[i]; + dd_cg_gl[i] = cg_gl; + nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl]; + } + cgindex[i] = nat; + + dd->ncg_home = state->ncg_gl; + dd->nat_home = nat; + + set_zones_ncg_home(dd); +} + +static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg) +{ + while (cg >= cginfo_mb->cg_end) + { + cginfo_mb++; + } + + return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod]; +} + +static void dd_set_cginfo(int *index_gl, int cg0, int cg1, + t_forcerec *fr, char *bLocalCG) +{ + cginfo_mb_t *cginfo_mb; + int *cginfo; + int cg; + + if (fr != NULL) + { + cginfo_mb = fr->cginfo_mb; + cginfo = fr->cginfo; + + for (cg = cg0; cg < cg1; cg++) + { + cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]); + } + } + + if (bLocalCG != NULL) + { + for (cg = cg0; cg < cg1; cg++) + { + bLocalCG[index_gl[cg]] = TRUE; + } + } +} + +static void make_dd_indices(gmx_domdec_t *dd, + const int *gcgs_index, int cg_start) +{ + int nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl; + int *zone2cg, *zone_ncg1, *index_gl, *gatindex; + gmx_ga2la_t *ga2la; + char *bLocalCG; + gmx_bool bCGs; + + bLocalCG = dd->comm->bLocalCG; + + if (dd->nat_tot > dd->gatindex_nalloc) + { + dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot); + srenew(dd->gatindex, dd->gatindex_nalloc); + } + + nzone = dd->comm->zones.n; + zone2cg = dd->comm->zones.cg_range; + zone_ncg1 = dd->comm->zone_ncg1; + index_gl = dd->index_gl; + gatindex = dd->gatindex; + bCGs = dd->comm->bCGs; + + if (zone2cg[1] != dd->ncg_home) + { + gmx_incons("dd->ncg_zone is not up to date"); + } + + /* Make the local to global and global to local atom index */ + a = dd->cgindex[cg_start]; + for (zone = 0; zone < nzone; zone++) + { + if (zone == 0) + { + cg0 = cg_start; + } + else + { + cg0 = zone2cg[zone]; + } + cg1 = zone2cg[zone+1]; + cg1_p1 = cg0 + zone_ncg1[zone]; + + for (cg = cg0; cg < cg1; cg++) + { + zone1 = zone; + if (cg >= cg1_p1) + { + /* Signal that this cg is from more than one pulse away */ + zone1 += nzone; + } + cg_gl = index_gl[cg]; + if (bCGs) + { + for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++) + { + gatindex[a] = a_gl; + ga2la_set(dd->ga2la, a_gl, a, zone1); + a++; + } + } + else + { + gatindex[a] = cg_gl; + ga2la_set(dd->ga2la, cg_gl, a, zone1); + a++; + } + } + } +} + +static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG, + const char *where) +{ + int ncg, i, ngl, nerr; + + nerr = 0; + if (bLocalCG == NULL) + { + return nerr; + } + for (i = 0; i < dd->ncg_tot; i++) + { + if (!bLocalCG[dd->index_gl[i]]) + { + fprintf(stderr, + "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home); + nerr++; + } + } + ngl = 0; + for (i = 0; i < ncg_sys; i++) + { + if (bLocalCG[i]) + { + ngl++; + } + } + if (ngl != dd->ncg_tot) + { + fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot); + nerr++; + } + + return nerr; +} + +static void check_index_consistency(gmx_domdec_t *dd, + int natoms_sys, int ncg_sys, + const char *where) +{ + int nerr, ngl, i, a, cell; + int *have; + + nerr = 0; + + if (dd->comm->DD_debug > 1) + { + snew(have, natoms_sys); + for (a = 0; a < dd->nat_tot; a++) + { + if (have[dd->gatindex[a]] > 0) + { + fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, 
dd->gatindex[a]+1, have[dd->gatindex[a]], a+1); + } + else + { + have[dd->gatindex[a]] = a + 1; + } + } + sfree(have); + } + + snew(have, dd->nat_tot); + + ngl = 0; + for (i = 0; i < natoms_sys; i++) + { + if (ga2la_get(dd->ga2la, i, &a, &cell)) + { + if (a >= dd->nat_tot) + { + fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot); + nerr++; + } + else + { + have[a] = 1; + if (dd->gatindex[a] != i) + { + fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1); + nerr++; + } + } + ngl++; + } + } + if (ngl != dd->nat_tot) + { + fprintf(stderr, + "DD node %d, %s: %d global atom indices, %d local atoms\n", + dd->rank, where, ngl, dd->nat_tot); + } + for (a = 0; a < dd->nat_tot; a++) + { + if (have[a] == 0) + { + fprintf(stderr, + "DD node %d, %s: local atom %d, global %d has no global index\n", + dd->rank, where, a+1, dd->gatindex[a]+1); + } + } + sfree(have); + + nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where); + + if (nerr > 0) + { + gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies", + dd->rank, where, nerr); + } +} + +static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start) +{ + int i; + char *bLocalCG; + + if (a_start == 0) + { + /* Clear the whole list without searching */ + ga2la_clear(dd->ga2la); + } + else + { + for (i = a_start; i < dd->nat_tot; i++) + { + ga2la_del(dd->ga2la, dd->gatindex[i]); + } + } + + bLocalCG = dd->comm->bLocalCG; + if (bLocalCG) + { + for (i = cg_start; i < dd->ncg_tot; i++) + { + bLocalCG[dd->index_gl[i]] = FALSE; + } + } + + dd_clear_local_vsite_indices(dd); + + if (dd->constraints) + { + dd_clear_local_constraint_indices(dd); + } +} + +/* This function should be used for moving the domain boudaries during DLB, + * for obtaining the minimum cell size. It checks the initially set limit + * comm->cellsize_min, for bonded and initial non-bonded cut-offs, + * and, possibly, a longer cut-off limit set for PME load balancing. + */ +static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim) +{ + real cellsize_min; + + cellsize_min = comm->cellsize_min[dim]; + + if (!comm->bVacDLBNoLimit) + { + /* The cut-off might have changed, e.g. by PME load balacning, + * from the value used to set comm->cellsize_min, so check it. + */ + cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb); + + if (comm->bPMELoadBalDLBLimits) + { + /* Check for the cut-off limit set by the PME load balancing */ + cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb); + } + } + + return cellsize_min; +} + +static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff, + int dim_ind) +{ + real grid_jump_limit; + + /* The distance between the boundaries of cells at distance + * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions + * and by the fact that cells should not be shifted by more than + * half their size, such that cg's only shift by one cell + * at redecomposition. 
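+     * Unless bVacDLBNoLimit applies, the value computed below is
+     * max(cellsize_limit, cutoff/np) for this dimension (with the cut-off
+     * possibly raised to the PME load balancing limit). For example, a
+     * hypothetical 1.2 nm cut-off communicated in np = 2 pulses contributes
+     * a lower bound of 0.6 nm to the allowed boundary jump.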
+ */ + grid_jump_limit = comm->cellsize_limit; + if (!comm->bVacDLBNoLimit) + { + if (comm->bPMELoadBalDLBLimits) + { + cutoff = max(cutoff, comm->PMELoadBal_max_cutoff); + } + grid_jump_limit = max(grid_jump_limit, + cutoff/comm->cd[dim_ind].np); + } + + return grid_jump_limit; +} + +static gmx_bool check_grid_jump(gmx_large_int_t step, + gmx_domdec_t *dd, + real cutoff, + gmx_ddbox_t *ddbox, + gmx_bool bFatal) +{ + gmx_domdec_comm_t *comm; + int d, dim; + real limit, bfac; + gmx_bool bInvalid; + + bInvalid = FALSE; + + comm = dd->comm; + + for (d = 1; d < dd->ndim; d++) + { + dim = dd->dim[d]; + limit = grid_jump_limit(comm, cutoff, d); + bfac = ddbox->box_size[dim]; + if (ddbox->tric_dir[dim]) + { + bfac *= ddbox->skew_fac[dim]; + } + if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit || + (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit) + { + bInvalid = TRUE; + + if (bFatal) + { + char buf[22]; + + /* This error should never be triggered under normal + * circumstances, but you never know ... + */ + gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.", + gmx_step_str(step, buf), + dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + } + } + + return bInvalid; +} + +static int dd_load_count(gmx_domdec_comm_t *comm) +{ + return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]); +} + +static float dd_force_load(gmx_domdec_comm_t *comm) +{ + float load; + + if (comm->eFlop) + { + load = comm->flop; + if (comm->eFlop > 1) + { + load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05); + } + } + else + { + load = comm->cycl[ddCyclF]; + if (comm->cycl_n[ddCyclF] > 1) + { + /* Subtract the maximum of the last n cycle counts + * to get rid of possible high counts due to other soures, + * for instance system activity, that would otherwise + * affect the dynamic load balancing. + */ + load -= comm->cycl_max[ddCyclF]; + } + } + + return load; +} + +static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f) +{ + gmx_domdec_comm_t *comm; + int i; + + comm = dd->comm; + + snew(*dim_f, dd->nc[dim]+1); + (*dim_f)[0] = 0; + for (i = 1; i < dd->nc[dim]; i++) + { + if (comm->slb_frac[dim]) + { + (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1]; + } + else + { + (*dim_f)[i] = (real)i/(real)dd->nc[dim]; + } + } + (*dim_f)[dd->nc[dim]] = 1; +} + +static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind) +{ + int pmeindex, slab, nso, i; + ivec xyz; + + if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1) + { + ddpme->dim = YY; + } + else + { + ddpme->dim = dimind; + } + ddpme->dim_match = (ddpme->dim == dd->dim[dimind]); + + ddpme->nslab = (ddpme->dim == 0 ? + dd->comm->npmenodes_x : + dd->comm->npmenodes_y); + + if (ddpme->nslab <= 1) + { + return; + } + + nso = dd->comm->npmenodes/ddpme->nslab; + /* Determine for each PME slab the PP location range for dimension dim */ + snew(ddpme->pp_min, ddpme->nslab); + snew(ddpme->pp_max, ddpme->nslab); + for (slab = 0; slab < ddpme->nslab; slab++) + { + ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1; + ddpme->pp_max[slab] = 0; + } + for (i = 0; i < dd->nnodes; i++) + { + ddindex2xyz(dd->nc, i, xyz); + /* For y only use our y/z slab. + * This assumes that the PME x grid size matches the DD grid size. 
+ */ + if (dimind == 0 || xyz[XX] == dd->ci[XX]) + { + pmeindex = ddindex2pmeindex(dd, i); + if (dimind == 0) + { + slab = pmeindex/nso; + } + else + { + slab = pmeindex % ddpme->nslab; + } + ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]); + ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]); + } + } + + set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f); +} + +int dd_pme_maxshift_x(gmx_domdec_t *dd) +{ + if (dd->comm->ddpme[0].dim == XX) + { + return dd->comm->ddpme[0].maxshift; + } + else + { + return 0; + } +} + +int dd_pme_maxshift_y(gmx_domdec_t *dd) +{ + if (dd->comm->ddpme[0].dim == YY) + { + return dd->comm->ddpme[0].maxshift; + } + else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY) + { + return dd->comm->ddpme[1].maxshift; + } + else + { + return 0; + } +} + +static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, + gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f) +{ + gmx_domdec_comm_t *comm; + int nc, ns, s; + int *xmin, *xmax; + real range, pme_boundary; + int sh; + + comm = dd->comm; + nc = dd->nc[ddpme->dim]; + ns = ddpme->nslab; + + if (!ddpme->dim_match) + { + /* PP decomposition is not along dim: the worst situation */ + sh = ns/2; + } + else if (ns <= 3 || (bUniform && ns == nc)) + { + /* The optimal situation */ + sh = 1; + } + else + { + /* We need to check for all pme nodes which nodes they + * could possibly need to communicate with. + */ + xmin = ddpme->pp_min; + xmax = ddpme->pp_max; + /* Allow for atoms to be maximally 2/3 times the cut-off + * out of their DD cell. This is a reasonable balance between + * between performance and support for most charge-group/cut-off + * combinations. + */ + range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim]; + /* Avoid extra communication when we are exactly at a boundary */ + range *= 0.999; + + sh = 1; + for (s = 0; s < ns; s++) + { + /* PME slab s spreads atoms between box frac. 
s/ns and (s+1)/ns */ + pme_boundary = (real)s/ns; + while (sh+1 < ns && + ((s-(sh+1) >= 0 && + cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) || + (s-(sh+1) < 0 && + cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary))) + { + sh++; + } + pme_boundary = (real)(s+1)/ns; + while (sh+1 < ns && + ((s+(sh+1) < ns && + cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) || + (s+(sh+1) >= ns && + cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary))) + { + sh++; + } + } + } + + ddpme->maxshift = sh; + + if (debug) + { + fprintf(debug, "PME slab communication range for dim %d is %d\n", + ddpme->dim, ddpme->maxshift); + } +} + +static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox) +{ + int d, dim; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + if (dim < ddbox->nboundeddim && + ddbox->box_size[dim]*ddbox->skew_fac[dim] < + dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN) + { + gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n", + dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim], + dd->nc[dim], dd->comm->cellsize_limit); + } + } +} + +static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox, + gmx_bool bMaster, ivec npulse) +{ + gmx_domdec_comm_t *comm; + int d, j; + rvec cellsize_min; + real *cell_x, cell_dx, cellsize; + + comm = dd->comm; + + for (d = 0; d < DIM; d++) + { + cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d]; + npulse[d] = 1; + if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL) + { + /* Uniform grid */ + cell_dx = ddbox->box_size[d]/dd->nc[d]; + if (bMaster) + { + for (j = 0; j < dd->nc[d]+1; j++) + { + dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx; + } + } + else + { + comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx; + comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx; + } + cellsize = cell_dx*ddbox->skew_fac[d]; + while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1) + { + npulse[d]++; + } + cellsize_min[d] = cellsize; + } + else + { + /* Statically load balanced grid */ + /* Also when we are not doing a master distribution we determine + * all cell borders in a loop to obtain identical values + * to the master distribution case and to determine npulse. + */ + if (bMaster) + { + cell_x = dd->ma->cell_x[d]; + } + else + { + snew(cell_x, dd->nc[d]+1); + } + cell_x[0] = ddbox->box0[d]; + for (j = 0; j < dd->nc[d]; j++) + { + cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j]; + cell_x[j+1] = cell_x[j] + cell_dx; + cellsize = cell_dx*ddbox->skew_fac[d]; + while (cellsize*npulse[d] < comm->cutoff && + npulse[d] < dd->nc[d]-1) + { + npulse[d]++; + } + cellsize_min[d] = min(cellsize_min[d], cellsize); + } + if (!bMaster) + { + comm->cell_x0[d] = cell_x[dd->ci[d]]; + comm->cell_x1[d] = cell_x[dd->ci[d]+1]; + sfree(cell_x); + } + } + /* The following limitation is to avoid that a cell would receive + * some of its own home charge groups back over the periodic boundary. + * Double charge groups cause trouble with the global indices. + */ + if (d < ddbox->npbcdim && + dd->nc[d] > 1 && npulse[d] >= dd->nc[d]) + { + gmx_fatal_collective(FARGS, NULL, dd, + "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction", + dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d], + comm->cutoff, + dd->nc[d], dd->nc[d], + dd->nnodes > dd->nc[d] ? 
"cells" : "processors"); + } + } + + if (!comm->bDynLoadBal) + { + copy_rvec(cellsize_min, comm->cellsize_min); + } + + for (d = 0; d < comm->npmedecompdim; d++) + { + set_pme_maxshift(dd, &comm->ddpme[d], + comm->slb_frac[dd->dim[d]] == NULL, ddbox, + comm->ddpme[d].slb_dim_f); + } +} + + +static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd, + int d, int dim, gmx_domdec_root_t *root, + gmx_ddbox_t *ddbox, + gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[]) +{ + gmx_domdec_comm_t *comm; + int ncd, i, j, nmin, nmin_old; + gmx_bool bLimLo, bLimHi; + real *cell_size; + real fac, halfway, cellsize_limit_f_i, region_size; + gmx_bool bPBC, bLastHi = FALSE; + int nrange[] = {range[0], range[1]}; + + region_size = root->cell_f[range[1]]-root->cell_f[range[0]]; + + comm = dd->comm; + + ncd = dd->nc[dim]; + + bPBC = (dim < ddbox->npbcdim); + + cell_size = root->buf_ncd; + + if (debug) + { + fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]); + } + + /* First we need to check if the scaling does not make cells + * smaller than the smallest allowed size. + * We need to do this iteratively, since if a cell is too small, + * it needs to be enlarged, which makes all the other cells smaller, + * which could in turn make another cell smaller than allowed. + */ + for (i = range[0]; i < range[1]; i++) + { + root->bCellMin[i] = FALSE; + } + nmin = 0; + do + { + nmin_old = nmin; + /* We need the total for normalization */ + fac = 0; + for (i = range[0]; i < range[1]; i++) + { + if (root->bCellMin[i] == FALSE) + { + fac += cell_size[i]; + } + } + fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */ + /* Determine the cell boundaries */ + for (i = range[0]; i < range[1]; i++) + { + if (root->bCellMin[i] == FALSE) + { + cell_size[i] *= fac; + if (!bPBC && (i == 0 || i == dd->nc[dim] -1)) + { + cellsize_limit_f_i = 0; + } + else + { + cellsize_limit_f_i = cellsize_limit_f; + } + if (cell_size[i] < cellsize_limit_f_i) + { + root->bCellMin[i] = TRUE; + cell_size[i] = cellsize_limit_f_i; + nmin++; + } + } + root->cell_f[i+1] = root->cell_f[i] + cell_size[i]; + } + } + while (nmin > nmin_old); + + i = range[1]-1; + cell_size[i] = root->cell_f[i+1] - root->cell_f[i]; + /* For this check we should not use DD_CELL_MARGIN, + * but a slightly smaller factor, + * since rounding could get use below the limit. + */ + if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN) + { + char buf[22]; + gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n", + gmx_step_str(step, buf), + dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim], + ncd, comm->cellsize_min[dim]); + } + + root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd); + + if (!bUniform) + { + /* Check if the boundary did not displace more than halfway + * each of the cells it bounds, as this could cause problems, + * especially when the differences between cell sizes are large. + * If changes are applied, they will not make cells smaller + * than the cut-off, as we check all the boundaries which + * might be affected by a change and if the old state was ok, + * the cells will at most be shrunk back to their old size. 
+ */ + for (i = range[0]+1; i < range[1]; i++) + { + halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]); + if (root->cell_f[i] < halfway) + { + root->cell_f[i] = halfway; + /* Check if the change also causes shifts of the next boundaries */ + for (j = i+1; j < range[1]; j++) + { + if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f) + { + root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f; + } + } + } + halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]); + if (root->cell_f[i] > halfway) + { + root->cell_f[i] = halfway; + /* Check if the change also causes shifts of the next boundaries */ + for (j = i-1; j >= range[0]+1; j--) + { + if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f) + { + root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f; + } + } + } + } + } + + /* nrange is defined as [lower, upper) range for new call to enforce_limits */ + /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b) + * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries. + * for a and b nrange is used */ + if (d > 0) + { + /* Take care of the staggering of the cell boundaries */ + if (bUniform) + { + for (i = range[0]; i < range[1]; i++) + { + root->cell_f_max0[i] = root->cell_f[i]; + root->cell_f_min1[i] = root->cell_f[i+1]; + } + } + else + { + for (i = range[0]+1; i < range[1]; i++) + { + bLimLo = (root->cell_f[i] < root->bound_min[i]); + bLimHi = (root->cell_f[i] > root->bound_max[i]); + if (bLimLo && bLimHi) + { + /* Both limits violated, try the best we can */ + /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */ + root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]); + nrange[0] = range[0]; + nrange[1] = i; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + + nrange[0] = i; + nrange[1] = range[1]; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + + return; + } + else if (bLimLo) + { + /* root->cell_f[i] = root->bound_min[i]; */ + nrange[1] = i; /* only store violation location. 
There could be a LimLo violation following with an higher index */ + bLastHi = FALSE; + } + else if (bLimHi && !bLastHi) + { + bLastHi = TRUE; + if (nrange[1] < range[1]) /* found a LimLo before */ + { + root->cell_f[nrange[1]] = root->bound_min[nrange[1]]; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + nrange[0] = nrange[1]; + } + root->cell_f[i] = root->bound_max[i]; + nrange[1] = i; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + nrange[0] = i; + nrange[1] = range[1]; + } + } + if (nrange[1] < range[1]) /* found last a LimLo */ + { + root->cell_f[nrange[1]] = root->bound_min[nrange[1]]; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + nrange[0] = nrange[1]; + nrange[1] = range[1]; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + } + else if (nrange[0] > range[0]) /* found at least one LimHi */ + { + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + } + } + } +} + + +static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd, + int d, int dim, gmx_domdec_root_t *root, + gmx_ddbox_t *ddbox, gmx_bool bDynamicBox, + gmx_bool bUniform, gmx_large_int_t step) +{ + gmx_domdec_comm_t *comm; + int ncd, d1, i, j, pos; + real *cell_size; + real load_aver, load_i, imbalance, change, change_max, sc; + real cellsize_limit_f, dist_min_f, dist_min_f_hard, space; + real change_limit; + real relax = 0.5; + gmx_bool bPBC; + int range[] = { 0, 0 }; + + comm = dd->comm; + + /* Convert the maximum change from the input percentage to a fraction */ + change_limit = comm->dlb_scale_lim*0.01; + + ncd = dd->nc[dim]; + + bPBC = (dim < ddbox->npbcdim); + + cell_size = root->buf_ncd; + + /* Store the original boundaries */ + for (i = 0; i < ncd+1; i++) + { + root->old_cell_f[i] = root->cell_f[i]; + } + if (bUniform) + { + for (i = 0; i < ncd; i++) + { + cell_size[i] = 1.0/ncd; + } + } + else if (dd_load_count(comm)) + { + load_aver = comm->load[d].sum_m/ncd; + change_max = 0; + for (i = 0; i < ncd; i++) + { + /* Determine the relative imbalance of cell i */ + load_i = comm->load[d].load[i*comm->load[d].nload+2]; + imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1); + /* Determine the change of the cell size using underrelaxation */ + change = -relax*imbalance; + change_max = max(change_max, max(change, -change)); + } + /* Limit the amount of scaling. + * We need to use the same rescaling for all cells in one row, + * otherwise the load balancing might not converge. + */ + sc = relax; + if (change_max > change_limit) + { + sc *= change_limit/change_max; + } + for (i = 0; i < ncd; i++) + { + /* Determine the relative imbalance of cell i */ + load_i = comm->load[d].load[i*comm->load[d].nload+2]; + imbalance = (load_i - load_aver)/(load_aver > 0 ? 
load_aver : 1); + /* Determine the change of the cell size using underrelaxation */ + change = -sc*imbalance; + cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change); + } + } + + cellsize_limit_f = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim]; + cellsize_limit_f *= DD_CELL_MARGIN; + dist_min_f_hard = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim]; + dist_min_f = dist_min_f_hard * DD_CELL_MARGIN; + if (ddbox->tric_dir[dim]) + { + cellsize_limit_f /= ddbox->skew_fac[dim]; + dist_min_f /= ddbox->skew_fac[dim]; + } + if (bDynamicBox && d > 0) + { + dist_min_f *= DD_PRES_SCALE_MARGIN; + } + if (d > 0 && !bUniform) + { + /* Make sure that the grid is not shifted too much */ + for (i = 1; i < ncd; i++) + { + if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) + { + gmx_incons("Inconsistent DD boundary staggering limits!"); + } + root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f; + space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f); + if (space > 0) + { + root->bound_min[i] += 0.5*space; + } + root->bound_max[i] = root->cell_f_min1[i] - dist_min_f; + space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f); + if (space < 0) + { + root->bound_max[i] += 0.5*space; + } + if (debug) + { + fprintf(debug, + "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n", + d, i, + root->cell_f_max0[i-1] + dist_min_f, + root->bound_min[i], root->cell_f[i], root->bound_max[i], + root->cell_f_min1[i] - dist_min_f); + } + } + } + range[1] = ncd; + root->cell_f[0] = 0; + root->cell_f[ncd] = 1; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range); + + + /* After the checks above, the cells should obey the cut-off + * restrictions, but it does not hurt to check. + */ + for (i = 0; i < ncd; i++) + { + if (debug) + { + fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n", + dim, i, root->cell_f[i], root->cell_f[i+1]); + } + + if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) && + root->cell_f[i+1] - root->cell_f[i] < + cellsize_limit_f/DD_CELL_MARGIN) + { + char buf[22]; + fprintf(stderr, + "\nWARNING step %s: direction %c, cell %d too small: %f\n", + gmx_step_str(step, buf), dim2char(dim), i, + (root->cell_f[i+1] - root->cell_f[i]) + *ddbox->box_size[dim]*ddbox->skew_fac[dim]); + } + } + + pos = ncd + 1; + /* Store the cell boundaries of the lower dimensions at the end */ + for (d1 = 0; d1 < d; d1++) + { + root->cell_f[pos++] = comm->cell_f0[d1]; + root->cell_f[pos++] = comm->cell_f1[d1]; + } + + if (d < comm->npmedecompdim) + { + /* The master determines the maximum shift for + * the coordinate communication between separate PME nodes. 
+ */ + set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f); + } + root->cell_f[pos++] = comm->ddpme[0].maxshift; + if (d >= 1) + { + root->cell_f[pos++] = comm->ddpme[1].maxshift; + } +} + +static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, int dimind) +{ + gmx_domdec_comm_t *comm; + int dim; + + comm = dd->comm; + + /* Set the cell dimensions */ + dim = dd->dim[dimind]; + comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim]; + comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim]; + if (dim >= ddbox->nboundeddim) + { + comm->cell_x0[dim] += ddbox->box0[dim]; + comm->cell_x1[dim] += ddbox->box0[dim]; + } +} + +static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd, + int d, int dim, real *cell_f_row, + gmx_ddbox_t *ddbox) +{ + gmx_domdec_comm_t *comm; + int d1, dim1, pos; + + comm = dd->comm; + +#ifdef GMX_MPI + /* Each node would only need to know two fractions, + * but it is probably cheaper to broadcast the whole array. + */ + MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE, + 0, comm->mpi_comm_load[d]); +#endif + /* Copy the fractions for this dimension from the buffer */ + comm->cell_f0[d] = cell_f_row[dd->ci[dim] ]; + comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1]; + /* The whole array was communicated, so set the buffer position */ + pos = dd->nc[dim] + 1; + for (d1 = 0; d1 <= d; d1++) + { + if (d1 < d) + { + /* Copy the cell fractions of the lower dimensions */ + comm->cell_f0[d1] = cell_f_row[pos++]; + comm->cell_f1[d1] = cell_f_row[pos++]; + } + relative_to_absolute_cell_bounds(dd, ddbox, d1); + } + /* Convert the communicated shift from float to int */ + comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5); + if (d >= 1) + { + comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5); + } +} + +static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, gmx_bool bDynamicBox, + gmx_bool bUniform, gmx_large_int_t step) +{ + gmx_domdec_comm_t *comm; + int d, dim, d1; + gmx_bool bRowMember, bRowRoot; + real *cell_f_row; + + comm = dd->comm; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + bRowMember = TRUE; + bRowRoot = TRUE; + for (d1 = d; d1 < dd->ndim; d1++) + { + if (dd->ci[dd->dim[d1]] > 0) + { + if (d1 > d) + { + bRowMember = FALSE; + } + bRowRoot = FALSE; + } + } + if (bRowMember) + { + if (bRowRoot) + { + set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d], + ddbox, bDynamicBox, bUniform, step); + cell_f_row = comm->root[d]->cell_f; + } + else + { + cell_f_row = comm->cell_f_row; + } + distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox); + } + } +} + +static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox) +{ + int d; + + /* This function assumes the box is static and should therefore + * not be called when the box has changed since the last + * call to dd_partition_system. 
+ */ + for (d = 0; d < dd->ndim; d++) + { + relative_to_absolute_cell_bounds(dd, ddbox, d); + } +} + + + +static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, gmx_bool bDynamicBox, + gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step, + gmx_wallcycle_t wcycle) +{ + gmx_domdec_comm_t *comm; + int dim; + + comm = dd->comm; + + if (bDoDLB) + { + wallcycle_start(wcycle, ewcDDCOMMBOUND); + set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step); + wallcycle_stop(wcycle, ewcDDCOMMBOUND); + } + else if (bDynamicBox) + { + set_dd_cell_sizes_dlb_nochange(dd, ddbox); + } + + /* Set the dimensions for which no DD is used */ + for (dim = 0; dim < DIM; dim++) + { + if (dd->nc[dim] == 1) + { + comm->cell_x0[dim] = 0; + comm->cell_x1[dim] = ddbox->box_size[dim]; + if (dim >= ddbox->nboundeddim) + { + comm->cell_x0[dim] += ddbox->box0[dim]; + comm->cell_x1[dim] += ddbox->box0[dim]; + } + } + } +} + +static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse) +{ + int d, np, i; + gmx_domdec_comm_dim_t *cd; + + for (d = 0; d < dd->ndim; d++) + { + cd = &dd->comm->cd[d]; + np = npulse[dd->dim[d]]; + if (np > cd->np_nalloc) + { + if (debug) + { + fprintf(debug, "(Re)allocing cd for %c to %d pulses\n", + dim2char(dd->dim[d]), np); + } + if (DDMASTER(dd) && cd->np_nalloc > 0) + { + fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np); + } + srenew(cd->ind, np); + for (i = cd->np_nalloc; i < np; i++) + { + cd->ind[i].index = NULL; + cd->ind[i].nalloc = 0; + } + cd->np_nalloc = np; + } + cd->np = np; + } +} + + +static void set_dd_cell_sizes(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, gmx_bool bDynamicBox, + gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step, + gmx_wallcycle_t wcycle) +{ + gmx_domdec_comm_t *comm; + int d; + ivec npulse; + + comm = dd->comm; + + /* Copy the old cell boundaries for the cg displacement check */ + copy_rvec(comm->cell_x0, comm->old_cell_x0); + copy_rvec(comm->cell_x1, comm->old_cell_x1); + + if (comm->bDynLoadBal) + { + if (DDMASTER(dd)) + { + check_box_size(dd, ddbox); + } + set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle); + } + else + { + set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse); + realloc_comm_ind(dd, npulse); + } + + if (debug) + { + for (d = 0; d < DIM; d++) + { + fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n", + d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]); + } + } +} + +static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, + rvec cell_ns_x0, rvec cell_ns_x1, + gmx_large_int_t step) +{ + gmx_domdec_comm_t *comm; + int dim_ind, dim; + + comm = dd->comm; + + for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++) + { + dim = dd->dim[dim_ind]; + + /* Without PBC we don't have restrictions on the outer cells */ + if (!(dim >= ddbox->npbcdim && + (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) && + comm->bDynLoadBal && + (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] < + comm->cellsize_min[dim]) + { + char buf[22]; + gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d", + gmx_step_str(step, buf), dim2char(dim), + comm->cell_x1[dim] - comm->cell_x0[dim], + ddbox->skew_fac[dim], + dd->comm->cellsize_min[dim], + dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + } + + if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM) + { + /* Communicate the 
boundaries and update cell_ns_x0/1 */ + dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1); + if (dd->bGridJump && dd->ndim > 1) + { + check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE); + } + } +} + +static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm) +{ + if (YY < npbcdim) + { + tcm[YY][XX] = -box[YY][XX]/box[YY][YY]; + } + else + { + tcm[YY][XX] = 0; + } + if (ZZ < npbcdim) + { + tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ]; + tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ]; + } + else + { + tcm[ZZ][XX] = 0; + tcm[ZZ][YY] = 0; + } +} + +static void check_screw_box(matrix box) +{ + /* Mathematical limitation */ + if (box[YY][XX] != 0 || box[ZZ][XX] != 0) + { + gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components"); + } + + /* Limitation due to the asymmetry of the eighth shell method */ + if (box[ZZ][YY] != 0) + { + gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported"); + } +} + +static void distribute_cg(FILE *fplog, gmx_large_int_t step, + matrix box, ivec tric_dir, t_block *cgs, rvec pos[], + gmx_domdec_t *dd) +{ + gmx_domdec_master_t *ma; + int **tmp_ind = NULL, *tmp_nalloc = NULL; + int i, icg, j, k, k0, k1, d, npbcdim; + matrix tcm; + rvec box_size, cg_cm; + ivec ind; + real nrcg, inv_ncg, pos_d; + atom_id *cgindex; + gmx_bool bUnbounded, bScrew; + + ma = dd->ma; + + if (tmp_ind == NULL) + { + snew(tmp_nalloc, dd->nnodes); + snew(tmp_ind, dd->nnodes); + for (i = 0; i < dd->nnodes; i++) + { + tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1); + snew(tmp_ind[i], tmp_nalloc[i]); + } + } + + /* Clear the count */ + for (i = 0; i < dd->nnodes; i++) + { + ma->ncg[i] = 0; + ma->nat[i] = 0; + } + + make_tric_corr_matrix(dd->npbcdim, box, tcm); + + cgindex = cgs->index; + + /* Compute the center of geometry for all charge groups */ + for (icg = 0; icg < cgs->nr; icg++) + { + k0 = cgindex[icg]; + k1 = cgindex[icg+1]; + nrcg = k1 - k0; + if (nrcg == 1) + { + copy_rvec(pos[k0], cg_cm); + } + else + { + inv_ncg = 1.0/nrcg; + + clear_rvec(cg_cm); + for (k = k0; (k < k1); k++) + { + rvec_inc(cg_cm, pos[k]); + } + for (d = 0; (d < DIM); d++) + { + cg_cm[d] *= inv_ncg; + } + } + /* Put the charge group in the box and determine the cell index */ + for (d = DIM-1; d >= 0; d--) + { + pos_d = cg_cm[d]; + if (d < dd->npbcdim) + { + bScrew = (dd->bScrewPBC && d == XX); + if (tric_dir[d] && dd->nc[d] > 1) + { + /* Use triclinic coordintates for this dimension */ + for (j = d+1; j < DIM; j++) + { + pos_d += cg_cm[j]*tcm[j][d]; + } + } + while (pos_d >= box[d][d]) + { + pos_d -= box[d][d]; + rvec_dec(cg_cm, box[d]); + if (bScrew) + { + cg_cm[YY] = box[YY][YY] - cg_cm[YY]; + cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ]; + } + for (k = k0; (k < k1); k++) + { + rvec_dec(pos[k], box[d]); + if (bScrew) + { + pos[k][YY] = box[YY][YY] - pos[k][YY]; + pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ]; + } + } + } + while (pos_d < 0) + { + pos_d += box[d][d]; + rvec_inc(cg_cm, box[d]); + if (bScrew) + { + cg_cm[YY] = box[YY][YY] - cg_cm[YY]; + cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ]; + } + for (k = k0; (k < k1); k++) + { + rvec_inc(pos[k], box[d]); + if (bScrew) + { + pos[k][YY] = box[YY][YY] - pos[k][YY]; + pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ]; + } + } + } + } + /* This could be done more efficiently */ + ind[d] = 0; + while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1]) + { + ind[d]++; + } + } + i = dd_index(dd->nc, ind); + if (ma->ncg[i] == tmp_nalloc[i]) + { + tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1); + srenew(tmp_ind[i], 
tmp_nalloc[i]); + } + tmp_ind[i][ma->ncg[i]] = icg; + ma->ncg[i]++; + ma->nat[i] += cgindex[icg+1] - cgindex[icg]; + } + + k1 = 0; + for (i = 0; i < dd->nnodes; i++) + { + ma->index[i] = k1; + for (k = 0; k < ma->ncg[i]; k++) + { + ma->cg[k1++] = tmp_ind[i][k]; + } + } + ma->index[dd->nnodes] = k1; + + for (i = 0; i < dd->nnodes; i++) + { + sfree(tmp_ind[i]); + } + sfree(tmp_ind); + sfree(tmp_nalloc); + + if (fplog) + { + char buf[22]; + fprintf(fplog, "Charge group distribution at step %s:", + gmx_step_str(step, buf)); + for (i = 0; i < dd->nnodes; i++) + { + fprintf(fplog, " %d", ma->ncg[i]); + } + fprintf(fplog, "\n"); + } +} + +static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd, + t_block *cgs, matrix box, gmx_ddbox_t *ddbox, + rvec pos[]) +{ + gmx_domdec_master_t *ma = NULL; + ivec npulse; + int i, cg_gl; + int *ibuf, buf2[2] = { 0, 0 }; + gmx_bool bMaster = DDMASTER(dd); + if (bMaster) + { + ma = dd->ma; + + if (dd->bScrewPBC) + { + check_screw_box(box); + } + + set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse); + + distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd); + for (i = 0; i < dd->nnodes; i++) + { + ma->ibuf[2*i] = ma->ncg[i]; + ma->ibuf[2*i+1] = ma->nat[i]; + } + ibuf = ma->ibuf; + } + else + { + ibuf = NULL; + } + dd_scatter(dd, 2*sizeof(int), ibuf, buf2); + + dd->ncg_home = buf2[0]; + dd->nat_home = buf2[1]; + dd->ncg_tot = dd->ncg_home; + dd->nat_tot = dd->nat_home; + if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0) + { + dd->cg_nalloc = over_alloc_dd(dd->ncg_home); + srenew(dd->index_gl, dd->cg_nalloc); + srenew(dd->cgindex, dd->cg_nalloc+1); + } + if (bMaster) + { + for (i = 0; i < dd->nnodes; i++) + { + ma->ibuf[i] = ma->ncg[i]*sizeof(int); + ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int); + } + } + + dd_scatterv(dd, + DDMASTER(dd) ? ma->ibuf : NULL, + DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL, + DDMASTER(dd) ? 
ma->cg : NULL, + dd->ncg_home*sizeof(int), dd->index_gl); + + /* Determine the home charge group sizes */ + dd->cgindex[0] = 0; + for (i = 0; i < dd->ncg_home; i++) + { + cg_gl = dd->index_gl[i]; + dd->cgindex[i+1] = + dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl]; + } + + if (debug) + { + fprintf(debug, "Home charge groups:\n"); + for (i = 0; i < dd->ncg_home; i++) + { + fprintf(debug, " %d", dd->index_gl[i]); + if (i % 10 == 9) + { + fprintf(debug, "\n"); + } + } + fprintf(debug, "\n"); + } +} + +static int compact_and_copy_vec_at(int ncg, int *move, + int *cgindex, + int nvec, int vec, + rvec *src, gmx_domdec_comm_t *comm, + gmx_bool bCompact) +{ + int m, icg, i, i0, i1, nrcg; + int home_pos; + int pos_vec[DIM*2]; + + home_pos = 0; + + for (m = 0; m < DIM*2; m++) + { + pos_vec[m] = 0; + } + + i0 = 0; + for (icg = 0; icg < ncg; icg++) + { + i1 = cgindex[icg+1]; + m = move[icg]; + if (m == -1) + { + if (bCompact) + { + /* Compact the home array in place */ + for (i = i0; i < i1; i++) + { + copy_rvec(src[i], src[home_pos++]); + } + } + } + else + { + /* Copy to the communication buffer */ + nrcg = i1 - i0; + pos_vec[m] += 1 + vec*nrcg; + for (i = i0; i < i1; i++) + { + copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]); + } + pos_vec[m] += (nvec - vec - 1)*nrcg; + } + if (!bCompact) + { + home_pos += i1 - i0; + } + i0 = i1; + } + + return home_pos; +} + +static int compact_and_copy_vec_cg(int ncg, int *move, + int *cgindex, + int nvec, rvec *src, gmx_domdec_comm_t *comm, + gmx_bool bCompact) +{ + int m, icg, i0, i1, nrcg; + int home_pos; + int pos_vec[DIM*2]; + + home_pos = 0; + + for (m = 0; m < DIM*2; m++) + { + pos_vec[m] = 0; + } + + i0 = 0; + for (icg = 0; icg < ncg; icg++) + { + i1 = cgindex[icg+1]; + m = move[icg]; + if (m == -1) + { + if (bCompact) + { + /* Compact the home array in place */ + copy_rvec(src[icg], src[home_pos++]); + } + } + else + { + nrcg = i1 - i0; + /* Copy to the communication buffer */ + copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]); + pos_vec[m] += 1 + nrcg*nvec; + } + i0 = i1; + } + if (!bCompact) + { + home_pos = ncg; + } + + return home_pos; +} + +static int compact_ind(int ncg, int *move, + int *index_gl, int *cgindex, + int *gatindex, + gmx_ga2la_t ga2la, char *bLocalCG, + int *cginfo) +{ + int cg, nat, a0, a1, a, a_gl; + int home_pos; + + home_pos = 0; + nat = 0; + for (cg = 0; cg < ncg; cg++) + { + a0 = cgindex[cg]; + a1 = cgindex[cg+1]; + if (move[cg] == -1) + { + /* Compact the home arrays in place. + * Anything that can be done here avoids access to global arrays. 
+ */ + cgindex[home_pos] = nat; + for (a = a0; a < a1; a++) + { + a_gl = gatindex[a]; + gatindex[nat] = a_gl; + /* The cell number stays 0, so we don't need to set it */ + ga2la_change_la(ga2la, a_gl, nat); + nat++; + } + index_gl[home_pos] = index_gl[cg]; + cginfo[home_pos] = cginfo[cg]; + /* The charge group remains local, so bLocalCG does not change */ + home_pos++; + } + else + { + /* Clear the global indices */ + for (a = a0; a < a1; a++) + { + ga2la_del(ga2la, gatindex[a]); + } + if (bLocalCG) + { + bLocalCG[index_gl[cg]] = FALSE; + } + } + } + cgindex[home_pos] = nat; + + return home_pos; +} + +static void clear_and_mark_ind(int ncg, int *move, + int *index_gl, int *cgindex, int *gatindex, + gmx_ga2la_t ga2la, char *bLocalCG, + int *cell_index) +{ + int cg, a0, a1, a; + + for (cg = 0; cg < ncg; cg++) + { + if (move[cg] >= 0) + { + a0 = cgindex[cg]; + a1 = cgindex[cg+1]; + /* Clear the global indices */ + for (a = a0; a < a1; a++) + { + ga2la_del(ga2la, gatindex[a]); + } + if (bLocalCG) + { + bLocalCG[index_gl[cg]] = FALSE; + } + /* Signal that this cg has moved using the ns cell index. + * Here we set it to -1. fill_grid will change it + * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells. + */ + cell_index[cg] = -1; + } + } +} + +static void print_cg_move(FILE *fplog, + gmx_domdec_t *dd, + gmx_large_int_t step, int cg, int dim, int dir, + gmx_bool bHaveLimitdAndCMOld, real limitd, + rvec cm_old, rvec cm_new, real pos_d) +{ + gmx_domdec_comm_t *comm; + char buf[22]; + + comm = dd->comm; + + fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf)); + if (bHaveLimitdAndCMOld) + { + fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n", + ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim)); + } + else + { + fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n", + ddglatnr(dd, dd->cgindex[cg]), dim2char(dim)); + } + fprintf(fplog, "distance out of cell %f\n", + dir == 1 ? 
pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]); + if (bHaveLimitdAndCMOld) + { + fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n", + cm_old[XX], cm_old[YY], cm_old[ZZ]); + } + fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n", + cm_new[XX], cm_new[YY], cm_new[ZZ]); + fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n", + dim2char(dim), + comm->old_cell_x0[dim], comm->old_cell_x1[dim]); + fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n", + dim2char(dim), + comm->cell_x0[dim], comm->cell_x1[dim]); +} + +static void cg_move_error(FILE *fplog, + gmx_domdec_t *dd, + gmx_large_int_t step, int cg, int dim, int dir, + gmx_bool bHaveLimitdAndCMOld, real limitd, + rvec cm_old, rvec cm_new, real pos_d) +{ + if (fplog) + { + print_cg_move(fplog, dd, step, cg, dim, dir, + bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d); + } + print_cg_move(stderr, dd, step, cg, dim, dir, + bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d); + gmx_fatal(FARGS, + "A charge group moved too far between two domain decomposition steps\n" + "This usually means that your system is not well equilibrated"); +} + +static void rotate_state_atom(t_state *state, int a) +{ + int est; + + for (est = 0; est < estNR; est++) + { + if (EST_DISTR(est) && (state->flags & (1<x[a][YY] = state->box[YY][YY] - state->x[a][YY]; + state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ]; + break; + case estV: + state->v[a][YY] = -state->v[a][YY]; + state->v[a][ZZ] = -state->v[a][ZZ]; + break; + case estSDX: + state->sd_X[a][YY] = -state->sd_X[a][YY]; + state->sd_X[a][ZZ] = -state->sd_X[a][ZZ]; + break; + case estCGP: + state->cg_p[a][YY] = -state->cg_p[a][YY]; + state->cg_p[a][ZZ] = -state->cg_p[a][ZZ]; + break; + case estDISRE_INITF: + case estDISRE_RM3TAV: + case estORIRE_INITF: + case estORIRE_DTAV: + /* These are distances, so not affected by rotation */ + break; + default: + gmx_incons("Unknown state entry encountered in rotate_state_atom"); + } + } + } +} + +static int *get_moved(gmx_domdec_comm_t *comm, int natoms) +{ + if (natoms > comm->moved_nalloc) + { + /* Contents should be preserved here */ + comm->moved_nalloc = over_alloc_dd(natoms); + srenew(comm->moved, comm->moved_nalloc); + } + + return comm->moved; +} + +static void calc_cg_move(FILE *fplog, gmx_large_int_t step, + gmx_domdec_t *dd, + t_state *state, + ivec tric_dir, matrix tcm, + rvec cell_x0, rvec cell_x1, + rvec limitd, rvec limit0, rvec limit1, + const int *cgindex, + int cg_start, int cg_end, + rvec *cg_cm, + int *move) +{ + int npbcdim; + int c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d; + int mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec; + int flag; + gmx_bool bScrew; + ivec dev; + real inv_ncg, pos_d; + rvec cm_new; + + npbcdim = dd->npbcdim; + + for (cg = cg_start; cg < cg_end; cg++) + { + k0 = cgindex[cg]; + k1 = cgindex[cg+1]; + nrcg = k1 - k0; + if (nrcg == 1) + { + copy_rvec(state->x[k0], cm_new); + } + else + { + inv_ncg = 1.0/nrcg; + + clear_rvec(cm_new); + for (k = k0; (k < k1); k++) + { + rvec_inc(cm_new, state->x[k]); + } + for (d = 0; (d < DIM); d++) + { + cm_new[d] = inv_ncg*cm_new[d]; + } + } + + clear_ivec(dev); + /* Do pbc and check DD cell boundary crossings */ + for (d = DIM-1; d >= 0; d--) + { + if (dd->nc[d] > 1) + { + bScrew = (dd->bScrewPBC && d == XX); + /* Determine the location of this cg in lattice coordinates */ + pos_d = cm_new[d]; + if (tric_dir[d]) + { + for (d2 = d+1; d2 < DIM; d2++) + { + pos_d += cm_new[d2]*tcm[d2][d]; + } + } + /* Put the charge group in 
the triclinic unit-cell */ + if (pos_d >= cell_x1[d]) + { + if (pos_d >= limit1[d]) + { + cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d], + cg_cm[cg], cm_new, pos_d); + } + dev[d] = 1; + if (dd->ci[d] == dd->nc[d] - 1) + { + rvec_dec(cm_new, state->box[d]); + if (bScrew) + { + cm_new[YY] = state->box[YY][YY] - cm_new[YY]; + cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ]; + } + for (k = k0; (k < k1); k++) + { + rvec_dec(state->x[k], state->box[d]); + if (bScrew) + { + rotate_state_atom(state, k); + } + } + } + } + else if (pos_d < cell_x0[d]) + { + if (pos_d < limit0[d]) + { + cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d], + cg_cm[cg], cm_new, pos_d); + } + dev[d] = -1; + if (dd->ci[d] == 0) + { + rvec_inc(cm_new, state->box[d]); + if (bScrew) + { + cm_new[YY] = state->box[YY][YY] - cm_new[YY]; + cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ]; + } + for (k = k0; (k < k1); k++) + { + rvec_inc(state->x[k], state->box[d]); + if (bScrew) + { + rotate_state_atom(state, k); + } + } + } + } + } + else if (d < npbcdim) + { + /* Put the charge group in the rectangular unit-cell */ + while (cm_new[d] >= state->box[d][d]) + { + rvec_dec(cm_new, state->box[d]); + for (k = k0; (k < k1); k++) + { + rvec_dec(state->x[k], state->box[d]); + } + } + while (cm_new[d] < 0) + { + rvec_inc(cm_new, state->box[d]); + for (k = k0; (k < k1); k++) + { + rvec_inc(state->x[k], state->box[d]); + } + } + } + } + + copy_rvec(cm_new, cg_cm[cg]); + + /* Determine where this cg should go */ + flag = 0; + mc = -1; + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + if (dev[dim] == 1) + { + flag |= DD_FLAG_FW(d); + if (mc == -1) + { + mc = d*2; + } + } + else if (dev[dim] == -1) + { + flag |= DD_FLAG_BW(d); + if (mc == -1) + { + if (dd->nc[dim] > 2) + { + mc = d*2 + 1; + } + else + { + mc = d*2; + } + } + } + } + /* Temporarily store the flag in move */ + move[cg] = mc + flag; + } +} + +static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step, + gmx_domdec_t *dd, ivec tric_dir, + t_state *state, rvec **f, + t_forcerec *fr, + gmx_bool bCompact, + t_nrnb *nrnb, + int *ncg_stay_home, + int *ncg_moved) +{ + int *move; + int npbcdim; + int ncg[DIM*2], nat[DIM*2]; + int c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d; + int mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec; + int sbuf[2], rbuf[2]; + int home_pos_cg, home_pos_at, buf_pos; + int flag; + gmx_bool bV = FALSE, bSDX = FALSE, bCGP = FALSE; + gmx_bool bScrew; + ivec dev; + real inv_ncg, pos_d; + matrix tcm; + rvec *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new; + atom_id *cgindex; + cginfo_mb_t *cginfo_mb; + gmx_domdec_comm_t *comm; + int *moved; + int nthread, thread; + + if (dd->bScrewPBC) + { + check_screw_box(state->box); + } + + comm = dd->comm; + if (fr->cutoff_scheme == ecutsGROUP) + { + cg_cm = fr->cg_cm; + } + + for (i = 0; i < estNR; i++) + { + if (EST_DISTR(i)) + { + switch (i) + { + case estX: /* Always present */ break; + case estV: bV = (state->flags & (1<flags & (1<flags & (1<ncg_tot > comm->nalloc_int) + { + comm->nalloc_int = over_alloc_dd(dd->ncg_tot); + srenew(comm->buf_int, comm->nalloc_int); + } + move = comm->buf_int; + + /* Clear the count */ + for (c = 0; c < dd->ndim*2; c++) + { + ncg[c] = 0; + nat[c] = 0; + } + + npbcdim = dd->npbcdim; + + for (d = 0; (d < DIM); d++) + { + limitd[d] = dd->comm->cellsize_min[d]; + if (d >= npbcdim && dd->ci[d] == 0) + { + cell_x0[d] = -GMX_FLOAT_MAX; + } + else + { + cell_x0[d] = comm->cell_x0[d]; + } + if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1) 
+ { + cell_x1[d] = GMX_FLOAT_MAX; + } + else + { + cell_x1[d] = comm->cell_x1[d]; + } + if (d < npbcdim) + { + limit0[d] = comm->old_cell_x0[d] - limitd[d]; + limit1[d] = comm->old_cell_x1[d] + limitd[d]; + } + else + { + /* We check after communication if a charge group moved + * more than one cell. Set the pre-comm check limit to float_max. + */ + limit0[d] = -GMX_FLOAT_MAX; + limit1[d] = GMX_FLOAT_MAX; + } + } + + make_tric_corr_matrix(npbcdim, state->box, tcm); + + cgindex = dd->cgindex; + + nthread = gmx_omp_nthreads_get(emntDomdec); + + /* Compute the center of geometry for all home charge groups + * and put them in the box and determine where they should go. + */ +#pragma omp parallel for num_threads(nthread) schedule(static) + for (thread = 0; thread < nthread; thread++) + { + calc_cg_move(fplog, step, dd, state, tric_dir, tcm, + cell_x0, cell_x1, limitd, limit0, limit1, + cgindex, + ( thread *dd->ncg_home)/nthread, + ((thread+1)*dd->ncg_home)/nthread, + fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x, + move); + } + + for (cg = 0; cg < dd->ncg_home; cg++) + { + if (move[cg] >= 0) + { + mc = move[cg]; + flag = mc & ~DD_FLAG_NRCG; + mc = mc & DD_FLAG_NRCG; + move[cg] = mc; + + if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc]) + { + comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1); + srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS); + } + comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg]; + /* We store the cg size in the lower 16 bits + * and the place where the charge group should go + * in the next 6 bits. This saves some communication volume. + */ + nrcg = cgindex[cg+1] - cgindex[cg]; + comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag; + ncg[mc] += 1; + nat[mc] += nrcg; + } + } + + inc_nrnb(nrnb, eNR_CGCM, dd->nat_home); + inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home); + + *ncg_moved = 0; + for (i = 0; i < dd->ndim*2; i++) + { + *ncg_moved += ncg[i]; + } + + nvec = 1; + if (bV) + { + nvec++; + } + if (bSDX) + { + nvec++; + } + if (bCGP) + { + nvec++; + } + + /* Make sure the communication buffers are large enough */ + for (mc = 0; mc < dd->ndim*2; mc++) + { + nvr = ncg[mc] + nat[mc]*nvec; + if (nvr > comm->cgcm_state_nalloc[mc]) + { + comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr); + srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]); + } + } + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + /* Recalculating cg_cm might be cheaper than communicating, + * but that could give rise to rounding issues. + */ + home_pos_cg = + compact_and_copy_vec_cg(dd->ncg_home, move, cgindex, + nvec, cg_cm, comm, bCompact); + break; + case ecutsVERLET: + /* Without charge groups we send the moved atom coordinates + * over twice. This is so the code below can be used without + * many conditionals for both for with and without charge groups. 
+ */ + home_pos_cg = + compact_and_copy_vec_cg(dd->ncg_home, move, cgindex, + nvec, state->x, comm, FALSE); + if (bCompact) + { + home_pos_cg -= *ncg_moved; + } + break; + default: + gmx_incons("unimplemented"); + home_pos_cg = 0; + } + + vec = 0; + home_pos_at = + compact_and_copy_vec_at(dd->ncg_home, move, cgindex, + nvec, vec++, state->x, comm, bCompact); + if (bV) + { + compact_and_copy_vec_at(dd->ncg_home, move, cgindex, + nvec, vec++, state->v, comm, bCompact); + } + if (bSDX) + { + compact_and_copy_vec_at(dd->ncg_home, move, cgindex, + nvec, vec++, state->sd_X, comm, bCompact); + } + if (bCGP) + { + compact_and_copy_vec_at(dd->ncg_home, move, cgindex, + nvec, vec++, state->cg_p, comm, bCompact); + } + + if (bCompact) + { + compact_ind(dd->ncg_home, move, + dd->index_gl, dd->cgindex, dd->gatindex, + dd->ga2la, comm->bLocalCG, + fr->cginfo); + } + else + { + if (fr->cutoff_scheme == ecutsVERLET) + { + moved = get_moved(comm, dd->ncg_home); + + for (k = 0; k < dd->ncg_home; k++) + { + moved[k] = 0; + } + } + else + { + moved = fr->ns.grid->cell_index; + } + + clear_and_mark_ind(dd->ncg_home, move, + dd->index_gl, dd->cgindex, dd->gatindex, + dd->ga2la, comm->bLocalCG, + moved); + } + + cginfo_mb = fr->cginfo_mb; + + *ncg_stay_home = home_pos_cg; + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + ncg_recv = 0; + nat_recv = 0; + nvr = 0; + for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++) + { + cdd = d*2 + dir; + /* Communicate the cg and atom counts */ + sbuf[0] = ncg[cdd]; + sbuf[1] = nat[cdd]; + if (debug) + { + fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n", + d, dir, sbuf[0], sbuf[1]); + } + dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2); + + if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int) + { + comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS); + srenew(comm->buf_int, comm->nalloc_int); + } + + /* Communicate the charge group indices, sizes and flags */ + dd_sendrecv_int(dd, d, dir, + comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS, + comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS); + + nvs = ncg[cdd] + nat[cdd]*nvec; + i = rbuf[0] + rbuf[1] *nvec; + vec_rvec_check_alloc(&comm->vbuf, nvr+i); + + /* Communicate cgcm and state */ + dd_sendrecv_rvec(dd, d, dir, + comm->cgcm_state[cdd], nvs, + comm->vbuf.v+nvr, i); + ncg_recv += rbuf[0]; + nat_recv += rbuf[1]; + nvr += i; + } + + /* Process the received charge groups */ + buf_pos = 0; + for (cg = 0; cg < ncg_recv; cg++) + { + flag = comm->buf_int[cg*DD_CGIBS+1]; + + if (dim >= npbcdim && dd->nc[dim] > 2) + { + /* No pbc in this dim and more than one domain boundary. + * We do a separate check if a charge group didn't move too far. + */ + if (((flag & DD_FLAG_FW(d)) && + comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) || + ((flag & DD_FLAG_BW(d)) && + comm->vbuf.v[buf_pos][dim] < cell_x0[dim])) + { + cg_move_error(fplog, dd, step, cg, dim, + (flag & DD_FLAG_FW(d)) ? 1 : 0, + FALSE, 0, + comm->vbuf.v[buf_pos], + comm->vbuf.v[buf_pos], + comm->vbuf.v[buf_pos][dim]); + } + } + + mc = -1; + if (d < dd->ndim-1) + { + /* Check which direction this cg should go */ + for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++) + { + if (dd->bGridJump) + { + /* The cell boundaries for dimension d2 are not equal + * for each cell row of the lower dimension(s), + * therefore we might need to redetermine where + * this cg should go. + */ + dim2 = dd->dim[d2]; + /* If this cg crosses the box boundary in dimension d2 + * we can use the communicated flag, so we do not + * have to worry about pbc. 
+ */ + if (!((dd->ci[dim2] == dd->nc[dim2]-1 && + (flag & DD_FLAG_FW(d2))) || + (dd->ci[dim2] == 0 && + (flag & DD_FLAG_BW(d2))))) + { + /* Clear the two flags for this dimension */ + flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2)); + /* Determine the location of this cg + * in lattice coordinates + */ + pos_d = comm->vbuf.v[buf_pos][dim2]; + if (tric_dir[dim2]) + { + for (d3 = dim2+1; d3 < DIM; d3++) + { + pos_d += + comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2]; + } + } + /* Check of we are not at the box edge. + * pbc is only handled in the first step above, + * but this check could move over pbc while + * the first step did not due to different rounding. + */ + if (pos_d >= cell_x1[dim2] && + dd->ci[dim2] != dd->nc[dim2]-1) + { + flag |= DD_FLAG_FW(d2); + } + else if (pos_d < cell_x0[dim2] && + dd->ci[dim2] != 0) + { + flag |= DD_FLAG_BW(d2); + } + comm->buf_int[cg*DD_CGIBS+1] = flag; + } + } + /* Set to which neighboring cell this cg should go */ + if (flag & DD_FLAG_FW(d2)) + { + mc = d2*2; + } + else if (flag & DD_FLAG_BW(d2)) + { + if (dd->nc[dd->dim[d2]] > 2) + { + mc = d2*2+1; + } + else + { + mc = d2*2; + } + } + } + } + + nrcg = flag & DD_FLAG_NRCG; + if (mc == -1) + { + if (home_pos_cg+1 > dd->cg_nalloc) + { + dd->cg_nalloc = over_alloc_dd(home_pos_cg+1); + srenew(dd->index_gl, dd->cg_nalloc); + srenew(dd->cgindex, dd->cg_nalloc+1); + } + /* Set the global charge group index and size */ + dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS]; + dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg; + /* Copy the state from the buffer */ + dd_check_alloc_ncg(fr, state, f, home_pos_cg+1); + if (fr->cutoff_scheme == ecutsGROUP) + { + cg_cm = fr->cg_cm; + copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]); + } + buf_pos++; + + /* Set the cginfo */ + fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb, + dd->index_gl[home_pos_cg]); + if (comm->bLocalCG) + { + comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE; + } + + if (home_pos_at+nrcg > state->nalloc) + { + dd_realloc_state(state, f, home_pos_at+nrcg); + } + for (i = 0; i < nrcg; i++) + { + copy_rvec(comm->vbuf.v[buf_pos++], + state->x[home_pos_at+i]); + } + if (bV) + { + for (i = 0; i < nrcg; i++) + { + copy_rvec(comm->vbuf.v[buf_pos++], + state->v[home_pos_at+i]); + } + } + if (bSDX) + { + for (i = 0; i < nrcg; i++) + { + copy_rvec(comm->vbuf.v[buf_pos++], + state->sd_X[home_pos_at+i]); + } + } + if (bCGP) + { + for (i = 0; i < nrcg; i++) + { + copy_rvec(comm->vbuf.v[buf_pos++], + state->cg_p[home_pos_at+i]); + } + } + home_pos_cg += 1; + home_pos_at += nrcg; + } + else + { + /* Reallocate the buffers if necessary */ + if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc]) + { + comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1); + srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS); + } + nvr = ncg[mc] + nat[mc]*nvec; + if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc]) + { + comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec); + srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]); + } + /* Copy from the receive to the send buffers */ + memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS, + comm->buf_int + cg*DD_CGIBS, + DD_CGIBS*sizeof(int)); + memcpy(comm->cgcm_state[mc][nvr], + comm->vbuf.v[buf_pos], + (1+nrcg*nvec)*sizeof(rvec)); + buf_pos += 1 + nrcg*nvec; + ncg[mc] += 1; + nat[mc] += nrcg; + } + } + } + + /* With sorting (!bCompact) the indices are now only partially up to date + * and ncg_home and nat_home are not the real count, since there are + * "holes" in the arrays for the charge groups that moved to 
neighbors. + */ + if (fr->cutoff_scheme == ecutsVERLET) + { + moved = get_moved(comm, home_pos_cg); + + for (i = dd->ncg_home; i < home_pos_cg; i++) + { + moved[i] = 0; + } + } + dd->ncg_home = home_pos_cg; + dd->nat_home = home_pos_at; + + if (debug) + { + fprintf(debug, + "Finished repartitioning: cgs moved out %d, new home %d\n", + *ncg_moved, dd->ncg_home-*ncg_moved); + + } +} + +void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl) +{ + dd->comm->cycl[ddCycl] += cycles; + dd->comm->cycl_n[ddCycl]++; + if (cycles > dd->comm->cycl_max[ddCycl]) + { + dd->comm->cycl_max[ddCycl] = cycles; + } +} + +static double force_flop_count(t_nrnb *nrnb) +{ + int i; + double sum; + const char *name; + + sum = 0; + for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++) + { + /* To get closer to the real timings, we half the count + * for the normal loops and again half it for water loops. + */ + name = nrnb_str(i); + if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL) + { + sum += nrnb->n[i]*0.25*cost_nrnb(i); + } + else + { + sum += nrnb->n[i]*0.50*cost_nrnb(i); + } + } + for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++) + { + name = nrnb_str(i); + if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL) + { + sum += nrnb->n[i]*cost_nrnb(i); + } + } + for (i = eNR_BONDS; i <= eNR_WALLS; i++) + { + sum += nrnb->n[i]*cost_nrnb(i); + } + + return sum; +} + +void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb) +{ + if (dd->comm->eFlop) + { + dd->comm->flop -= force_flop_count(nrnb); + } +} +void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb) +{ + if (dd->comm->eFlop) + { + dd->comm->flop += force_flop_count(nrnb); + dd->comm->flop_n++; + } +} + +static void clear_dd_cycle_counts(gmx_domdec_t *dd) +{ + int i; + + for (i = 0; i < ddCyclNr; i++) + { + dd->comm->cycl[i] = 0; + dd->comm->cycl_n[i] = 0; + dd->comm->cycl_max[i] = 0; + } + dd->comm->flop = 0; + dd->comm->flop_n = 0; +} + +static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle) +{ + gmx_domdec_comm_t *comm; + gmx_domdec_load_t *load; + gmx_domdec_root_t *root = NULL; + int d, dim, cid, i, pos; + float cell_frac = 0, sbuf[DD_NLOAD_MAX]; + gmx_bool bSepPME; + + if (debug) + { + fprintf(debug, "get_load_distribution start\n"); + } + + wallcycle_start(wcycle, ewcDDCOMMLOAD); + + comm = dd->comm; + + bSepPME = (dd->pme_nodeid >= 0); + + for (d = dd->ndim-1; d >= 0; d--) + { + dim = dd->dim[d]; + /* Check if we participate in the communication in this dimension */ + if (d == dd->ndim-1 || + (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0)) + { + load = &comm->load[d]; + if (dd->bGridJump) + { + cell_frac = comm->cell_f1[d] - comm->cell_f0[d]; + } + pos = 0; + if (d == dd->ndim-1) + { + sbuf[pos++] = dd_force_load(comm); + sbuf[pos++] = sbuf[0]; + if (dd->bGridJump) + { + sbuf[pos++] = sbuf[0]; + sbuf[pos++] = cell_frac; + if (d > 0) + { + sbuf[pos++] = comm->cell_f_max0[d]; + sbuf[pos++] = comm->cell_f_min1[d]; + } + } + if (bSepPME) + { + sbuf[pos++] = comm->cycl[ddCyclPPduringPME]; + sbuf[pos++] = comm->cycl[ddCyclPME]; + } + } + else + { + sbuf[pos++] = comm->load[d+1].sum; + sbuf[pos++] = comm->load[d+1].max; + if (dd->bGridJump) + { + sbuf[pos++] = comm->load[d+1].sum_m; + sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac; + sbuf[pos++] = comm->load[d+1].flags; + if (d > 0) + { + sbuf[pos++] = comm->cell_f_max0[d]; + sbuf[pos++] = comm->cell_f_min1[d]; + } + } + if (bSepPME) + { + sbuf[pos++] = comm->load[d+1].mdf; + sbuf[pos++] = comm->load[d+1].pme; + } + } + load->nload = pos; + /* 
Communicate a row in DD direction d. + * The communicators are setup such that the root always has rank 0. + */ +#ifdef GMX_MPI + MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE, + load->load, load->nload*sizeof(float), MPI_BYTE, + 0, comm->mpi_comm_load[d]); +#endif + if (dd->ci[dim] == dd->master_ci[dim]) + { + /* We are the root, process this row */ + if (comm->bDynLoadBal) + { + root = comm->root[d]; + } + load->sum = 0; + load->max = 0; + load->sum_m = 0; + load->cvol_min = 1; + load->flags = 0; + load->mdf = 0; + load->pme = 0; + pos = 0; + for (i = 0; i < dd->nc[dim]; i++) + { + load->sum += load->load[pos++]; + load->max = max(load->max, load->load[pos]); + pos++; + if (dd->bGridJump) + { + if (root->bLimited) + { + /* This direction could not be load balanced properly, + * therefore we need to use the maximum iso the average load. + */ + load->sum_m = max(load->sum_m, load->load[pos]); + } + else + { + load->sum_m += load->load[pos]; + } + pos++; + load->cvol_min = min(load->cvol_min, load->load[pos]); + pos++; + if (d < dd->ndim-1) + { + load->flags = (int)(load->load[pos++] + 0.5); + } + if (d > 0) + { + root->cell_f_max0[i] = load->load[pos++]; + root->cell_f_min1[i] = load->load[pos++]; + } + } + if (bSepPME) + { + load->mdf = max(load->mdf, load->load[pos]); + pos++; + load->pme = max(load->pme, load->load[pos]); + pos++; + } + } + if (comm->bDynLoadBal && root->bLimited) + { + load->sum_m *= dd->nc[dim]; + load->flags |= (1<nload += dd_load_count(comm); + comm->load_step += comm->cycl[ddCyclStep]; + comm->load_sum += comm->load[0].sum; + comm->load_max += comm->load[0].max; + if (comm->bDynLoadBal) + { + for (d = 0; d < dd->ndim; d++) + { + if (comm->load[0].flags & (1<load_lim[d]++; + } + } + } + if (bSepPME) + { + comm->load_mdf += comm->load[0].mdf; + comm->load_pme += comm->load[0].pme; + } + } + + wallcycle_stop(wcycle, ewcDDCOMMLOAD); + + if (debug) + { + fprintf(debug, "get_load_distribution finished\n"); + } +} + +static float dd_force_imb_perf_loss(gmx_domdec_t *dd) +{ + /* Return the relative performance loss on the total run time + * due to the force calculation load imbalance. + */ + if (dd->comm->nload > 0) + { + return + (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/ + (dd->comm->load_step*dd->nnodes); + } + else + { + return 0; + } +} + +static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd) +{ + char buf[STRLEN]; + int npp, npme, nnodes, d, limp; + float imbal, pme_f_ratio, lossf, lossp = 0; + gmx_bool bLim; + gmx_domdec_comm_t *comm; + + comm = dd->comm; + if (DDMASTER(dd) && comm->nload > 0) + { + npp = dd->nnodes; + npme = (dd->pme_nodeid >= 0) ? 
comm->npmenodes : 0; + nnodes = npp + npme; + imbal = comm->load_max*npp/comm->load_sum - 1; + lossf = dd_force_imb_perf_loss(dd); + sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100); + fprintf(fplog, "%s", buf); + fprintf(stderr, "\n"); + fprintf(stderr, "%s", buf); + sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100); + fprintf(fplog, "%s", buf); + fprintf(stderr, "%s", buf); + bLim = FALSE; + if (comm->bDynLoadBal) + { + sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:"); + for (d = 0; d < dd->ndim; d++) + { + limp = (200*comm->load_lim[d]+1)/(2*comm->nload); + sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp); + if (limp >= 50) + { + bLim = TRUE; + } + } + sprintf(buf+strlen(buf), "\n"); + fprintf(fplog, "%s", buf); + fprintf(stderr, "%s", buf); + } + if (npme > 0) + { + pme_f_ratio = comm->load_pme/comm->load_mdf; + lossp = (comm->load_pme -comm->load_mdf)/comm->load_step; + if (lossp <= 0) + { + lossp *= (float)npme/(float)nnodes; + } + else + { + lossp *= (float)npp/(float)nnodes; + } + sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio); + fprintf(fplog, "%s", buf); + fprintf(stderr, "%s", buf); + sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100); + fprintf(fplog, "%s", buf); + fprintf(stderr, "%s", buf); + } + fprintf(fplog, "\n"); + fprintf(stderr, "\n"); + + if (lossf >= DD_PERF_LOSS) + { + sprintf(buf, + "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n" + " in the domain decomposition.\n", lossf*100); + if (!comm->bDynLoadBal) + { + sprintf(buf+strlen(buf), " You might want to use dynamic load balancing (option -dlb.)\n"); + } + else if (bLim) + { + sprintf(buf+strlen(buf), " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n"); + } + fprintf(fplog, "%s\n", buf); + fprintf(stderr, "%s\n", buf); + } + if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS) + { + sprintf(buf, + "NOTE: %.1f %% performance was lost because the PME nodes\n" + " had %s work to do than the PP nodes.\n" + " You might want to %s the number of PME nodes\n" + " or %s the cut-off and the grid spacing.\n", + fabs(lossp*100), + (lossp < 0) ? "less" : "more", + (lossp < 0) ? "decrease" : "increase", + (lossp < 0) ? "decrease" : "increase"); + fprintf(fplog, "%s\n", buf); + fprintf(stderr, "%s\n", buf); + } + } +} + +static float dd_vol_min(gmx_domdec_t *dd) +{ + return dd->comm->load[0].cvol_min*dd->nnodes; +} + +static gmx_bool dd_load_flags(gmx_domdec_t *dd) +{ + return dd->comm->load[0].flags; +} + +static float dd_f_imbal(gmx_domdec_t *dd) +{ + return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1; +} + +float dd_pme_f_ratio(gmx_domdec_t *dd) +{ + if (dd->comm->cycl_n[ddCyclPME] > 0) + { + return dd->comm->load[0].pme/dd->comm->load[0].mdf; + } + else + { + return -1.0; + } +} + +static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step) +{ + int flags, d; + char buf[22]; + + flags = dd_load_flags(dd); + if (flags) + { + fprintf(fplog, + "DD load balancing is limited by minimum cell size in dimension"); + for (d = 0; d < dd->ndim; d++) + { + if (flags & (1<dim[d])); + } + } + fprintf(fplog, "\n"); + } + fprintf(fplog, "DD step %s", gmx_step_str(step, buf)); + if (dd->comm->bDynLoadBal) + { + fprintf(fplog, " vol min/aver %5.3f%c", + dd_vol_min(dd), flags ? '!' 
: ' '); + } + fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100); + if (dd->comm->cycl_n[ddCyclPME]) + { + fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd)); + } + fprintf(fplog, "\n\n"); +} + +static void dd_print_load_verbose(gmx_domdec_t *dd) +{ + if (dd->comm->bDynLoadBal) + { + fprintf(stderr, "vol %4.2f%c ", + dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' '); + } + fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5)); + if (dd->comm->cycl_n[ddCyclPME]) + { + fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd)); + } +} + +#ifdef GMX_MPI +static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc) +{ + MPI_Comm c_row; + int dim, i, rank; + ivec loc_c; + gmx_domdec_root_t *root; + gmx_bool bPartOfGroup = FALSE; + + dim = dd->dim[dim_ind]; + copy_ivec(loc, loc_c); + for (i = 0; i < dd->nc[dim]; i++) + { + loc_c[dim] = i; + rank = dd_index(dd->nc, loc_c); + if (rank == dd->rank) + { + /* This process is part of the group */ + bPartOfGroup = TRUE; + } + } + MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank, + &c_row); + if (bPartOfGroup) + { + dd->comm->mpi_comm_load[dim_ind] = c_row; + if (dd->comm->eDLB != edlbNO) + { + if (dd->ci[dim] == dd->master_ci[dim]) + { + /* This is the root process of this row */ + snew(dd->comm->root[dim_ind], 1); + root = dd->comm->root[dim_ind]; + snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind)); + snew(root->old_cell_f, dd->nc[dim]+1); + snew(root->bCellMin, dd->nc[dim]); + if (dim_ind > 0) + { + snew(root->cell_f_max0, dd->nc[dim]); + snew(root->cell_f_min1, dd->nc[dim]); + snew(root->bound_min, dd->nc[dim]); + snew(root->bound_max, dd->nc[dim]); + } + snew(root->buf_ncd, dd->nc[dim]); + } + else + { + /* This is not a root process, we only need to receive cell_f */ + snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind)); + } + } + if (dd->ci[dim] == dd->master_ci[dim]) + { + snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX); + } + } +} +#endif + +static void make_load_communicators(gmx_domdec_t *dd) +{ +#ifdef GMX_MPI + int dim0, dim1, i, j; + ivec loc; + + if (debug) + { + fprintf(debug, "Making load communicators\n"); + } + + snew(dd->comm->load, dd->ndim); + snew(dd->comm->mpi_comm_load, dd->ndim); + + clear_ivec(loc); + make_load_communicator(dd, 0, loc); + if (dd->ndim > 1) + { + dim0 = dd->dim[0]; + for (i = 0; i < dd->nc[dim0]; i++) + { + loc[dim0] = i; + make_load_communicator(dd, 1, loc); + } + } + if (dd->ndim > 2) + { + dim0 = dd->dim[0]; + for (i = 0; i < dd->nc[dim0]; i++) + { + loc[dim0] = i; + dim1 = dd->dim[1]; + for (j = 0; j < dd->nc[dim1]; j++) + { + loc[dim1] = j; + make_load_communicator(dd, 2, loc); + } + } + } + + if (debug) + { + fprintf(debug, "Finished making load communicators\n"); + } +#endif +} + +void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd) +{ + gmx_bool bZYX; + int d, dim, i, j, m; + ivec tmp, s; + int nzone, nzonep; + ivec dd_zp[DD_MAXIZONE]; + gmx_domdec_zones_t *zones; + gmx_domdec_ns_ranges_t *izone; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + copy_ivec(dd->ci, tmp); + tmp[dim] = (tmp[dim] + 1) % dd->nc[dim]; + dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp); + copy_ivec(dd->ci, tmp); + tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim]; + dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp); + if (debug) + { + fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n", + dd->rank, dim, + dd->neighbor[d][0], + dd->neighbor[d][1]); + } + } + + if (fplog) + { + fprintf(fplog, "\nMaking %dD domain decomposition 
grid %d x %d x %d, home cell index %d %d %d\n\n", + dd->ndim, + dd->nc[XX], dd->nc[YY], dd->nc[ZZ], + dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + switch (dd->ndim) + { + case 3: + nzone = dd_z3n; + nzonep = dd_zp3n; + for (i = 0; i < nzonep; i++) + { + copy_ivec(dd_zp3[i], dd_zp[i]); + } + break; + case 2: + nzone = dd_z2n; + nzonep = dd_zp2n; + for (i = 0; i < nzonep; i++) + { + copy_ivec(dd_zp2[i], dd_zp[i]); + } + break; + case 1: + nzone = dd_z1n; + nzonep = dd_zp1n; + for (i = 0; i < nzonep; i++) + { + copy_ivec(dd_zp1[i], dd_zp[i]); + } + break; + default: + gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition"); + nzone = 0; + nzonep = 0; + } + + zones = &dd->comm->zones; + + for (i = 0; i < nzone; i++) + { + m = 0; + clear_ivec(zones->shift[i]); + for (d = 0; d < dd->ndim; d++) + { + zones->shift[i][dd->dim[d]] = dd_zo[i][m++]; + } + } + + zones->n = nzone; + for (i = 0; i < nzone; i++) + { + for (d = 0; d < DIM; d++) + { + s[d] = dd->ci[d] - zones->shift[i][d]; + if (s[d] < 0) + { + s[d] += dd->nc[d]; + } + else if (s[d] >= dd->nc[d]) + { + s[d] -= dd->nc[d]; + } + } + } + zones->nizone = nzonep; + for (i = 0; i < zones->nizone; i++) + { + if (dd_zp[i][0] != i) + { + gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup"); + } + izone = &zones->izone[i]; + izone->j0 = dd_zp[i][1]; + izone->j1 = dd_zp[i][2]; + for (dim = 0; dim < DIM; dim++) + { + if (dd->nc[dim] == 1) + { + /* All shifts should be allowed */ + izone->shift0[dim] = -1; + izone->shift1[dim] = 1; + } + else + { + /* + izone->shift0[d] = 0; + izone->shift1[d] = 0; + for(j=izone->j0; jj1; j++) { + if (dd->shift[j][d] > dd->shift[i][d]) + izone->shift0[d] = -1; + if (dd->shift[j][d] < dd->shift[i][d]) + izone->shift1[d] = 1; + } + */ + + int shift_diff; + + /* Assume the shift are not more than 1 cell */ + izone->shift0[dim] = 1; + izone->shift1[dim] = -1; + for (j = izone->j0; j < izone->j1; j++) + { + shift_diff = zones->shift[j][dim] - zones->shift[i][dim]; + if (shift_diff < izone->shift0[dim]) + { + izone->shift0[dim] = shift_diff; + } + if (shift_diff > izone->shift1[dim]) + { + izone->shift1[dim] = shift_diff; + } + } + } + } + } + + if (dd->comm->eDLB != edlbNO) + { + snew(dd->comm->root, dd->ndim); + } + + if (dd->comm->bRecordLoad) + { + make_load_communicators(dd); + } +} + +static void make_pp_communicator(FILE *fplog, t_commrec *cr, int reorder) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + int i, rank, *buf; + ivec periods; +#ifdef GMX_MPI + MPI_Comm comm_cart; +#endif + + dd = cr->dd; + comm = dd->comm; + +#ifdef GMX_MPI + if (comm->bCartesianPP) + { + /* Set up cartesian communication for the particle-particle part */ + if (fplog) + { + fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n", + dd->nc[XX], dd->nc[YY], dd->nc[ZZ]); + } + + for (i = 0; i < DIM; i++) + { + periods[i] = TRUE; + } + MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder, + &comm_cart); + /* We overwrite the old communicator with the new cartesian one */ + cr->mpi_comm_mygroup = comm_cart; + } + + dd->mpi_comm_all = cr->mpi_comm_mygroup; + MPI_Comm_rank(dd->mpi_comm_all, &dd->rank); + + if (comm->bCartesianPP_PME) + { + /* Since we want to use the original cartesian setup for sim, + * and not the one after split, we need to make an index. 
+ */ + snew(comm->ddindex2ddnodeid, dd->nnodes); + comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank; + gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr); + /* Get the rank of the DD master, + * above we made sure that the master node is a PP node. + */ + if (MASTER(cr)) + { + rank = dd->rank; + } + else + { + rank = 0; + } + MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all); + } + else if (comm->bCartesianPP) + { + if (cr->npmenodes == 0) + { + /* The PP communicator is also + * the communicator for this simulation + */ + cr->mpi_comm_mysim = cr->mpi_comm_mygroup; + } + cr->nodeid = dd->rank; + + MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci); + + /* We need to make an index to go from the coordinates + * to the nodeid of this simulation. + */ + snew(comm->ddindex2simnodeid, dd->nnodes); + snew(buf, dd->nnodes); + if (cr->duty & DUTY_PP) + { + buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid; + } + /* Communicate the ddindex to simulation nodeid index */ + MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM, + cr->mpi_comm_mysim); + sfree(buf); + + /* Determine the master coordinates and rank. + * The DD master should be the same node as the master of this sim. + */ + for (i = 0; i < dd->nnodes; i++) + { + if (comm->ddindex2simnodeid[i] == 0) + { + ddindex2xyz(dd->nc, i, dd->master_ci); + MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank); + } + } + if (debug) + { + fprintf(debug, "The master rank is %d\n", dd->masterrank); + } + } + else + { + /* No Cartesian communicators */ + /* We use the rank in dd->comm->all as DD index */ + ddindex2xyz(dd->nc, dd->rank, dd->ci); + /* The simulation master nodeid is 0, so the DD master rank is also 0 */ + dd->masterrank = 0; + clear_ivec(dd->master_ci); + } +#endif + + if (fplog) + { + fprintf(fplog, + "Domain decomposition nodeid %d, coordinates %d %d %d\n\n", + dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + if (debug) + { + fprintf(debug, + "Domain decomposition nodeid %d, coordinates %d %d %d\n\n", + dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } +} + +static void receive_ddindex2simnodeid(t_commrec *cr) +{ + gmx_domdec_t *dd; + + gmx_domdec_comm_t *comm; + int *buf; + + dd = cr->dd; + comm = dd->comm; + +#ifdef GMX_MPI + if (!comm->bCartesianPP_PME && comm->bCartesianPP) + { + snew(comm->ddindex2simnodeid, dd->nnodes); + snew(buf, dd->nnodes); + if (cr->duty & DUTY_PP) + { + buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid; + } +#ifdef GMX_MPI + /* Communicate the ddindex to simulation nodeid index */ + MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM, + cr->mpi_comm_mysim); +#endif + sfree(buf); + } +#endif +} + +static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd, + int ncg, int natoms) +{ + gmx_domdec_master_t *ma; + int i; + + snew(ma, 1); + + snew(ma->ncg, dd->nnodes); + snew(ma->index, dd->nnodes+1); + snew(ma->cg, ncg); + snew(ma->nat, dd->nnodes); + snew(ma->ibuf, dd->nnodes*2); + snew(ma->cell_x, DIM); + for (i = 0; i < DIM; i++) + { + snew(ma->cell_x[i], dd->nc[i]+1); + } + + if (dd->nnodes <= GMX_DD_NNODES_SENDRECV) + { + ma->vbuf = NULL; + } + else + { + snew(ma->vbuf, natoms); + } + + return ma; +} + +static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order, + int reorder) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + int i, rank; + gmx_bool bDiv[DIM]; + ivec periods; +#ifdef GMX_MPI + MPI_Comm comm_cart; +#endif + + dd = cr->dd; + comm = dd->comm; + + if (comm->bCartesianPP) + { + 
for (i = 1; i < DIM; i++) + { + bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0); + } + if (bDiv[YY] || bDiv[ZZ]) + { + comm->bCartesianPP_PME = TRUE; + /* If we have 2D PME decomposition, which is always in x+y, + * we stack the PME only nodes in z. + * Otherwise we choose the direction that provides the thinnest slab + * of PME only nodes as this will have the least effect + * on the PP communication. + * But for the PME communication the opposite might be better. + */ + if (bDiv[ZZ] && (comm->npmenodes_y > 1 || + !bDiv[YY] || + dd->nc[YY] > dd->nc[ZZ])) + { + comm->cartpmedim = ZZ; + } + else + { + comm->cartpmedim = YY; + } + comm->ntot[comm->cartpmedim] + += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes; + } + else if (fplog) + { + fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]); + fprintf(fplog, + "Will not use a Cartesian communicator for PP <-> PME\n\n"); + } + } + +#ifdef GMX_MPI + if (comm->bCartesianPP_PME) + { + if (fplog) + { + fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]); + } + + for (i = 0; i < DIM; i++) + { + periods[i] = TRUE; + } + MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder, + &comm_cart); + + MPI_Comm_rank(comm_cart, &rank); + if (MASTERNODE(cr) && rank != 0) + { + gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this"); + } + + /* With this assigment we loose the link to the original communicator + * which will usually be MPI_COMM_WORLD, unless have multisim. + */ + cr->mpi_comm_mysim = comm_cart; + cr->sim_nodeid = rank; + + MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci); + + if (fplog) + { + fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n", + cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + + if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim]) + { + cr->duty = DUTY_PP; + } + if (cr->npmenodes == 0 || + dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim]) + { + cr->duty = DUTY_PME; + } + + /* Split the sim communicator into PP and PME only nodes */ + MPI_Comm_split(cr->mpi_comm_mysim, + cr->duty, + dd_index(comm->ntot, dd->ci), + &cr->mpi_comm_mygroup); + } + else + { + switch (dd_node_order) + { + case ddnoPP_PME: + if (fplog) + { + fprintf(fplog, "Order of the nodes: PP first, PME last\n"); + } + break; + case ddnoINTERLEAVE: + /* Interleave the PP-only and PME-only nodes, + * as on clusters with dual-core machines this will double + * the communication bandwidth of the PME processes + * and thus speed up the PP <-> PME and inter PME communication. + */ + if (fplog) + { + fprintf(fplog, "Interleaving PP and PME nodes\n"); + } + comm->pmenodes = dd_pmenodes(cr); + break; + case ddnoCARTESIAN: + break; + default: + gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order); + } + + if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1) + { + cr->duty = DUTY_PME; + } + else + { + cr->duty = DUTY_PP; + } + + /* Split the sim communicator into PP and PME only nodes */ + MPI_Comm_split(cr->mpi_comm_mysim, + cr->duty, + cr->nodeid, + &cr->mpi_comm_mygroup); + MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid); + } +#endif + + if (fplog) + { + fprintf(fplog, "This is a %s only node\n\n", + (cr->duty & DUTY_PP) ? 
"particle-particle" : "PME-mesh"); + } +} + +void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + int CartReorder; + + dd = cr->dd; + comm = dd->comm; + + copy_ivec(dd->nc, comm->ntot); + + comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN); + comm->bCartesianPP_PME = FALSE; + + /* Reorder the nodes by default. This might change the MPI ranks. + * Real reordering is only supported on very few architectures, + * Blue Gene is one of them. + */ + CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL); + + if (cr->npmenodes > 0) + { + /* Split the communicator into a PP and PME part */ + split_communicator(fplog, cr, dd_node_order, CartReorder); + if (comm->bCartesianPP_PME) + { + /* We (possibly) reordered the nodes in split_communicator, + * so it is no longer required in make_pp_communicator. + */ + CartReorder = FALSE; + } + } + else + { + /* All nodes do PP and PME */ +#ifdef GMX_MPI + /* We do not require separate communicators */ + cr->mpi_comm_mygroup = cr->mpi_comm_mysim; +#endif + } + + if (cr->duty & DUTY_PP) + { + /* Copy or make a new PP communicator */ + make_pp_communicator(fplog, cr, CartReorder); + } + else + { + receive_ddindex2simnodeid(cr); + } + + if (!(cr->duty & DUTY_PME)) + { + /* Set up the commnuication to our PME node */ + dd->pme_nodeid = dd_simnode2pmenode(cr, cr->sim_nodeid); + dd->pme_receive_vir_ener = receive_vir_ener(cr); + if (debug) + { + fprintf(debug, "My pme_nodeid %d receive ener %d\n", + dd->pme_nodeid, dd->pme_receive_vir_ener); + } + } + else + { + dd->pme_nodeid = -1; + } + + if (DDMASTER(dd)) + { + dd->ma = init_gmx_domdec_master_t(dd, + comm->cgs_gl.nr, + comm->cgs_gl.index[comm->cgs_gl.nr]); + } +} + +static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string) +{ + real *slb_frac, tot; + int i, n; + double dbl; + + slb_frac = NULL; + if (nc > 1 && size_string != NULL) + { + if (fplog) + { + fprintf(fplog, "Using static load balancing for the %s direction\n", + dir); + } + snew(slb_frac, nc); + tot = 0; + for (i = 0; i < nc; i++) + { + dbl = 0; + sscanf(size_string, "%lf%n", &dbl, &n); + if (dbl == 0) + { + gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string); + } + slb_frac[i] = dbl; + size_string += n; + tot += slb_frac[i]; + } + /* Normalize */ + if (fplog) + { + fprintf(fplog, "Relative cell sizes:"); + } + for (i = 0; i < nc; i++) + { + slb_frac[i] /= tot; + if (fplog) + { + fprintf(fplog, " %5.3f", slb_frac[i]); + } + } + if (fplog) + { + fprintf(fplog, "\n"); + } + } + + return slb_frac; +} + +static int multi_body_bondeds_count(gmx_mtop_t *mtop) +{ + int n, nmol, ftype; + gmx_mtop_ilistloop_t iloop; + t_ilist *il; + + n = 0; + iloop = gmx_mtop_ilistloop_init(mtop); + while (gmx_mtop_ilistloop_next(iloop, &il, &nmol)) + { + for (ftype = 0; ftype < F_NRE; ftype++) + { + if ((interaction_function[ftype].flags & IF_BOND) && + NRAL(ftype) > 2) + { + n += nmol*il[ftype].nr/(1 + NRAL(ftype)); + } + } + } + + return n; +} + +static int dd_nst_env(FILE *fplog, const char *env_var, int def) +{ + char *val; + int nst; + + nst = def; + val = getenv(env_var); + if (val) + { + if (sscanf(val, "%d", &nst) <= 0) + { + nst = 1; + } + if (fplog) + { + fprintf(fplog, "Found env.var. 
%s = %s, using value %d\n", + env_var, val, nst); + } + } + + return nst; +} + +static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string) +{ + if (MASTER(cr)) + { + fprintf(stderr, "\n%s\n", warn_string); + } + if (fplog) + { + fprintf(fplog, "\n%s\n", warn_string); + } +} + +static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd, + t_inputrec *ir, FILE *fplog) +{ + if (ir->ePBC == epbcSCREW && + (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1)) + { + gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]); + } + + if (ir->ns_type == ensSIMPLE) + { + gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition"); + } + + if (ir->nstlist == 0) + { + gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0"); + } + + if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE) + { + dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary"); + } +} + +static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox) +{ + int di, d; + real r; + + r = ddbox->box_size[XX]; + for (di = 0; di < dd->ndim; di++) + { + d = dd->dim[di]; + /* Check using the initial average cell size */ + r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]); + } + + return r; +} + +static int check_dlb_support(FILE *fplog, t_commrec *cr, + const char *dlb_opt, gmx_bool bRecordLoad, + unsigned long Flags, t_inputrec *ir) +{ + gmx_domdec_t *dd; + int eDLB = -1; + char buf[STRLEN]; + + switch (dlb_opt[0]) + { + case 'a': eDLB = edlbAUTO; break; + case 'n': eDLB = edlbNO; break; + case 'y': eDLB = edlbYES; break; + default: gmx_incons("Unknown dlb_opt"); + } + + if (Flags & MD_RERUN) + { + return edlbNO; + } + + if (!EI_DYNAMICS(ir->eI)) + { + if (eDLB == edlbYES) + { + sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI)); + dd_warning(cr, fplog, buf); + } + + return edlbNO; + } + + if (!bRecordLoad) + { + dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n"); + + return edlbNO; + } + + if (Flags & MD_REPRODUCIBLE) + { + switch (eDLB) + { + case edlbNO: + break; + case edlbAUTO: + dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n"); + eDLB = edlbNO; + break; + case edlbYES: + dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n"); + break; + default: + gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB); + break; + } + } + + return eDLB; +} + +static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd) +{ + int dim; + + dd->ndim = 0; + if (getenv("GMX_DD_ORDER_ZYX") != NULL) + { + /* Decomposition order z,y,x */ + if (fplog) + { + fprintf(fplog, "Using domain decomposition order z, y, x\n"); + } + for (dim = DIM-1; dim >= 0; dim--) + { + if (dd->nc[dim] > 1) + { + dd->dim[dd->ndim++] = dim; + } + } + } + else + { + /* Decomposition order x,y,z */ + for (dim = 0; dim < DIM; dim++) + { + if (dd->nc[dim] > 1) + { + dd->dim[dd->ndim++] = dim; + } + } + } +} + +static gmx_domdec_comm_t *init_dd_comm() +{ + gmx_domdec_comm_t *comm; + int i; + + snew(comm, 1); + snew(comm->cggl_flag, DIM*2); + snew(comm->cgcm_state, DIM*2); + for (i = 0; i < DIM*2; i++) + { + comm->cggl_flag_nalloc[i] = 0; + 
comm->cgcm_state_nalloc[i] = 0; + } + + comm->nalloc_int = 0; + comm->buf_int = NULL; + + vec_rvec_init(&comm->vbuf); + + comm->n_load_have = 0; + comm->n_load_collect = 0; + + for (i = 0; i < ddnatNR-ddnatZONE; i++) + { + comm->sum_nat[i] = 0; + } + comm->ndecomp = 0; + comm->nload = 0; + comm->load_step = 0; + comm->load_sum = 0; + comm->load_max = 0; + clear_ivec(comm->load_lim); + comm->load_mdf = 0; + comm->load_pme = 0; + + return comm; +} + +gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, + unsigned long Flags, + ivec nc, + real comm_distance_min, real rconstr, + const char *dlb_opt, real dlb_scale, + const char *sizex, const char *sizey, const char *sizez, + gmx_mtop_t *mtop, t_inputrec *ir, + matrix box, rvec *x, + gmx_ddbox_t *ddbox, + int *npme_x, int *npme_y) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + int recload; + int d, i, j; + real r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs; + gmx_bool bC; + char buf[STRLEN]; + + if (fplog) + { + fprintf(fplog, + "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes); + } + + snew(dd, 1); + + dd->comm = init_dd_comm(); + comm = dd->comm; + snew(comm->cggl_flag, DIM*2); + snew(comm->cgcm_state, DIM*2); + + dd->npbcdim = ePBC2npbcdim(ir->ePBC); + dd->bScrewPBC = (ir->ePBC == epbcSCREW); + + dd->bSendRecv2 = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0); + comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10); + comm->eFlop = dd_nst_env(fplog, "GMX_DLB_FLOP", 0); + recload = dd_nst_env(fplog, "GMX_DD_LOAD", 1); + comm->nstSortCG = dd_nst_env(fplog, "GMX_DD_SORT", 1); + comm->nstDDDump = dd_nst_env(fplog, "GMX_DD_DUMP", 0); + comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0); + comm->DD_debug = dd_nst_env(fplog, "GMX_DD_DEBUG", 0); + + dd->pme_recv_f_alloc = 0; + dd->pme_recv_f_buf = NULL; + + if (dd->bSendRecv2 && fplog) + { + fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n"); + } + if (comm->eFlop) + { + if (fplog) + { + fprintf(fplog, "Will load balance based on FLOP count\n"); + } + if (comm->eFlop > 1) + { + srand(1+cr->nodeid); + } + comm->bRecordLoad = TRUE; + } + else + { + comm->bRecordLoad = (wallcycle_have_counter() && recload > 0); + + } + + comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir); + + comm->bDynLoadBal = (comm->eDLB == edlbYES); + if (fplog) + { + fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]); + } + dd->bGridJump = comm->bDynLoadBal; + comm->bPMELoadBalDLBLimits = FALSE; + + if (comm->nstSortCG) + { + if (fplog) + { + if (comm->nstSortCG == 1) + { + fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n"); + } + else + { + fprintf(fplog, "Will sort the charge groups every %d steps\n", + comm->nstSortCG); + } + } + snew(comm->sort, 1); + } + else + { + if (fplog) + { + fprintf(fplog, "Will not sort the charge groups\n"); + } + } + + comm->bCGs = (ncg_mtop(mtop) < mtop->natoms); + + comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr); + if (comm->bInterCGBondeds) + { + comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0); + } + else + { + comm->bInterCGMultiBody = FALSE; + } + + dd->bInterCGcons = inter_charge_group_constraints(mtop); + dd->bInterCGsettles = inter_charge_group_settles(mtop); + + if (ir->rlistlong == 0) + { + /* Set the cut-off to some very large value, + * so we don't need if statements everywhere in the code. 
+ * We use sqrt, since the cut-off is squared in some places. + */ + comm->cutoff = GMX_CUTOFF_INF; + } + else + { + comm->cutoff = ir->rlistlong; + } + comm->cutoff_mbody = 0; + + comm->cellsize_limit = 0; + comm->bBondComm = FALSE; + + if (comm->bInterCGBondeds) + { + if (comm_distance_min > 0) + { + comm->cutoff_mbody = comm_distance_min; + if (Flags & MD_DDBONDCOMM) + { + comm->bBondComm = (comm->cutoff_mbody > comm->cutoff); + } + else + { + comm->cutoff = max(comm->cutoff, comm->cutoff_mbody); + } + r_bonded_limit = comm->cutoff_mbody; + } + else if (ir->bPeriodicMols) + { + /* Can not easily determine the required cut-off */ + dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n"); + comm->cutoff_mbody = comm->cutoff/2; + r_bonded_limit = comm->cutoff_mbody; + } + else + { + if (MASTER(cr)) + { + dd_bonded_cg_distance(fplog, mtop, ir, x, box, + Flags & MD_DDBONDCHECK, &r_2b, &r_mb); + } + gmx_bcast(sizeof(r_2b), &r_2b, cr); + gmx_bcast(sizeof(r_mb), &r_mb, cr); + + /* We use an initial margin of 10% for the minimum cell size, + * except when we are just below the non-bonded cut-off. + */ + if (Flags & MD_DDBONDCOMM) + { + if (max(r_2b, r_mb) > comm->cutoff) + { + r_bonded = max(r_2b, r_mb); + r_bonded_limit = 1.1*r_bonded; + comm->bBondComm = TRUE; + } + else + { + r_bonded = r_mb; + r_bonded_limit = min(1.1*r_bonded, comm->cutoff); + } + /* We determine cutoff_mbody later */ + } + else + { + /* No special bonded communication, + * simply increase the DD cut-off. + */ + r_bonded_limit = 1.1*max(r_2b, r_mb); + comm->cutoff_mbody = r_bonded_limit; + comm->cutoff = max(comm->cutoff, comm->cutoff_mbody); + } + } + comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit); + if (fplog) + { + fprintf(fplog, + "Minimum cell size due to bonded interactions: %.3f nm\n", + comm->cellsize_limit); + } + } + + if (dd->bInterCGcons && rconstr <= 0) + { + /* There is a cell size limit due to the constraints (P-LINCS) */ + rconstr = constr_r_max(fplog, mtop, ir); + if (fplog) + { + fprintf(fplog, + "Estimated maximum distance required for P-LINCS: %.3f nm\n", + rconstr); + if (rconstr > comm->cellsize_limit) + { + fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n"); + } + } + } + else if (rconstr > 0 && fplog) + { + /* Here we do not check for dd->bInterCGcons, + * because one can also set a cell size limit for virtual sites only + * and at this point we don't know yet if there are intercg v-sites. 
+ */ + fprintf(fplog, + "User supplied maximum distance required for P-LINCS: %.3f nm\n", + rconstr); + } + comm->cellsize_limit = max(comm->cellsize_limit, rconstr); + + comm->cgs_gl = gmx_mtop_global_cgs(mtop); + + if (nc[XX] > 0) + { + copy_ivec(nc, dd->nc); + set_dd_dim(fplog, dd); + set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox); + + if (cr->npmenodes == -1) + { + cr->npmenodes = 0; + } + acs = average_cellsize_min(dd, ddbox); + if (acs < comm->cellsize_limit) + { + if (fplog) + { + fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit); + } + gmx_fatal_collective(FARGS, cr, NULL, + "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details", + acs, comm->cellsize_limit); + } + } + else + { + set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox); + + /* We need to choose the optimal DD grid and possibly PME nodes */ + limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox, + comm->eDLB != edlbNO, dlb_scale, + comm->cellsize_limit, comm->cutoff, + comm->bInterCGBondeds); + + if (dd->nc[XX] == 0) + { + bC = (dd->bInterCGcons && rconstr > r_bonded_limit); + sprintf(buf, "Change the number of nodes or mdrun option %s%s%s", + !bC ? "-rdd" : "-rcon", + comm->eDLB != edlbNO ? " or -dds" : "", + bC ? " or your LINCS settings" : ""); + + gmx_fatal_collective(FARGS, cr, NULL, + "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n" + "%s\n" + "Look in the log file for details on the domain decomposition", + cr->nnodes-cr->npmenodes, limit, buf); + } + set_dd_dim(fplog, dd); + } + + if (fplog) + { + fprintf(fplog, + "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n", + dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes); + } + + dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ]; + if (cr->nnodes - dd->nnodes != cr->npmenodes) + { + gmx_fatal_collective(FARGS, cr, NULL, + "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d", + dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes); + } + if (cr->npmenodes > dd->nnodes) + { + gmx_fatal_collective(FARGS, cr, NULL, + "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes); + } + if (cr->npmenodes > 0) + { + comm->npmenodes = cr->npmenodes; + } + else + { + comm->npmenodes = dd->nnodes; + } + + if (EEL_PME(ir->coulombtype)) + { + /* The following choices should match those + * in comm_cost_est in domdec_setup.c. + * Note that here the checks have to take into account + * that the decomposition might occur in a different order than xyz + * (for instance through the env.var. GMX_DD_ORDER_ZYX), + * in which case they will not match those in comm_cost_est, + * but since that is mainly for testing purposes that's fine. + */ + if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY && + comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 && + getenv("GMX_PMEONEDD") == NULL) + { + comm->npmedecompdim = 2; + comm->npmenodes_x = dd->nc[XX]; + comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x; + } + else + { + /* In case nc is 1 in both x and y we could still choose to + * decompose pme in y instead of x, but we use x for simplicity. 
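+             * As a purely illustrative example (numbers assumed, not taken from
+             * this code): with a 4x4x2 PP grid and 8 separate PME nodes the 2D
+             * test above holds (8 > 4 and 8 % 4 == 0), giving a 4x2 PME grid,
+             * while with 4 PME nodes that test fails and this 1D branch places
+             * all 4 PME nodes along x.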
+             */
+            comm->npmedecompdim = 1;
+            if (dd->dim[0] == YY)
+            {
+                comm->npmenodes_x = 1;
+                comm->npmenodes_y = comm->npmenodes;
+            }
+            else
+            {
+                comm->npmenodes_x = comm->npmenodes;
+                comm->npmenodes_y = 1;
+            }
+        }
+        if (fplog)
+        {
+            fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
+                    comm->npmenodes_x, comm->npmenodes_y, 1);
+        }
+    }
+    else
+    {
+        comm->npmedecompdim = 0;
+        comm->npmenodes_x   = 0;
+        comm->npmenodes_y   = 0;
+    }
+
+    /* Technically we don't need both of these,
+     * but it simplifies code not having to recalculate it.
+     */
+    *npme_x = comm->npmenodes_x;
+    *npme_y = comm->npmenodes_y;
+
+    snew(comm->slb_frac, DIM);
+    if (comm->eDLB == edlbNO)
+    {
+        comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
+        comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
+        comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
+    }
+
+    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
+    {
+        if (comm->bBondComm || comm->eDLB != edlbNO)
+        {
+            /* Set the bonded communication distance to halfway between
+             * the minimum and the maximum,
+             * since the extra communication cost is nearly zero.
+             */
+            acs                = average_cellsize_min(dd, ddbox);
+            comm->cutoff_mbody = 0.5*(r_bonded + acs);
+            if (comm->eDLB != edlbNO)
+            {
+                /* Check if this does not limit the scaling */
+                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
+            }
+            if (!comm->bBondComm)
+            {
+                /* Without bBondComm do not go beyond the n.b. cut-off */
+                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
+                if (comm->cellsize_limit >= comm->cutoff)
+                {
+                    /* We don't lose a lot of efficiency
+                     * when increasing it to the n.b. cut-off.
+                     * It can even be slightly faster, because we need
+                     * fewer checks for the communication setup.
+                     */
+                    comm->cutoff_mbody = comm->cutoff;
+                }
+            }
+            /* Check if we did not end up below our original limit */
+            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
+
+            if (comm->cutoff_mbody > comm->cellsize_limit)
+            {
+                comm->cellsize_limit = comm->cutoff_mbody;
+            }
+        }
+        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
+    }
+
+    if (debug)
+    {
+        fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
+                "cellsize limit %f\n",
+                comm->bBondComm, comm->cellsize_limit);
+    }
+
+    if (MASTER(cr))
+    {
+        check_dd_restrictions(cr, dd, ir, fplog);
+    }
+
+    comm->partition_step = INT_MIN;
+    dd->ddp_count        = 0;
+
+    clear_dd_cycle_counts(dd);
+
+    return dd;
+}
+
+static void set_dlb_limits(gmx_domdec_t *dd)
+{
+    int d;
+
+    for (d = 0; d < dd->ndim; d++)
+    {
+        dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
+        dd->comm->cellsize_min[dd->dim[d]] =
+            dd->comm->cellsize_min_dlb[dd->dim[d]];
+    }
+}
+
+
+static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
+{
+    gmx_domdec_t      *dd;
+    gmx_domdec_comm_t *comm;
+    real               cellsize_min;
+    int                d, nc, i;
+    char               buf[STRLEN];
+
+    dd   = cr->dd;
+    comm = dd->comm;
+
+    if (fplog)
+    {
+        fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
+    }
+
+    cellsize_min = comm->cellsize_min[dd->dim[0]];
+    for (d = 1; d < dd->ndim; d++)
+    {
+        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
+    }
+
+    if (cellsize_min < comm->cellsize_limit*1.05)
+    {
+        dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
+
+        /* Change DLB from "auto" to "no".
*/ + comm->eDLB = edlbNO; + + return; + } + + dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n"); + comm->bDynLoadBal = TRUE; + dd->bGridJump = TRUE; + + set_dlb_limits(dd); + + /* We can set the required cell size info here, + * so we do not need to communicate this. + * The grid is completely uniform. + */ + for (d = 0; d < dd->ndim; d++) + { + if (comm->root[d]) + { + comm->load[d].sum_m = comm->load[d].sum; + + nc = dd->nc[dd->dim[d]]; + for (i = 0; i < nc; i++) + { + comm->root[d]->cell_f[i] = i/(real)nc; + if (d > 0) + { + comm->root[d]->cell_f_max0[i] = i /(real)nc; + comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc; + } + } + comm->root[d]->cell_f[nc] = 1.0; + } + } +} + +static char *init_bLocalCG(gmx_mtop_t *mtop) +{ + int ncg, cg; + char *bLocalCG; + + ncg = ncg_mtop(mtop); + snew(bLocalCG, ncg); + for (cg = 0; cg < ncg; cg++) + { + bLocalCG[cg] = FALSE; + } + + return bLocalCG; +} + +void dd_init_bondeds(FILE *fplog, + gmx_domdec_t *dd, gmx_mtop_t *mtop, + gmx_vsite_t *vsite, + t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb) +{ + gmx_domdec_comm_t *comm; + gmx_bool bBondComm; + int d; + + dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck); + + comm = dd->comm; + + if (comm->bBondComm) + { + /* Communicate atoms beyond the cut-off for bonded interactions */ + comm = dd->comm; + + comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb); + + comm->bLocalCG = init_bLocalCG(mtop); + } + else + { + /* Only communicate atoms based on cut-off */ + comm->cglink = NULL; + comm->bLocalCG = NULL; + } +} + +static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd, + t_inputrec *ir, + gmx_bool bDynLoadBal, real dlb_scale, + gmx_ddbox_t *ddbox) +{ + gmx_domdec_comm_t *comm; + int d; + ivec np; + real limit, shrink; + char buf[64]; + + if (fplog == NULL) + { + return; + } + + comm = dd->comm; + + if (bDynLoadBal) + { + fprintf(fplog, "The maximum number of communication pulses is:"); + for (d = 0; d < dd->ndim; d++) + { + fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb); + } + fprintf(fplog, "\n"); + fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit); + fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale); + fprintf(fplog, "The allowed shrink of domain decomposition cells is:"); + for (d = 0; d < DIM; d++) + { + if (dd->nc[d] > 1) + { + if (d >= ddbox->npbcdim && dd->nc[d] == 2) + { + shrink = 0; + } + else + { + shrink = + comm->cellsize_min_dlb[d]/ + (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]); + } + fprintf(fplog, " %c %.2f", dim2char(d), shrink); + } + } + fprintf(fplog, "\n"); + } + else + { + set_dd_cell_sizes_slb(dd, ddbox, FALSE, np); + fprintf(fplog, "The initial number of communication pulses is:"); + for (d = 0; d < dd->ndim; d++) + { + fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]); + } + fprintf(fplog, "\n"); + fprintf(fplog, "The initial domain decomposition cell size is:"); + for (d = 0; d < DIM; d++) + { + if (dd->nc[d] > 1) + { + fprintf(fplog, " %c %.2f nm", + dim2char(d), dd->comm->cellsize_min[d]); + } + } + fprintf(fplog, "\n\n"); + } + + if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm) + { + fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n"); + fprintf(fplog, "%40s %-7s %6.3f nm\n", + "non-bonded interactions", "", comm->cutoff); + + if (bDynLoadBal) + { + limit = dd->comm->cellsize_limit; + } + else + { + if (dynamic_dd_box(ddbox, ir)) + { + 
fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n"); + } + limit = dd->comm->cellsize_min[XX]; + for (d = 1; d < DIM; d++) + { + limit = min(limit, dd->comm->cellsize_min[d]); + } + } + + if (comm->bInterCGBondeds) + { + fprintf(fplog, "%40s %-7s %6.3f nm\n", + "two-body bonded interactions", "(-rdd)", + max(comm->cutoff, comm->cutoff_mbody)); + fprintf(fplog, "%40s %-7s %6.3f nm\n", + "multi-body bonded interactions", "(-rdd)", + (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit)); + } + if (dd->vsite_comm) + { + fprintf(fplog, "%40s %-7s %6.3f nm\n", + "virtual site constructions", "(-rcon)", limit); + } + if (dd->constraint_comm) + { + sprintf(buf, "atoms separated by up to %d constraints", + 1+ir->nProjOrder); + fprintf(fplog, "%40s %-7s %6.3f nm\n", + buf, "(-rcon)", limit); + } + fprintf(fplog, "\n"); + } + + fflush(fplog); +} + +static void set_cell_limits_dlb(gmx_domdec_t *dd, + real dlb_scale, + const t_inputrec *ir, + const gmx_ddbox_t *ddbox) +{ + gmx_domdec_comm_t *comm; + int d, dim, npulse, npulse_d_max, npulse_d; + gmx_bool bNoCutOff; + + comm = dd->comm; + + bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0); + + /* Determine the maximum number of comm. pulses in one dimension */ + + comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody); + + /* Determine the maximum required number of grid pulses */ + if (comm->cellsize_limit >= comm->cutoff) + { + /* Only a single pulse is required */ + npulse = 1; + } + else if (!bNoCutOff && comm->cellsize_limit > 0) + { + /* We round down slightly here to avoid overhead due to the latency + * of extra communication calls when the cut-off + * would be only slightly longer than the cell size. + * Later cellsize_limit is redetermined, + * so we can not miss interactions due to this rounding. 
+ */ + npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit); + } + else + { + /* There is no cell size limit */ + npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1)); + } + + if (!bNoCutOff && npulse > 1) + { + /* See if we can do with less pulses, based on dlb_scale */ + npulse_d_max = 0; + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff + /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale)); + npulse_d_max = max(npulse_d_max, npulse_d); + } + npulse = min(npulse, npulse_d_max); + } + + /* This env var can override npulse */ + d = dd_nst_env(debug, "GMX_DD_NPULSE", 0); + if (d > 0) + { + npulse = d; + } + + comm->maxpulse = 1; + comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE); + for (d = 0; d < dd->ndim; d++) + { + comm->cd[d].np_dlb = min(npulse, dd->nc[dd->dim[d]]-1); + comm->cd[d].np_nalloc = comm->cd[d].np_dlb; + snew(comm->cd[d].ind, comm->cd[d].np_nalloc); + comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb); + if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1) + { + comm->bVacDLBNoLimit = FALSE; + } + } + + /* cellsize_limit is set for LINCS in init_domain_decomposition */ + if (!comm->bVacDLBNoLimit) + { + comm->cellsize_limit = max(comm->cellsize_limit, + comm->cutoff/comm->maxpulse); + } + comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody); + /* Set the minimum cell size for each DD dimension */ + for (d = 0; d < dd->ndim; d++) + { + if (comm->bVacDLBNoLimit || + comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff) + { + comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit; + } + else + { + comm->cellsize_min_dlb[dd->dim[d]] = + comm->cutoff/comm->cd[d].np_dlb; + } + } + if (comm->cutoff_mbody <= 0) + { + comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit); + } + if (comm->bDynLoadBal) + { + set_dlb_limits(dd); + } +} + +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC) +{ + /* If each molecule is a single charge group + * or we use domain decomposition for each periodic dimension, + * we do not need to take pbc into account for the bonded interactions. + */ + return (ePBC != epbcNONE && dd->comm->bInterCGBondeds && + !(dd->nc[XX] > 1 && + dd->nc[YY] > 1 && + (dd->nc[ZZ] > 1 || ePBC == epbcXY))); +} + +void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale, + t_inputrec *ir, gmx_ddbox_t *ddbox) +{ + gmx_domdec_comm_t *comm; + int natoms_tot; + real vol_frac; + + comm = dd->comm; + + /* Initialize the thread data. + * This can not be done in init_domain_decomposition, + * as the numbers of threads is determined later. 
+ */ + comm->nth = gmx_omp_nthreads_get(emntDomdec); + if (comm->nth > 1) + { + snew(comm->dth, comm->nth); + } + + if (EEL_PME(ir->coulombtype)) + { + init_ddpme(dd, &comm->ddpme[0], 0); + if (comm->npmedecompdim >= 2) + { + init_ddpme(dd, &comm->ddpme[1], 1); + } + } + else + { + comm->npmenodes = 0; + if (dd->pme_nodeid >= 0) + { + gmx_fatal_collective(FARGS, NULL, dd, + "Can not have separate PME nodes without PME electrostatics"); + } + } + + if (debug) + { + fprintf(debug, "The DD cut-off is %f\n", comm->cutoff); + } + if (comm->eDLB != edlbNO) + { + set_cell_limits_dlb(dd, dlb_scale, ir, ddbox); + } + + print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox); + if (comm->eDLB == edlbAUTO) + { + if (fplog) + { + fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n"); + } + print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox); + } + + if (ir->ePBC == epbcNONE) + { + vol_frac = 1 - 1/(double)dd->nnodes; + } + else + { + vol_frac = + (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes; + } + if (debug) + { + fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac); + } + natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr]; + + dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot); +} + +static gmx_bool test_dd_cutoff(t_commrec *cr, + t_state *state, t_inputrec *ir, + real cutoff_req) +{ + gmx_domdec_t *dd; + gmx_ddbox_t ddbox; + int d, dim, np; + real inv_cell_size; + int LocallyLimited; + + dd = cr->dd; + + set_ddbox(dd, FALSE, cr, ir, state->box, + TRUE, &dd->comm->cgs_gl, state->x, &ddbox); + + LocallyLimited = 0; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + + inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim]; + if (dynamic_dd_box(&ddbox, ir)) + { + inv_cell_size *= DD_PRES_SCALE_MARGIN; + } + + np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]); + + if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim && + dd->comm->cd[d].np_dlb > 0) + { + if (np > dd->comm->cd[d].np_dlb) + { + return FALSE; + } + + /* If a current local cell size is smaller than the requested + * cut-off, we could still fix it, but this gets very complicated. + * Without fixing here, we might actually need more checks. + */ + if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req) + { + LocallyLimited = 1; + } + } + } + + if (dd->comm->eDLB != edlbNO) + { + /* If DLB is not active yet, we don't need to check the grid jumps. + * Actually we shouldn't, because then the grid jump data is not set. 
+ */ + if (dd->comm->bDynLoadBal && + check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE)) + { + LocallyLimited = 1; + } + + gmx_sumi(1, &LocallyLimited, cr); + + if (LocallyLimited > 0) + { + return FALSE; + } + } + + return TRUE; +} + +gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir, + real cutoff_req) +{ + gmx_bool bCutoffAllowed; + + bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req); + + if (bCutoffAllowed) + { + cr->dd->comm->cutoff = cutoff_req; + } + + return bCutoffAllowed; +} + +void change_dd_dlb_cutoff_limit(t_commrec *cr) +{ + gmx_domdec_comm_t *comm; + + comm = cr->dd->comm; + + /* Turn on the DLB limiting (might have been on already) */ + comm->bPMELoadBalDLBLimits = TRUE; + + /* Change the cut-off limit */ + comm->PMELoadBal_max_cutoff = comm->cutoff; +} + +static void merge_cg_buffers(int ncell, + gmx_domdec_comm_dim_t *cd, int pulse, + int *ncg_cell, + int *index_gl, int *recv_i, + rvec *cg_cm, rvec *recv_vr, + int *cgindex, + cginfo_mb_t *cginfo_mb, int *cginfo) +{ + gmx_domdec_ind_t *ind, *ind_p; + int p, cell, c, cg, cg0, cg1, cg_gl, nat; + int shift, shift_at; + + ind = &cd->ind[pulse]; + + /* First correct the already stored data */ + shift = ind->nrecv[ncell]; + for (cell = ncell-1; cell >= 0; cell--) + { + shift -= ind->nrecv[cell]; + if (shift > 0) + { + /* Move the cg's present from previous grid pulses */ + cg0 = ncg_cell[ncell+cell]; + cg1 = ncg_cell[ncell+cell+1]; + cgindex[cg1+shift] = cgindex[cg1]; + for (cg = cg1-1; cg >= cg0; cg--) + { + index_gl[cg+shift] = index_gl[cg]; + copy_rvec(cg_cm[cg], cg_cm[cg+shift]); + cgindex[cg+shift] = cgindex[cg]; + cginfo[cg+shift] = cginfo[cg]; + } + /* Correct the already stored send indices for the shift */ + for (p = 1; p <= pulse; p++) + { + ind_p = &cd->ind[p]; + cg0 = 0; + for (c = 0; c < cell; c++) + { + cg0 += ind_p->nsend[c]; + } + cg1 = cg0 + ind_p->nsend[cell]; + for (cg = cg0; cg < cg1; cg++) + { + ind_p->index[cg] += shift; + } + } + } + } + + /* Merge in the communicated buffers */ + shift = 0; + shift_at = 0; + cg0 = 0; + for (cell = 0; cell < ncell; cell++) + { + cg1 = ncg_cell[ncell+cell+1] + shift; + if (shift_at > 0) + { + /* Correct the old cg indices */ + for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++) + { + cgindex[cg+1] += shift_at; + } + } + for (cg = 0; cg < ind->nrecv[cell]; cg++) + { + /* Copy this charge group from the buffer */ + index_gl[cg1] = recv_i[cg0]; + copy_rvec(recv_vr[cg0], cg_cm[cg1]); + /* Add it to the cgindex */ + cg_gl = index_gl[cg1]; + cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl); + nat = GET_CGINFO_NATOMS(cginfo[cg1]); + cgindex[cg1+1] = cgindex[cg1] + nat; + cg0++; + cg1++; + shift_at += nat; + } + shift += ind->nrecv[cell]; + ncg_cell[ncell+cell+1] = cg1; + } +} + +static void make_cell2at_index(gmx_domdec_comm_dim_t *cd, + int nzone, int cg0, const int *cgindex) +{ + int cg, zone, p; + + /* Store the atom block boundaries for easy copying of communication buffers + */ + cg = cg0; + for (zone = 0; zone < nzone; zone++) + { + for (p = 0; p < cd->np; p++) + { + cd->ind[p].cell2at0[zone] = cgindex[cg]; + cg += cd->ind[p].nrecv[zone]; + cd->ind[p].cell2at1[zone] = cgindex[cg]; + } + } +} + +static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG) +{ + int i; + gmx_bool bMiss; + + bMiss = FALSE; + for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++) + { + if (!bLocalCG[link->a[i]]) + { + bMiss = TRUE; + } + } + + return bMiss; +} + +/* Domain corners for communication, a maximum of 4 i-zones see a j domain */ +typedef struct { + 
real c[DIM][4]; /* the corners for the non-bonded communication */ + real cr0; /* corner for rounding */ + real cr1[4]; /* corners for rounding */ + real bc[DIM]; /* corners for bounded communication */ + real bcr1; /* corner for rounding for bonded communication */ +} dd_corners_t; + +/* Determine the corners of the domain(s) we are communicating with */ +static void +set_dd_corners(const gmx_domdec_t *dd, + int dim0, int dim1, int dim2, + gmx_bool bDistMB, + dd_corners_t *c) +{ + const gmx_domdec_comm_t *comm; + const gmx_domdec_zones_t *zones; + int i, j; + + comm = dd->comm; + + zones = &comm->zones; + + /* Keep the compiler happy */ + c->cr0 = 0; + c->bcr1 = 0; + + /* The first dimension is equal for all cells */ + c->c[0][0] = comm->cell_x0[dim0]; + if (bDistMB) + { + c->bc[0] = c->c[0][0]; + } + if (dd->ndim >= 2) + { + dim1 = dd->dim[1]; + /* This cell row is only seen from the first row */ + c->c[1][0] = comm->cell_x0[dim1]; + /* All rows can see this row */ + c->c[1][1] = comm->cell_x0[dim1]; + if (dd->bGridJump) + { + c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0); + if (bDistMB) + { + /* For the multi-body distance we need the maximum */ + c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0); + } + } + /* Set the upper-right corner for rounding */ + c->cr0 = comm->cell_x1[dim0]; + + if (dd->ndim >= 3) + { + dim2 = dd->dim[2]; + for (j = 0; j < 4; j++) + { + c->c[2][j] = comm->cell_x0[dim2]; + } + if (dd->bGridJump) + { + /* Use the maximum of the i-cells that see a j-cell */ + for (i = 0; i < zones->nizone; i++) + { + for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++) + { + if (j >= 4) + { + c->c[2][j-4] = + max(c->c[2][j-4], + comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0); + } + } + } + if (bDistMB) + { + /* For the multi-body distance we need the maximum */ + c->bc[2] = comm->cell_x0[dim2]; + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0); + } + } + } + } + + /* Set the upper-right corner for rounding */ + /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1) + * Only cell (0,0,0) can see cell 7 (1,1,1) + */ + c->cr1[0] = comm->cell_x1[dim1]; + c->cr1[3] = comm->cell_x1[dim1]; + if (dd->bGridJump) + { + c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1); + if (bDistMB) + { + /* For the multi-body distance we need the maximum */ + c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1); + } + } + } + } +} + +/* Determine which cg's we need to send in this pulse from this zone */ +static void +get_zone_pulse_cgs(gmx_domdec_t *dd, + int zonei, int zone, + int cg0, int cg1, + const int *index_gl, + const int *cgindex, + int dim, int dim_ind, + int dim0, int dim1, int dim2, + real r_comm2, real r_bcomm2, + matrix box, + ivec tric_dist, + rvec *normal, + real skew_fac2_d, real skew_fac_01, + rvec *v_d, rvec *v_0, rvec *v_1, + const dd_corners_t *c, + rvec sf2_round, + gmx_bool bDistBonded, + gmx_bool bBondComm, + gmx_bool bDist2B, + gmx_bool bDistMB, + rvec *cg_cm, + int *cginfo, + gmx_domdec_ind_t *ind, + int **ibuf, int *ibuf_nalloc, + vec_rvec_t *vbuf, + int *nsend_ptr, + int *nat_ptr, + int *nsend_z_ptr) +{ + gmx_domdec_comm_t *comm; + gmx_bool bScrew; + gmx_bool bDistMB_pulse; + int cg, i; + real r2, rb2, r, tric_sh; + rvec rn, rb; + int dimd; + int nsend_z, nsend, nat; + + comm = dd->comm; + + bScrew = (dd->bScrewPBC && dim == XX); + + bDistMB_pulse = (bDistMB && bDistBonded); + + nsend_z = 0; + nsend = *nsend_ptr; + nat = *nat_ptr; + + for (cg = cg0; cg 
< cg1; cg++) + { + r2 = 0; + rb2 = 0; + if (tric_dist[dim_ind] == 0) + { + /* Rectangular direction, easy */ + r = cg_cm[cg][dim] - c->c[dim_ind][zone]; + if (r > 0) + { + r2 += r*r; + } + if (bDistMB_pulse) + { + r = cg_cm[cg][dim] - c->bc[dim_ind]; + if (r > 0) + { + rb2 += r*r; + } + } + /* Rounding gives at most a 16% reduction + * in communicated atoms + */ + if (dim_ind >= 1 && (zonei == 1 || zonei == 2)) + { + r = cg_cm[cg][dim0] - c->cr0; + /* This is the first dimension, so always r >= 0 */ + r2 += r*r; + if (bDistMB_pulse) + { + rb2 += r*r; + } + } + if (dim_ind == 2 && (zonei == 2 || zonei == 3)) + { + r = cg_cm[cg][dim1] - c->cr1[zone]; + if (r > 0) + { + r2 += r*r; + } + if (bDistMB_pulse) + { + r = cg_cm[cg][dim1] - c->bcr1; + if (r > 0) + { + rb2 += r*r; + } + } + } + } + else + { + /* Triclinic direction, more complicated */ + clear_rvec(rn); + clear_rvec(rb); + /* Rounding, conservative as the skew_fac multiplication + * will slightly underestimate the distance. + */ + if (dim_ind >= 1 && (zonei == 1 || zonei == 2)) + { + rn[dim0] = cg_cm[cg][dim0] - c->cr0; + for (i = dim0+1; i < DIM; i++) + { + rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0]; + } + r2 = rn[dim0]*rn[dim0]*sf2_round[dim0]; + if (bDistMB_pulse) + { + rb[dim0] = rn[dim0]; + rb2 = r2; + } + /* Take care that the cell planes along dim0 might not + * be orthogonal to those along dim1 and dim2. + */ + for (i = 1; i <= dim_ind; i++) + { + dimd = dd->dim[i]; + if (normal[dim0][dimd] > 0) + { + rn[dimd] -= rn[dim0]*normal[dim0][dimd]; + if (bDistMB_pulse) + { + rb[dimd] -= rb[dim0]*normal[dim0][dimd]; + } + } + } + } + if (dim_ind == 2 && (zonei == 2 || zonei == 3)) + { + rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone]; + tric_sh = 0; + for (i = dim1+1; i < DIM; i++) + { + tric_sh -= cg_cm[cg][i]*v_1[i][dim1]; + } + rn[dim1] += tric_sh; + if (rn[dim1] > 0) + { + r2 += rn[dim1]*rn[dim1]*sf2_round[dim1]; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. + */ + r2 -= rn[dim0]*rn[dim1]*skew_fac_01; + /* Take care that the cell planes along dim1 + * might not be orthogonal to that along dim2. + */ + if (normal[dim1][dim2] > 0) + { + rn[dim2] -= rn[dim1]*normal[dim1][dim2]; + } + } + if (bDistMB_pulse) + { + rb[dim1] += + cg_cm[cg][dim1] - c->bcr1 + tric_sh; + if (rb[dim1] > 0) + { + rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1]; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. + */ + rb2 -= rb[dim0]*rb[dim1]*skew_fac_01; + /* Take care that the cell planes along dim1 + * might not be orthogonal to that along dim2. + */ + if (normal[dim1][dim2] > 0) + { + rb[dim2] -= rb[dim1]*normal[dim1][dim2]; + } + } + } + } + /* The distance along the communication direction */ + rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone]; + tric_sh = 0; + for (i = dim+1; i < DIM; i++) + { + tric_sh -= cg_cm[cg][i]*v_d[i][dim]; + } + rn[dim] += tric_sh; + if (rn[dim] > 0) + { + r2 += rn[dim]*rn[dim]*skew_fac2_d; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. + */ + if (dim_ind == 1 && zonei == 1) + { + r2 -= rn[dim0]*rn[dim]*skew_fac_01; + } + } + if (bDistMB_pulse) + { + clear_rvec(rb); + rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh; + if (rb[dim] > 0) + { + rb2 += rb[dim]*rb[dim]*skew_fac2_d; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. 
+ */ + if (dim_ind == 1 && zonei == 1) + { + rb2 -= rb[dim0]*rb[dim]*skew_fac_01; + } + } + } + } + + if (r2 < r_comm2 || + (bDistBonded && + ((bDistMB && rb2 < r_bcomm2) || + (bDist2B && r2 < r_bcomm2)) && + (!bBondComm || + (GET_CGINFO_BOND_INTER(cginfo[cg]) && + missing_link(comm->cglink, index_gl[cg], + comm->bLocalCG))))) + { + /* Make an index to the local charge groups */ + if (nsend+1 > ind->nalloc) + { + ind->nalloc = over_alloc_large(nsend+1); + srenew(ind->index, ind->nalloc); + } + if (nsend+1 > *ibuf_nalloc) + { + *ibuf_nalloc = over_alloc_large(nsend+1); + srenew(*ibuf, *ibuf_nalloc); + } + ind->index[nsend] = cg; + (*ibuf)[nsend] = index_gl[cg]; + nsend_z++; + vec_rvec_check_alloc(vbuf, nsend+1); + + if (dd->ci[dim] == 0) + { + /* Correct cg_cm for pbc */ + rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]); + if (bScrew) + { + vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY]; + vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ]; + } + } + else + { + copy_rvec(cg_cm[cg], vbuf->v[nsend]); + } + nsend++; + nat += cgindex[cg+1] - cgindex[cg]; + } + } + + *nsend_ptr = nsend; + *nat_ptr = nat; + *nsend_z_ptr = nsend_z; +} + +static void setup_dd_communication(gmx_domdec_t *dd, + matrix box, gmx_ddbox_t *ddbox, + t_forcerec *fr, t_state *state, rvec **f) +{ + int dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot; + int nzone, nzone_send, zone, zonei, cg0, cg1; + int c, i, j, cg, cg_gl, nrcg; + int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i; + gmx_domdec_comm_t *comm; + gmx_domdec_zones_t *zones; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + cginfo_mb_t *cginfo_mb; + gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded; + real r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg; + dd_corners_t corners; + ivec tric_dist; + rvec *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr; + real skew_fac2_d, skew_fac_01; + rvec sf2_round; + int nsend, nat; + int th; + + if (debug) + { + fprintf(debug, "Setting up DD communication\n"); + } + + comm = dd->comm; + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + cg_cm = fr->cg_cm; + break; + case ecutsVERLET: + cg_cm = state->x; + break; + default: + gmx_incons("unimplemented"); + cg_cm = NULL; + } + + for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++) + { + dim = dd->dim[dim_ind]; + + /* Check if we need to use triclinic distances */ + tric_dist[dim_ind] = 0; + for (i = 0; i <= dim_ind; i++) + { + if (ddbox->tric_dir[dd->dim[i]]) + { + tric_dist[dim_ind] = 1; + } + } + } + + bBondComm = comm->bBondComm; + + /* Do we need to determine extra distances for multi-body bondeds? */ + bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1); + + /* Do we need to determine extra distances for only two-body bondeds? */ + bDist2B = (bBondComm && !bDistMB); + + r_comm2 = sqr(comm->cutoff); + r_bcomm2 = sqr(comm->cutoff_mbody); + + if (debug) + { + fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2)); + } + + zones = &comm->zones; + + dim0 = dd->dim[0]; + dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1); + dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1); + + set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners); + + /* Triclinic stuff */ + normal = ddbox->normal; + skew_fac_01 = 0; + if (dd->ndim >= 2) + { + v_0 = ddbox->v[dim0]; + if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1]) + { + /* Determine the coupling coefficient for the distances + * to the cell planes along dim0 and dim1 through dim2. + * This is required for correct rounding. 
+ */ + skew_fac_01 = + ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1]; + if (debug) + { + fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01); + } + } + } + if (dd->ndim >= 3) + { + v_1 = ddbox->v[dim1]; + } + + zone_cg_range = zones->cg_range; + index_gl = dd->index_gl; + cgindex = dd->cgindex; + cginfo_mb = fr->cginfo_mb; + + zone_cg_range[0] = 0; + zone_cg_range[1] = dd->ncg_home; + comm->zone_ncg1[0] = dd->ncg_home; + pos_cg = dd->ncg_home; + + nat_tot = dd->nat_home; + nzone = 1; + for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++) + { + dim = dd->dim[dim_ind]; + cd = &comm->cd[dim_ind]; + + if (dim >= ddbox->npbcdim && dd->ci[dim] == 0) + { + /* No pbc in this dimension, the first node should not comm. */ + nzone_send = 0; + } + else + { + nzone_send = nzone; + } + + v_d = ddbox->v[dim]; + skew_fac2_d = sqr(ddbox->skew_fac[dim]); + + cd->bInPlace = TRUE; + for (p = 0; p < cd->np; p++) + { + /* Only atoms communicated in the first pulse are used + * for multi-body bonded interactions or for bBondComm. + */ + bDistBonded = ((bDistMB || bDist2B) && p == 0); + + ind = &cd->ind[p]; + nsend = 0; + nat = 0; + for (zone = 0; zone < nzone_send; zone++) + { + if (tric_dist[dim_ind] && dim_ind > 0) + { + /* Determine slightly more optimized skew_fac's + * for rounding. + * This reduces the number of communicated atoms + * by about 10% for 3D DD of rhombic dodecahedra. + */ + for (dimd = 0; dimd < dim; dimd++) + { + sf2_round[dimd] = 1; + if (ddbox->tric_dir[dimd]) + { + for (i = dd->dim[dimd]+1; i < DIM; i++) + { + /* If we are shifted in dimension i + * and the cell plane is tilted forward + * in dimension i, skip this coupling. + */ + if (!(zones->shift[nzone+zone][i] && + ddbox->v[dimd][i][dimd] >= 0)) + { + sf2_round[dimd] += + sqr(ddbox->v[dimd][i][dimd]); + } + } + sf2_round[dimd] = 1/sf2_round[dimd]; + } + } + } + + zonei = zone_perm[dim_ind][zone]; + if (p == 0) + { + /* Here we permutate the zones to obtain a convenient order + * for neighbor searching + */ + cg0 = zone_cg_range[zonei]; + cg1 = zone_cg_range[zonei+1]; + } + else + { + /* Look only at the cg's received in the previous grid pulse + */ + cg1 = zone_cg_range[nzone+zone+1]; + cg0 = cg1 - cd->ind[p-1].nrecv[zone]; + } + +#pragma omp parallel for num_threads(comm->nth) schedule(static) + for (th = 0; th < comm->nth; th++) + { + gmx_domdec_ind_t *ind_p; + int **ibuf_p, *ibuf_nalloc_p; + vec_rvec_t *vbuf_p; + int *nsend_p, *nat_p; + int *nsend_zone_p; + int cg0_th, cg1_th; + + if (th == 0) + { + /* Thread 0 writes in the comm buffers */ + ind_p = ind; + ibuf_p = &comm->buf_int; + ibuf_nalloc_p = &comm->nalloc_int; + vbuf_p = &comm->vbuf; + nsend_p = &nsend; + nat_p = &nat; + nsend_zone_p = &ind->nsend[zone]; + } + else + { + /* Other threads write into temp buffers */ + ind_p = &comm->dth[th].ind; + ibuf_p = &comm->dth[th].ibuf; + ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc; + vbuf_p = &comm->dth[th].vbuf; + nsend_p = &comm->dth[th].nsend; + nat_p = &comm->dth[th].nat; + nsend_zone_p = &comm->dth[th].nsend_zone; + + comm->dth[th].nsend = 0; + comm->dth[th].nat = 0; + comm->dth[th].nsend_zone = 0; + } + + if (comm->nth == 1) + { + cg0_th = cg0; + cg1_th = cg1; + } + else + { + cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth; + cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth; + } + + /* Get the cg's for this pulse in this zone */ + get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th, + index_gl, cgindex, + dim, dim_ind, dim0, dim1, dim2, + r_comm2, r_bcomm2, + box, tric_dist, + normal, skew_fac2_d, skew_fac_01, + v_d, v_0, v_1, 
&corners, sf2_round, + bDistBonded, bBondComm, + bDist2B, bDistMB, + cg_cm, fr->cginfo, + ind_p, + ibuf_p, ibuf_nalloc_p, + vbuf_p, + nsend_p, nat_p, + nsend_zone_p); + } + + /* Append data of threads>=1 to the communication buffers */ + for (th = 1; th < comm->nth; th++) + { + dd_comm_setup_work_t *dth; + int i, ns1; + + dth = &comm->dth[th]; + + ns1 = nsend + dth->nsend_zone; + if (ns1 > ind->nalloc) + { + ind->nalloc = over_alloc_dd(ns1); + srenew(ind->index, ind->nalloc); + } + if (ns1 > comm->nalloc_int) + { + comm->nalloc_int = over_alloc_dd(ns1); + srenew(comm->buf_int, comm->nalloc_int); + } + if (ns1 > comm->vbuf.nalloc) + { + comm->vbuf.nalloc = over_alloc_dd(ns1); + srenew(comm->vbuf.v, comm->vbuf.nalloc); + } + + for (i = 0; i < dth->nsend_zone; i++) + { + ind->index[nsend] = dth->ind.index[i]; + comm->buf_int[nsend] = dth->ibuf[i]; + copy_rvec(dth->vbuf.v[i], + comm->vbuf.v[nsend]); + nsend++; + } + nat += dth->nat; + ind->nsend[zone] += dth->nsend_zone; + } + } + /* Clear the counts in case we do not have pbc */ + for (zone = nzone_send; zone < nzone; zone++) + { + ind->nsend[zone] = 0; + } + ind->nsend[nzone] = nsend; + ind->nsend[nzone+1] = nat; + /* Communicate the number of cg's and atoms to receive */ + dd_sendrecv_int(dd, dim_ind, dddirBackward, + ind->nsend, nzone+2, + ind->nrecv, nzone+2); + + /* The rvec buffer is also required for atom buffers of size nsend + * in dd_move_x and dd_move_f. + */ + vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]); + + if (p > 0) + { + /* We can receive in place if only the last zone is not empty */ + for (zone = 0; zone < nzone-1; zone++) + { + if (ind->nrecv[zone] > 0) + { + cd->bInPlace = FALSE; + } + } + if (!cd->bInPlace) + { + /* The int buffer is only required here for the cg indices */ + if (ind->nrecv[nzone] > comm->nalloc_int2) + { + comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]); + srenew(comm->buf_int2, comm->nalloc_int2); + } + /* The rvec buffer is also required for atom buffers + * of size nrecv in dd_move_x and dd_move_f. + */ + i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]); + vec_rvec_check_alloc(&comm->vbuf2, i); + } + } + + /* Make space for the global cg indices */ + if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc + || dd->cg_nalloc == 0) + { + dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]); + srenew(index_gl, dd->cg_nalloc); + srenew(cgindex, dd->cg_nalloc+1); + } + /* Communicate the global cg indices */ + if (cd->bInPlace) + { + recv_i = index_gl + pos_cg; + } + else + { + recv_i = comm->buf_int2; + } + dd_sendrecv_int(dd, dim_ind, dddirBackward, + comm->buf_int, nsend, + recv_i, ind->nrecv[nzone]); + + /* Make space for cg_cm */ + dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]); + if (fr->cutoff_scheme == ecutsGROUP) + { + cg_cm = fr->cg_cm; + } + else + { + cg_cm = state->x; + } + /* Communicate cg_cm */ + if (cd->bInPlace) + { + recv_vr = cg_cm + pos_cg; + } + else + { + recv_vr = comm->vbuf2.v; + } + dd_sendrecv_rvec(dd, dim_ind, dddirBackward, + comm->vbuf.v, nsend, + recv_vr, ind->nrecv[nzone]); + + /* Make the charge group index */ + if (cd->bInPlace) + { + zone = (p == 0 ? 0 : nzone - 1); + while (zone < nzone) + { + for (cg = 0; cg < ind->nrecv[zone]; cg++) + { + cg_gl = index_gl[pos_cg]; + fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl); + nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]); + cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg; + if (bBondComm) + { + /* Update the charge group presence, + * so we can use it in the next pass of the loop. 
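+                         * bLocalCG is indexed by the global charge group index; marking
+                         * the received cg's here lets missing_link() in get_zone_pulse_cgs()
+                         * see which bonded partners are already present on this node.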
+ */ + comm->bLocalCG[cg_gl] = TRUE; + } + pos_cg++; + } + if (p == 0) + { + comm->zone_ncg1[nzone+zone] = ind->nrecv[zone]; + } + zone++; + zone_cg_range[nzone+zone] = pos_cg; + } + } + else + { + /* This part of the code is never executed with bBondComm. */ + merge_cg_buffers(nzone, cd, p, zone_cg_range, + index_gl, recv_i, cg_cm, recv_vr, + cgindex, fr->cginfo_mb, fr->cginfo); + pos_cg += ind->nrecv[nzone]; + } + nat_tot += ind->nrecv[nzone+1]; + } + if (!cd->bInPlace) + { + /* Store the atom block for easy copying of communication buffers */ + make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex); + } + nzone += nzone; + } + dd->index_gl = index_gl; + dd->cgindex = cgindex; + + dd->ncg_tot = zone_cg_range[zones->n]; + dd->nat_tot = nat_tot; + comm->nat[ddnatHOME] = dd->nat_home; + for (i = ddnatZONE; i < ddnatNR; i++) + { + comm->nat[i] = dd->nat_tot; + } + + if (!bBondComm) + { + /* We don't need to update cginfo, since that was alrady done above. + * So we pass NULL for the forcerec. + */ + dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot, + NULL, comm->bLocalCG); + } + + if (debug) + { + fprintf(debug, "Finished setting up DD communication, zones:"); + for (c = 0; c < zones->n; c++) + { + fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]); + } + fprintf(debug, "\n"); + } +} + +static void set_cg_boundaries(gmx_domdec_zones_t *zones) +{ + int c; + + for (c = 0; c < zones->nizone; c++) + { + zones->izone[c].cg1 = zones->cg_range[c+1]; + zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0]; + zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1]; + } +} + +static void set_zones_size(gmx_domdec_t *dd, + matrix box, const gmx_ddbox_t *ddbox, + int zone_start, int zone_end) +{ + gmx_domdec_comm_t *comm; + gmx_domdec_zones_t *zones; + gmx_bool bDistMB; + int z, zi, zj0, zj1, d, dim; + real rcs, rcmbs; + int i, j; + real size_j, add_tric; + real vol; + + comm = dd->comm; + + zones = &comm->zones; + + /* Do we need to determine extra distances for multi-body bondeds? */ + bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1); + + for (z = zone_start; z < zone_end; z++) + { + /* Copy cell limits to zone limits. + * Valid for non-DD dims and non-shifted dims. + */ + copy_rvec(comm->cell_x0, zones->size[z].x0); + copy_rvec(comm->cell_x1, zones->size[z].x1); + } + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + + for (z = 0; z < zones->n; z++) + { + /* With a staggered grid we have different sizes + * for non-shifted dimensions. 
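+             * With dynamic load balancing the staggered boundaries of the
+             * cells below are stored in comm->zone_d1 and comm->zone_d2; their
+             * min0/max1 extremes are copied into the zone limits here.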
+ */ + if (dd->bGridJump && zones->shift[z][dim] == 0) + { + if (d == 1) + { + zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0; + zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1; + } + else if (d == 2) + { + zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0; + zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1; + } + } + } + + rcs = comm->cutoff; + rcmbs = comm->cutoff_mbody; + if (ddbox->tric_dir[dim]) + { + rcs /= ddbox->skew_fac[dim]; + rcmbs /= ddbox->skew_fac[dim]; + } + + /* Set the lower limit for the shifted zone dimensions */ + for (z = zone_start; z < zone_end; z++) + { + if (zones->shift[z][dim] > 0) + { + dim = dd->dim[d]; + if (!dd->bGridJump || d == 0) + { + zones->size[z].x0[dim] = comm->cell_x1[dim]; + zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs; + } + else + { + /* Here we take the lower limit of the zone from + * the lowest domain of the zone below. + */ + if (z < 4) + { + zones->size[z].x0[dim] = + comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1; + } + else + { + if (d == 1) + { + zones->size[z].x0[dim] = + zones->size[zone_perm[2][z-4]].x0[dim]; + } + else + { + zones->size[z].x0[dim] = + comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1; + } + } + /* A temporary limit, is updated below */ + zones->size[z].x1[dim] = zones->size[z].x0[dim]; + + if (bDistMB) + { + for (zi = 0; zi < zones->nizone; zi++) + { + if (zones->shift[zi][dim] == 0) + { + /* This takes the whole zone into account. + * With multiple pulses this will lead + * to a larger zone then strictly necessary. + */ + zones->size[z].x1[dim] = max(zones->size[z].x1[dim], + zones->size[zi].x1[dim]+rcmbs); + } + } + } + } + } + } + + /* Loop over the i-zones to set the upper limit of each + * j-zone they see. + */ + for (zi = 0; zi < zones->nizone; zi++) + { + if (zones->shift[zi][dim] == 0) + { + for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++) + { + if (zones->shift[z][dim] > 0) + { + zones->size[z].x1[dim] = max(zones->size[z].x1[dim], + zones->size[zi].x1[dim]+rcs); + } + } + } + } + } + + for (z = zone_start; z < zone_end; z++) + { + /* Initialization only required to keep the compiler happy */ + rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner; + int nc, c; + + /* To determine the bounding box for a zone we need to find + * the extreme corners of 4, 2 or 1 corners. + */ + nc = 1 << (ddbox->npbcdim - 1); + + for (c = 0; c < nc; c++) + { + /* Set up a zone corner at x=0, ignoring trilinic couplings */ + corner[XX] = 0; + if ((c & 1) == 0) + { + corner[YY] = zones->size[z].x0[YY]; + } + else + { + corner[YY] = zones->size[z].x1[YY]; + } + if ((c & 2) == 0) + { + corner[ZZ] = zones->size[z].x0[ZZ]; + } + else + { + corner[ZZ] = zones->size[z].x1[ZZ]; + } + if (dd->ndim == 1 && box[ZZ][YY] != 0) + { + /* With 1D domain decomposition the cg's are not in + * the triclinic box, but triclinic x-y and rectangular y-z. + * Shift y back, so it will later end up at 0. 
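+                 * i.e. subtract the z-dependent part of y here; the general
+                 * triclinic couplings in the loop below add it back, so the
+                 * net z->y shift of the corner is zero.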
+ */ + corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ]; + } + /* Apply the triclinic couplings */ + for (i = YY; i < ddbox->npbcdim; i++) + { + for (j = XX; j < i; j++) + { + corner[j] += corner[i]*box[i][j]/box[i][i]; + } + } + if (c == 0) + { + copy_rvec(corner, corner_min); + copy_rvec(corner, corner_max); + } + else + { + for (i = 0; i < DIM; i++) + { + corner_min[i] = min(corner_min[i], corner[i]); + corner_max[i] = max(corner_max[i], corner[i]); + } + } + } + /* Copy the extreme cornes without offset along x */ + for (i = 0; i < DIM; i++) + { + zones->size[z].bb_x0[i] = corner_min[i]; + zones->size[z].bb_x1[i] = corner_max[i]; + } + /* Add the offset along x */ + zones->size[z].bb_x0[XX] += zones->size[z].x0[XX]; + zones->size[z].bb_x1[XX] += zones->size[z].x1[XX]; + } + + if (zone_start == 0) + { + vol = 1; + for (dim = 0; dim < DIM; dim++) + { + vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim]; + } + zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol; + } + + if (debug) + { + for (z = zone_start; z < zone_end; z++) + { + fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n", + z, + zones->size[z].x0[XX], zones->size[z].x1[XX], + zones->size[z].x0[YY], zones->size[z].x1[YY], + zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]); + fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n", + z, + zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX], + zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY], + zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]); + } + } +} + +static int comp_cgsort(const void *a, const void *b) +{ + int comp; + + gmx_cgsort_t *cga, *cgb; + cga = (gmx_cgsort_t *)a; + cgb = (gmx_cgsort_t *)b; + + comp = cga->nsc - cgb->nsc; + if (comp == 0) + { + comp = cga->ind_gl - cgb->ind_gl; + } + + return comp; +} + +static void order_int_cg(int n, const gmx_cgsort_t *sort, + int *a, int *buf) +{ + int i; + + /* Order the data */ + for (i = 0; i < n; i++) + { + buf[i] = a[sort[i].ind]; + } + + /* Copy back to the original array */ + for (i = 0; i < n; i++) + { + a[i] = buf[i]; + } +} + +static void order_vec_cg(int n, const gmx_cgsort_t *sort, + rvec *v, rvec *buf) +{ + int i; + + /* Order the data */ + for (i = 0; i < n; i++) + { + copy_rvec(v[sort[i].ind], buf[i]); + } + + /* Copy back to the original array */ + for (i = 0; i < n; i++) + { + copy_rvec(buf[i], v[i]); + } +} + +static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort, + rvec *v, rvec *buf) +{ + int a, atot, cg, cg0, cg1, i; + + if (cgindex == NULL) + { + /* Avoid the useless loop of the atoms within a cg */ + order_vec_cg(ncg, sort, v, buf); + + return; + } + + /* Order the data */ + a = 0; + for (cg = 0; cg < ncg; cg++) + { + cg0 = cgindex[sort[cg].ind]; + cg1 = cgindex[sort[cg].ind+1]; + for (i = cg0; i < cg1; i++) + { + copy_rvec(v[i], buf[a]); + a++; + } + } + atot = a; + + /* Copy back to the original array */ + for (a = 0; a < atot; a++) + { + copy_rvec(buf[a], v[a]); + } +} + +static void ordered_sort(int nsort2, gmx_cgsort_t *sort2, + int nsort_new, gmx_cgsort_t *sort_new, + gmx_cgsort_t *sort1) +{ + int i1, i2, i_new; + + /* The new indices are not very ordered, so we qsort them */ + qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort); + + /* sort2 is already ordered, so now we can merge the two arrays */ + i1 = 0; + i2 = 0; + i_new = 0; + while (i2 < nsort2 || i_new < nsort_new) + { + if (i2 == nsort2) + { + sort1[i1++] = sort_new[i_new++]; + } + else if (i_new == nsort_new) + { + sort1[i1++] = sort2[i2++]; 
+ } + else if (sort2[i2].nsc < sort_new[i_new].nsc || + (sort2[i2].nsc == sort_new[i_new].nsc && + sort2[i2].ind_gl < sort_new[i_new].ind_gl)) + { + sort1[i1++] = sort2[i2++]; + } + else + { + sort1[i1++] = sort_new[i_new++]; + } + } +} + +static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old) +{ + gmx_domdec_sort_t *sort; + gmx_cgsort_t *cgsort, *sort_i; + int ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf; + int sort_last, sort_skip; + + sort = dd->comm->sort; + + a = fr->ns.grid->cell_index; + + moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells; + + if (ncg_home_old >= 0) + { + /* The charge groups that remained in the same ns grid cell + * are completely ordered. So we can sort efficiently by sorting + * the charge groups that did move into the stationary list. + */ + ncg_new = 0; + nsort2 = 0; + nsort_new = 0; + for (i = 0; i < dd->ncg_home; i++) + { + /* Check if this cg did not move to another node */ + if (a[i] < moved) + { + if (i >= ncg_home_old || a[i] != sort->sort[i].nsc) + { + /* This cg is new on this node or moved ns grid cell */ + if (nsort_new >= sort->sort_new_nalloc) + { + sort->sort_new_nalloc = over_alloc_dd(nsort_new+1); + srenew(sort->sort_new, sort->sort_new_nalloc); + } + sort_i = &(sort->sort_new[nsort_new++]); + } + else + { + /* This cg did not move */ + sort_i = &(sort->sort2[nsort2++]); + } + /* Sort on the ns grid cell indices + * and the global topology index. + * index_gl is irrelevant with cell ns, + * but we set it here anyhow to avoid a conditional. + */ + sort_i->nsc = a[i]; + sort_i->ind_gl = dd->index_gl[i]; + sort_i->ind = i; + ncg_new++; + } + } + if (debug) + { + fprintf(debug, "ordered sort cgs: stationary %d moved %d\n", + nsort2, nsort_new); + } + /* Sort efficiently */ + ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new, + sort->sort); + } + else + { + cgsort = sort->sort; + ncg_new = 0; + for (i = 0; i < dd->ncg_home; i++) + { + /* Sort on the ns grid cell indices + * and the global topology index + */ + cgsort[i].nsc = a[i]; + cgsort[i].ind_gl = dd->index_gl[i]; + cgsort[i].ind = i; + if (cgsort[i].nsc < moved) + { + ncg_new++; + } + } + if (debug) + { + fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new); + } + /* Determine the order of the charge groups using qsort */ + qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort); + } + + return ncg_new; +} + +static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr) +{ + gmx_cgsort_t *sort; + int ncg_new, i, *a, na; + + sort = dd->comm->sort->sort; + + nbnxn_get_atomorder(fr->nbv->nbs, &a, &na); + + ncg_new = 0; + for (i = 0; i < na; i++) + { + if (a[i] >= 0) + { + sort[ncg_new].ind = a[i]; + ncg_new++; + } + } + + return ncg_new; +} + +static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state, + int ncg_home_old) +{ + gmx_domdec_sort_t *sort; + gmx_cgsort_t *cgsort, *sort_i; + int *cgindex; + int ncg_new, i, *ibuf, cgsize; + rvec *vbuf; + + sort = dd->comm->sort; + + if (dd->ncg_home > sort->sort_nalloc) + { + sort->sort_nalloc = over_alloc_dd(dd->ncg_home); + srenew(sort->sort, sort->sort_nalloc); + srenew(sort->sort2, sort->sort_nalloc); + } + cgsort = sort->sort; + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + ncg_new = dd_sort_order(dd, fr, ncg_home_old); + break; + case ecutsVERLET: + ncg_new = dd_sort_order_nbnxn(dd, fr); + break; + default: + gmx_incons("unimplemented"); + ncg_new = 0; + } + + /* We alloc with the old size, since cgindex is still old */ + 
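+    /* dd->cgindex[dd->ncg_home] is the (old) number of home atoms and
+     * sets the size of the reordering buffer vbuf used below */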
vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]); + vbuf = dd->comm->vbuf.v; + + if (dd->comm->bCGs) + { + cgindex = dd->cgindex; + } + else + { + cgindex = NULL; + } + + /* Remove the charge groups which are no longer at home here */ + dd->ncg_home = ncg_new; + if (debug) + { + fprintf(debug, "Set the new home charge group count to %d\n", + dd->ncg_home); + } + + /* Reorder the state */ + for (i = 0; i < estNR; i++) + { + if (EST_DISTR(i) && (state->flags & (1<ncg_home, cgindex, cgsort, state->x, vbuf); + break; + case estV: + order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf); + break; + case estSDX: + order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf); + break; + case estCGP: + order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf); + break; + case estLD_RNG: + case estLD_RNGI: + case estDISRE_INITF: + case estDISRE_RM3TAV: + case estORIRE_INITF: + case estORIRE_DTAV: + /* No ordering required */ + break; + default: + gmx_incons("Unknown state entry encountered in dd_sort_state"); + break; + } + } + } + if (fr->cutoff_scheme == ecutsGROUP) + { + /* Reorder cgcm */ + order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf); + } + + if (dd->ncg_home+1 > sort->ibuf_nalloc) + { + sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1); + srenew(sort->ibuf, sort->ibuf_nalloc); + } + ibuf = sort->ibuf; + /* Reorder the global cg index */ + order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf); + /* Reorder the cginfo */ + order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf); + /* Rebuild the local cg index */ + if (dd->comm->bCGs) + { + ibuf[0] = 0; + for (i = 0; i < dd->ncg_home; i++) + { + cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind]; + ibuf[i+1] = ibuf[i] + cgsize; + } + for (i = 0; i < dd->ncg_home+1; i++) + { + dd->cgindex[i] = ibuf[i]; + } + } + else + { + for (i = 0; i < dd->ncg_home+1; i++) + { + dd->cgindex[i] = i; + } + } + /* Set the home atom number */ + dd->nat_home = dd->cgindex[dd->ncg_home]; + + if (fr->cutoff_scheme == ecutsVERLET) + { + /* The atoms are now exactly in grid order, update the grid order */ + nbnxn_set_atomorder(fr->nbv->nbs); + } + else + { + /* Copy the sorted ns cell indices back to the ns grid struct */ + for (i = 0; i < dd->ncg_home; i++) + { + fr->ns.grid->cell_index[i] = cgsort[i].nsc; + } + fr->ns.grid->nr = dd->ncg_home; + } +} + +static void add_dd_statistics(gmx_domdec_t *dd) +{ + gmx_domdec_comm_t *comm; + int ddnat; + + comm = dd->comm; + + for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++) + { + comm->sum_nat[ddnat-ddnatZONE] += + comm->nat[ddnat] - comm->nat[ddnat-1]; + } + comm->ndecomp++; +} + +void reset_dd_statistics_counters(gmx_domdec_t *dd) +{ + gmx_domdec_comm_t *comm; + int ddnat; + + comm = dd->comm; + + /* Reset all the statistics and counters for total run counting */ + for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++) + { + comm->sum_nat[ddnat-ddnatZONE] = 0; + } + comm->ndecomp = 0; + comm->nload = 0; + comm->load_step = 0; + comm->load_sum = 0; + comm->load_max = 0; + clear_ivec(comm->load_lim); + comm->load_mdf = 0; + comm->load_pme = 0; +} + +void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog) +{ + gmx_domdec_comm_t *comm; + int ddnat; + double av; + + comm = cr->dd->comm; + + gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr); + + if (fplog == NULL) + { + return; + } + + fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n"); + + for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++) + { + av = 
comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp; + switch (ddnat) + { + case ddnatZONE: + fprintf(fplog, + " av. #atoms communicated per step for force: %d x %.1f\n", + 2, av); + break; + case ddnatVSITE: + if (cr->dd->vsite_comm) + { + fprintf(fplog, + " av. #atoms communicated per step for vsites: %d x %.1f\n", + (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2, + av); + } + break; + case ddnatCON: + if (cr->dd->constraint_comm) + { + fprintf(fplog, + " av. #atoms communicated per step for LINCS: %d x %.1f\n", + 1 + ir->nLincsIter, av); + } + break; + default: + gmx_incons(" Unknown type for DD statistics"); + } + } + fprintf(fplog, "\n"); + + if (comm->bRecordLoad && EI_DYNAMICS(ir->eI)) + { + print_dd_load_av(fplog, cr->dd); + } +} + +void dd_partition_system(FILE *fplog, + gmx_large_int_t step, + t_commrec *cr, + gmx_bool bMasterState, + int nstglobalcomm, + t_state *state_global, + gmx_mtop_t *top_global, + t_inputrec *ir, + t_state *state_local, + rvec **f, + t_mdatoms *mdatoms, + gmx_localtop_t *top_local, + t_forcerec *fr, + gmx_vsite_t *vsite, + gmx_shellfc_t shellfc, + gmx_constr_t constr, + t_nrnb *nrnb, + gmx_wallcycle_t wcycle, + gmx_bool bVerbose) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + gmx_ddbox_t ddbox = {0}; + t_block *cgs_gl; + gmx_large_int_t step_pcoupl; + rvec cell_ns_x0, cell_ns_x1; + int i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum; + gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad; + gmx_bool bRedist, bSortCG, bResortAll; + ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np; + real grid_density; + char sbuf[22]; + + dd = cr->dd; + comm = dd->comm; + + bBoxChanged = (bMasterState || DEFORM(*ir)); + if (ir->epc != epcNO) + { + /* With nstpcouple > 1 pressure coupling happens. + * one step after calculating the pressure. + * Box scaling happens at the end of the MD step, + * after the DD partitioning. + * We therefore have to do DLB in the first partitioning + * after an MD step where P-coupling occured. + * We need to determine the last step in which p-coupling occurred. + * MRS -- need to validate this for vv? + */ + n = ir->nstpcouple; + if (n == 1) + { + step_pcoupl = step - 1; + } + else + { + step_pcoupl = ((step - 1)/n)*n + 1; + } + if (step_pcoupl >= comm->partition_step) + { + bBoxChanged = TRUE; + } + } + + bNStGlobalComm = (step % nstglobalcomm == 0); + + if (!comm->bDynLoadBal) + { + bDoDLB = FALSE; + } + else + { + /* Should we do dynamic load balacing this step? + * Since it requires (possibly expensive) global communication, + * we might want to do DLB less frequently. + */ + if (bBoxChanged || ir->epc != epcNO) + { + bDoDLB = bBoxChanged; + } + else + { + bDoDLB = bNStGlobalComm; + } + } + + /* Check if we have recorded loads on the nodes */ + if (comm->bRecordLoad && dd_load_count(comm)) + { + if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal) + { + /* Check if we should use DLB at the second partitioning + * and every 100 partitionings, + * so the extra communication cost is negligible. + */ + n = max(100, nstglobalcomm); + bCheckDLB = (comm->n_load_collect == 0 || + comm->n_load_have % n == n-1); + } + else + { + bCheckDLB = FALSE; + } + + /* Print load every nstlog, first and last step to the log file */ + bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) || + comm->n_load_collect == 0 || + (ir->nsteps >= 0 && + (step + ir->nstlist > ir->init_step + ir->nsteps))); + + /* Avoid extra communication due to verbose screen output + * when nstglobalcomm is set. 
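+         * get_load_distribution() below involves global communication, so
+         * it is only called when the load data is actually needed this step.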
+ */ + if (bDoDLB || bLogLoad || bCheckDLB || + (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist))) + { + get_load_distribution(dd, wcycle); + if (DDMASTER(dd)) + { + if (bLogLoad) + { + dd_print_load(fplog, dd, step-1); + } + if (bVerbose) + { + dd_print_load_verbose(dd); + } + } + comm->n_load_collect++; + + if (bCheckDLB) + { + /* Since the timings are node dependent, the master decides */ + if (DDMASTER(dd)) + { + bTurnOnDLB = + (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS); + if (debug) + { + fprintf(debug, "step %s, imb loss %f\n", + gmx_step_str(step, sbuf), + dd_force_imb_perf_loss(dd)); + } + } + dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB); + if (bTurnOnDLB) + { + turn_on_dlb(fplog, cr, step); + bDoDLB = TRUE; + } + } + } + comm->n_load_have++; + } + + cgs_gl = &comm->cgs_gl; + + bRedist = FALSE; + if (bMasterState) + { + /* Clear the old state */ + clear_dd_indices(dd, 0, 0); + ncgindex_set = 0; + + set_ddbox(dd, bMasterState, cr, ir, state_global->box, + TRUE, cgs_gl, state_global->x, &ddbox); + + get_cg_distribution(fplog, step, dd, cgs_gl, + state_global->box, &ddbox, state_global->x); + + dd_distribute_state(dd, cgs_gl, + state_global, state_local, f); + + dd_make_local_cgs(dd, &top_local->cgs); + + /* Ensure that we have space for the new distribution */ + dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home); + + if (fr->cutoff_scheme == ecutsGROUP) + { + calc_cgcm(fplog, 0, dd->ncg_home, + &top_local->cgs, state_local->x, fr->cg_cm); + } + + inc_nrnb(nrnb, eNR_CGCM, dd->nat_home); + + dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG); + } + else if (state_local->ddp_count != dd->ddp_count) + { + if (state_local->ddp_count > dd->ddp_count) + { + gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count); + } + + if (state_local->ddp_count_cg_gl != state_local->ddp_count) + { + gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count); + } + + /* Clear the old state */ + clear_dd_indices(dd, 0, 0); + + /* Build the new indices */ + rebuild_cgindex(dd, cgs_gl->index, state_local); + make_dd_indices(dd, cgs_gl->index, 0); + ncgindex_set = dd->ncg_home; + + if (fr->cutoff_scheme == ecutsGROUP) + { + /* Redetermine the cg COMs */ + calc_cgcm(fplog, 0, dd->ncg_home, + &top_local->cgs, state_local->x, fr->cg_cm); + } + + inc_nrnb(nrnb, eNR_CGCM, dd->nat_home); + + dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG); + + set_ddbox(dd, bMasterState, cr, ir, state_local->box, + TRUE, &top_local->cgs, state_local->x, &ddbox); + + bRedist = comm->bDynLoadBal; + } + else + { + /* We have the full state, only redistribute the cgs */ + + /* Clear the non-home indices */ + clear_dd_indices(dd, dd->ncg_home, dd->nat_home); + ncgindex_set = 0; + + /* Avoid global communication for dim's without pbc and -gcom */ + if (!bNStGlobalComm) + { + copy_rvec(comm->box0, ddbox.box0 ); + copy_rvec(comm->box_size, ddbox.box_size); + } + set_ddbox(dd, bMasterState, cr, ir, state_local->box, + bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox); + + bBoxChanged = TRUE; + bRedist = TRUE; + } + /* For dim's without pbc and -gcom */ + copy_rvec(ddbox.box0, comm->box0 ); + copy_rvec(ddbox.box_size, comm->box_size); + + set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB, + step, wcycle); + + if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0) + { + 
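+        /* Periodically dump the DD grid to a pdb file for debugging */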
write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox); + } + + /* Check if we should sort the charge groups */ + if (comm->nstSortCG > 0) + { + bSortCG = (bMasterState || + (bRedist && (step % comm->nstSortCG == 0))); + } + else + { + bSortCG = FALSE; + } + + ncg_home_old = dd->ncg_home; + + ncg_moved = 0; + if (bRedist) + { + wallcycle_sub_start(wcycle, ewcsDD_REDIST); + + dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, + state_local, f, fr, + !bSortCG, nrnb, &ncgindex_set, &ncg_moved); + + wallcycle_sub_stop(wcycle, ewcsDD_REDIST); + } + + get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box, + dd, &ddbox, + &comm->cell_x0, &comm->cell_x1, + dd->ncg_home, fr->cg_cm, + cell_ns_x0, cell_ns_x1, &grid_density); + + if (bBoxChanged) + { + comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step); + } + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + copy_ivec(fr->ns.grid->n, ncells_old); + grid_first(fplog, fr->ns.grid, dd, &ddbox, + state_local->box, cell_ns_x0, cell_ns_x1, + fr->rlistlong, grid_density); + break; + case ecutsVERLET: + nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]); + break; + default: + gmx_incons("unimplemented"); + } + /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */ + copy_ivec(ddbox.tric_dir, comm->tric_dir); + + if (bSortCG) + { + wallcycle_sub_start(wcycle, ewcsDD_GRID); + + /* Sort the state on charge group position. + * This enables exact restarts from this step. + * It also improves performance by about 15% with larger numbers + * of atoms per node. + */ + + /* Fill the ns grid with the home cell, + * so we can sort with the indices. + */ + set_zones_ncg_home(dd); + + switch (fr->cutoff_scheme) + { + case ecutsVERLET: + set_zones_size(dd, state_local->box, &ddbox, 0, 1); + + nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box, + 0, + comm->zones.size[0].bb_x0, + comm->zones.size[0].bb_x1, + 0, dd->ncg_home, + comm->zones.dens_zone0, + fr->cginfo, + state_local->x, + ncg_moved, bRedist ? comm->moved : NULL, + fr->nbv->grp[eintLocal].kernel_type, + fr->nbv->grp[eintLocal].nbat); + + nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]); + break; + case ecutsGROUP: + fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home, + 0, dd->ncg_home, fr->cg_cm); + + copy_ivec(fr->ns.grid->n, ncells_new); + break; + default: + gmx_incons("unimplemented"); + } + + bResortAll = bMasterState; + + /* Check if we can user the old order and ns grid cell indices + * of the charge groups to sort the charge groups efficiently. + */ + if (ncells_new[XX] != ncells_old[XX] || + ncells_new[YY] != ncells_old[YY] || + ncells_new[ZZ] != ncells_old[ZZ]) + { + bResortAll = TRUE; + } + + if (debug) + { + fprintf(debug, "Step %s, sorting the %d home charge groups\n", + gmx_step_str(step, sbuf), dd->ncg_home); + } + dd_sort_state(dd, fr->cg_cm, fr, state_local, + bResortAll ? -1 : ncg_home_old); + /* Rebuild all the indices */ + ga2la_clear(dd->ga2la); + ncgindex_set = 0; + + wallcycle_sub_stop(wcycle, ewcsDD_GRID); + } + + wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM); + + /* Setup up the communication and communicate the coordinates */ + setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f); + + /* Set the indices */ + make_dd_indices(dd, cgs_gl->index, ncgindex_set); + + /* Set the charge group boundaries for neighbor searching */ + set_cg_boundaries(&comm->zones); + + if (fr->cutoff_scheme == ecutsVERLET) + { + set_zones_size(dd, state_local->box, &ddbox, + bSortCG ? 
1 : 0, comm->zones.n); + } + + wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM); + + /* + write_dd_pdb("dd_home",step,"dump",top_global,cr, + -1,state_local->x,state_local->box); + */ + + wallcycle_sub_start(wcycle, ewcsDD_MAKETOP); + + /* Extract a local topology from the global topology */ + for (i = 0; i < dd->ndim; i++) + { + np[dd->dim[i]] = comm->cd[i].np; + } + dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box, + comm->cellsize_min, np, + fr, + fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x, + vsite, top_global, top_local); + + wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP); + + wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR); + + /* Set up the special atom communication */ + n = comm->nat[ddnatZONE]; + for (i = ddnatZONE+1; i < ddnatNR; i++) + { + switch (i) + { + case ddnatVSITE: + if (vsite && vsite->n_intercg_vsite) + { + n = dd_make_local_vsites(dd, n, top_local->idef.il); + } + break; + case ddnatCON: + if (dd->bInterCGcons || dd->bInterCGsettles) + { + /* Only for inter-cg constraints we need special code */ + n = dd_make_local_constraints(dd, n, top_global, fr->cginfo, + constr, ir->nProjOrder, + top_local->idef.il); + } + break; + default: + gmx_incons("Unknown special atom type setup"); + } + comm->nat[i] = n; + } + + wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR); + + wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER); + + /* Make space for the extra coordinates for virtual site + * or constraint communication. + */ + state_local->natoms = comm->nat[ddnatNR-1]; + if (state_local->natoms > state_local->nalloc) + { + dd_realloc_state(state_local, f, state_local->natoms); + } + + if (fr->bF_NoVirSum) + { + if (vsite && vsite->n_intercg_vsite) + { + nat_f_novirsum = comm->nat[ddnatVSITE]; + } + else + { + if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0) + { + nat_f_novirsum = dd->nat_tot; + } + else + { + nat_f_novirsum = dd->nat_home; + } + } + } + else + { + nat_f_novirsum = 0; + } + + /* Set the number of atoms required for the force calculation. + * Forces need to be constrained when using a twin-range setup + * or with energy minimization. For simple simulations we could + * avoid some allocation, zeroing and copying, but this is + * probably not worth the complications ande checking. + */ + forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot, + dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum); + + /* We make the all mdatoms up to nat_tot_con. + * We could save some work by only setting invmass + * between nat_tot and nat_tot_con. 
+ */ + /* This call also sets the new number of home particles to dd->nat_home */ + atoms2md(top_global, ir, + comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms); + + /* Now we have the charges we can sort the FE interactions */ + dd_sort_local_top(dd, mdatoms, top_local); + + if (vsite != NULL) + { + /* Now we have updated mdatoms, we can do the last vsite bookkeeping */ + split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite); + } + + if (shellfc) + { + /* Make the local shell stuff, currently no communication is done */ + make_local_shells(cr, mdatoms, shellfc); + } + + if (ir->implicit_solvent) + { + make_local_gb(cr, fr->born, ir->gb_algorithm); + } + - init_bonded_thread_force_reduction(fr, &top_local->idef); ++ setup_bonded_threading(fr, &top_local->idef); + + if (!(cr->duty & DUTY_PME)) + { + /* Send the charges to our PME only node */ + gmx_pme_send_q(cr, mdatoms->nChargePerturbed, + mdatoms->chargeA, mdatoms->chargeB, + dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd)); + } + + if (constr) + { + set_constraints(constr, top_local, ir, mdatoms, cr); + } + + if (ir->ePull != epullNO) + { + /* Update the local pull groups */ + dd_make_local_pull_groups(dd, ir->pull, mdatoms); + } + + if (ir->bRot) + { + /* Update the local rotation groups */ + dd_make_local_rotation_groups(dd, ir->rot); + } + + + add_dd_statistics(dd); + + /* Make sure we only count the cycles for this DD partitioning */ + clear_dd_cycle_counts(dd); + + /* Because the order of the atoms might have changed since + * the last vsite construction, we need to communicate the constructing + * atom coordinates again (for spreading the forces this MD step). + */ + dd_move_x_vsites(dd, state_local->box, state_local->x); + + wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER); + + if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0) + { + dd_move_x(dd, state_local->box, state_local->x); + write_dd_pdb("dd_dump", step, "dump", top_global, cr, + -1, state_local->x, state_local->box); + } + + /* Store the partitioning step */ + comm->partition_step = step; + + /* Increase the DD partitioning counter */ + dd->ddp_count++; + /* The state currently matches this DD partitioning count, store it */ + state_local->ddp_count = dd->ddp_count; + if (bMasterState) + { + /* The DD master node knows the complete cg distribution, + * store the count so we can possibly skip the cg info communication. + */ + comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count); + } + + if (comm->DD_debug > 0) + { + /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */ + check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global), + "after partitioning"); + } +} diff --cc src/gromacs/mdlib/minimize.c index 5fa7e693cd,0000000000..c15fe93bd5 mode 100644,000000..100644 --- a/src/gromacs/mdlib/minimize.c +++ b/src/gromacs/mdlib/minimize.c @@@ -1,2865 -1,0 +1,2865 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. 
+ + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * GROwing Monsters And Cloning Shrimps + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include "sysstuff.h" +#include "string2.h" +#include "network.h" +#include "confio.h" +#include "smalloc.h" +#include "nrnb.h" +#include "main.h" +#include "force.h" +#include "macros.h" +#include "random.h" +#include "names.h" +#include "gmx_fatal.h" +#include "txtdump.h" +#include "typedefs.h" +#include "update.h" +#include "constr.h" +#include "vec.h" +#include "statutil.h" +#include "tgroup.h" +#include "mdebin.h" +#include "vsite.h" +#include "force.h" +#include "mdrun.h" +#include "md_support.h" +#include "domdec.h" +#include "partdec.h" +#include "trnio.h" +#include "mdatoms.h" +#include "ns.h" +#include "gmx_wallcycle.h" +#include "mtop_util.h" +#include "gmxfio.h" +#include "pme.h" +#include "bondf.h" +#include "gmx_omp_nthreads.h" +#include "md_logging.h" + + +#include "gromacs/linearalgebra/mtxio.h" +#include "gromacs/linearalgebra/sparsematrix.h" + +typedef struct { + t_state s; + rvec *f; + real epot; + real fnorm; + real fmax; + int a_fmax; +} em_state_t; + +static em_state_t *init_em_state() +{ + em_state_t *ems; + + snew(ems, 1); + + /* does this need to be here? Should the array be declared differently (staticaly)in the state definition? */ + snew(ems->s.lambda, efptNR); + + return ems; +} + +static void print_em_start(FILE *fplog, t_commrec *cr, gmx_runtime_t *runtime, + gmx_wallcycle_t wcycle, + const char *name) +{ + char buf[STRLEN]; + + runtime_start(runtime); + + sprintf(buf, "Started %s", name); + print_date_and_time(fplog, cr->nodeid, buf, NULL); + + wallcycle_start(wcycle, ewcRUN); +} +static void em_time_end(gmx_runtime_t *runtime, + gmx_wallcycle_t wcycle) +{ + wallcycle_stop(wcycle, ewcRUN); + + runtime_end(runtime); +} + +static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps) +{ + fprintf(out, "\n"); + fprintf(out, "%s:\n", minimizer); + fprintf(out, " Tolerance (Fmax) = %12.5e\n", ftol); + fprintf(out, " Number of steps = %12d\n", nsteps); +} + +static void warn_step(FILE *fp, real ftol, gmx_bool bLastStep, gmx_bool bConstrain) +{ + char buffer[2048]; + if (bLastStep) + { + sprintf(buffer, + "\nEnergy minimization reached the maximum number " + "of steps before the forces reached the requested " + "precision Fmax < %g.\n", ftol); + } + else + { + sprintf(buffer, + "\nEnergy minimization has stopped, but the forces have " + "not converged to the requested precision Fmax < %g (which " + "may not be possible for your system). 
It stopped " + "because the algorithm tried to make a new step whose size " + "was too small, or there was no change in the energy since " + "last step. Either way, we regard the minimization as " + "converged to within the available machine precision, " + "given your starting configuration and EM parameters.\n%s%s", + ftol, + sizeof(real) < sizeof(double) ? + "\nDouble precision normally gives you higher accuracy, but " + "this is often not needed for preparing to run molecular " + "dynamics.\n" : + "", + bConstrain ? + "You might need to increase your constraint accuracy, or turn\n" + "off constraints altogether (set constraints = none in mdp file)\n" : + ""); + } + fputs(wrap_lines(buffer, 78, 0, FALSE), fp); +} + + + +static void print_converged(FILE *fp, const char *alg, real ftol, + gmx_large_int_t count, gmx_bool bDone, gmx_large_int_t nsteps, + real epot, real fmax, int nfmax, real fnorm) +{ + char buf[STEPSTRSIZE]; + + if (bDone) + { + fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n", + alg, ftol, gmx_step_str(count, buf)); + } + else if (count < nsteps) + { + fprintf(fp, "\n%s converged to machine precision in %s steps,\n" + "but did not reach the requested Fmax < %g.\n", + alg, gmx_step_str(count, buf), ftol); + } + else + { + fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n", + alg, ftol, gmx_step_str(count, buf)); + } + +#ifdef GMX_DOUBLE + fprintf(fp, "Potential Energy = %21.14e\n", epot); + fprintf(fp, "Maximum force = %21.14e on atom %d\n", fmax, nfmax+1); + fprintf(fp, "Norm of force = %21.14e\n", fnorm); +#else + fprintf(fp, "Potential Energy = %14.7e\n", epot); + fprintf(fp, "Maximum force = %14.7e on atom %d\n", fmax, nfmax+1); + fprintf(fp, "Norm of force = %14.7e\n", fnorm); +#endif +} + +static void get_f_norm_max(t_commrec *cr, + t_grpopts *opts, t_mdatoms *mdatoms, rvec *f, + real *fnorm, real *fmax, int *a_fmax) +{ + double fnorm2, *sum; + real fmax2, fmax2_0, fam; + int la_max, a_max, start, end, i, m, gf; + + /* This routine finds the largest force and returns it. + * On parallel machines the global max is taken. 
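+     * Contributions from frozen dimensions are excluded, and the sum
+     * of squares (fnorm2) is accumulated at the same time so that both
+     * the norm and the maximum can be returned.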
+ */ + fnorm2 = 0; + fmax2 = 0; + la_max = -1; + gf = 0; + start = mdatoms->start; + end = mdatoms->homenr + start; + if (mdatoms->cFREEZE) + { + for (i = start; i < end; i++) + { + gf = mdatoms->cFREEZE[i]; + fam = 0; + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + fam += sqr(f[i][m]); + } + } + fnorm2 += fam; + if (fam > fmax2) + { + fmax2 = fam; + la_max = i; + } + } + } + else + { + for (i = start; i < end; i++) + { + fam = norm2(f[i]); + fnorm2 += fam; + if (fam > fmax2) + { + fmax2 = fam; + la_max = i; + } + } + } + + if (la_max >= 0 && DOMAINDECOMP(cr)) + { + a_max = cr->dd->gatindex[la_max]; + } + else + { + a_max = la_max; + } + if (PAR(cr)) + { + snew(sum, 2*cr->nnodes+1); + sum[2*cr->nodeid] = fmax2; + sum[2*cr->nodeid+1] = a_max; + sum[2*cr->nnodes] = fnorm2; + gmx_sumd(2*cr->nnodes+1, sum, cr); + fnorm2 = sum[2*cr->nnodes]; + /* Determine the global maximum */ + for (i = 0; i < cr->nnodes; i++) + { + if (sum[2*i] > fmax2) + { + fmax2 = sum[2*i]; + a_max = (int)(sum[2*i+1] + 0.5); + } + } + sfree(sum); + } + + if (fnorm) + { + *fnorm = sqrt(fnorm2); + } + if (fmax) + { + *fmax = sqrt(fmax2); + } + if (a_fmax) + { + *a_fmax = a_max; + } +} + +static void get_state_f_norm_max(t_commrec *cr, + t_grpopts *opts, t_mdatoms *mdatoms, + em_state_t *ems) +{ + get_f_norm_max(cr, opts, mdatoms, ems->f, &ems->fnorm, &ems->fmax, &ems->a_fmax); +} + +void init_em(FILE *fplog, const char *title, + t_commrec *cr, t_inputrec *ir, + t_state *state_global, gmx_mtop_t *top_global, + em_state_t *ems, gmx_localtop_t **top, + rvec **f, rvec **f_global, + t_nrnb *nrnb, rvec mu_tot, + t_forcerec *fr, gmx_enerdata_t **enerd, + t_graph **graph, t_mdatoms *mdatoms, gmx_global_stat_t *gstat, + gmx_vsite_t *vsite, gmx_constr_t constr, + int nfile, const t_filenm fnm[], + gmx_mdoutf_t **outf, t_mdebin **mdebin) +{ + int start, homenr, i; + real dvdl_constr; + + if (fplog) + { + fprintf(fplog, "Initiating %s\n", title); + } + + state_global->ngtc = 0; + + /* Initialize lambda variables */ + initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, NULL); + + init_nrnb(nrnb); + + if (DOMAINDECOMP(cr)) + { + *top = dd_init_local_top(top_global); + + dd_init_local_state(cr->dd, state_global, &ems->s); + + *f = NULL; + + /* Distribute the charge groups over the nodes from the master node */ + dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, + state_global, top_global, ir, + &ems->s, &ems->f, mdatoms, *top, + fr, vsite, NULL, constr, + nrnb, NULL, FALSE); + dd_store_state(cr->dd, &ems->s); + + if (ir->nstfout) + { + snew(*f_global, top_global->natoms); + } + else + { + *f_global = NULL; + } + *graph = NULL; + } + else + { + snew(*f, top_global->natoms); + + /* Just copy the state */ + ems->s = *state_global; + snew(ems->s.x, ems->s.nalloc); + snew(ems->f, ems->s.nalloc); + for (i = 0; i < state_global->natoms; i++) + { + copy_rvec(state_global->x[i], ems->s.x[i]); + } + copy_mat(state_global->box, ems->s.box); + + if (PAR(cr) && ir->eI != eiNM) + { + /* Initialize the particle decomposition and split the topology */ + *top = split_system(fplog, top_global, ir, cr); + + pd_cg_range(cr, &fr->cg0, &fr->hcg); + } + else + { + *top = gmx_mtop_generate_local_top(top_global, ir); + } + *f_global = *f; + + forcerec_set_excl_load(fr, *top, cr); + - init_bonded_thread_force_reduction(fr, &(*top)->idef); ++ setup_bonded_threading(fr, &(*top)->idef); + + if (ir->ePBC != epbcNONE && !fr->bMolPBC) + { + *graph = mk_graph(fplog, &((*top)->idef), 0, top_global->natoms, FALSE, FALSE); 
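+            /* The graph stores the molecular connectivity that is used to
+             * keep molecules whole across periodic boundaries */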
+ } + else + { + *graph = NULL; + } + + if (PARTDECOMP(cr)) + { + pd_at_range(cr, &start, &homenr); + homenr -= start; + } + else + { + start = 0; + homenr = top_global->natoms; + } + atoms2md(top_global, ir, 0, NULL, start, homenr, mdatoms); + update_mdatoms(mdatoms, state_global->lambda[efptFEP]); + + if (vsite) + { + set_vsite_top(vsite, *top, mdatoms, cr); + } + } + + if (constr) + { + if (ir->eConstrAlg == econtSHAKE && + gmx_mtop_ftype_count(top_global, F_CONSTR) > 0) + { + gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n", + econstr_names[econtSHAKE], econstr_names[econtLINCS]); + } + + if (!DOMAINDECOMP(cr)) + { + set_constraints(constr, *top, ir, mdatoms, cr); + } + + if (!ir->bContinuation) + { + /* Constrain the starting coordinates */ + dvdl_constr = 0; + constrain(PAR(cr) ? NULL : fplog, TRUE, TRUE, constr, &(*top)->idef, + ir, NULL, cr, -1, 0, mdatoms, + ems->s.x, ems->s.x, NULL, fr->bMolPBC, ems->s.box, + ems->s.lambda[efptFEP], &dvdl_constr, + NULL, NULL, nrnb, econqCoord, FALSE, 0, 0); + } + } + + if (PAR(cr)) + { + *gstat = global_stat_init(ir); + } + + *outf = init_mdoutf(nfile, fnm, 0, cr, ir, NULL); + + snew(*enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, + *enerd); + + if (mdebin != NULL) + { + /* Init bin for energy stuff */ + *mdebin = init_mdebin((*outf)->fp_ene, top_global, ir, NULL); + } + + clear_rvec(mu_tot); + calc_shifts(ems->s.box, fr->shift_vec); +} + +static void finish_em(t_commrec *cr, gmx_mdoutf_t *outf, + gmx_runtime_t *runtime, gmx_wallcycle_t wcycle) +{ + if (!(cr->duty & DUTY_PME)) + { + /* Tell the PME only node to finish */ + gmx_pme_send_finish(cr); + } + + done_mdoutf(outf); + + em_time_end(runtime, wcycle); +} + +static void swap_em_state(em_state_t *ems1, em_state_t *ems2) +{ + em_state_t tmp; + + tmp = *ems1; + *ems1 = *ems2; + *ems2 = tmp; +} + +static void copy_em_coords(em_state_t *ems, t_state *state) +{ + int i; + + for (i = 0; (i < state->natoms); i++) + { + copy_rvec(ems->s.x[i], state->x[i]); + } +} + +static void write_em_traj(FILE *fplog, t_commrec *cr, + gmx_mdoutf_t *outf, + gmx_bool bX, gmx_bool bF, const char *confout, + gmx_mtop_t *top_global, + t_inputrec *ir, gmx_large_int_t step, + em_state_t *state, + t_state *state_global, rvec *f_global) +{ + int mdof_flags; + + if ((bX || bF || confout != NULL) && !DOMAINDECOMP(cr)) + { + copy_em_coords(state, state_global); + f_global = state->f; + } + + mdof_flags = 0; + if (bX) + { + mdof_flags |= MDOF_X; + } + if (bF) + { + mdof_flags |= MDOF_F; + } + write_traj(fplog, cr, outf, mdof_flags, + top_global, step, (double)step, + &state->s, state_global, state->f, f_global, NULL, NULL); + + if (confout != NULL && MASTER(cr)) + { + if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr)) + { + /* Make molecules whole only for confout writing */ + do_pbc_mtop(fplog, ir->ePBC, state_global->box, top_global, + state_global->x); + } + + write_sto_conf_mtop(confout, + *top_global->name, top_global, + state_global->x, NULL, ir->ePBC, state_global->box); + } +} + +static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md, + gmx_bool bMolPBC, + em_state_t *ems1, real a, rvec *f, em_state_t *ems2, + gmx_constr_t constr, gmx_localtop_t *top, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_large_int_t count) + +{ + t_state *s1, *s2; + int i; + int start, end; + rvec *x1, *x2; + real dvdl_constr; + + s1 = &ems1->s; + s2 = &ems2->s; + + if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count) + { + gmx_incons("state 
mismatch in do_em_step"); + } + + s2->flags = s1->flags; + + if (s2->nalloc != s1->nalloc) + { + s2->nalloc = s1->nalloc; + srenew(s2->x, s1->nalloc); + srenew(ems2->f, s1->nalloc); + if (s2->flags & (1<cg_p, s1->nalloc); + } + } + + s2->natoms = s1->natoms; + copy_mat(s1->box, s2->box); + /* Copy free energy state */ + for (i = 0; i < efptNR; i++) + { + s2->lambda[i] = s1->lambda[i]; + } + copy_mat(s1->box, s2->box); + + start = md->start; + end = md->start + md->homenr; + + x1 = s1->x; + x2 = s2->x; + +#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate)) + { + int gf, i, m; + + gf = 0; +#pragma omp for schedule(static) nowait + for (i = start; i < end; i++) + { + if (md->cFREEZE) + { + gf = md->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (ir->opts.nFreeze[gf][m]) + { + x2[i][m] = x1[i][m]; + } + else + { + x2[i][m] = x1[i][m] + a*f[i][m]; + } + } + } + + if (s2->flags & (1<cg_p; + x2 = s2->cg_p; +#pragma omp for schedule(static) nowait + for (i = start; i < end; i++) + { + copy_rvec(x1[i], x2[i]); + } + } + + if (DOMAINDECOMP(cr)) + { + s2->ddp_count = s1->ddp_count; + if (s2->cg_gl_nalloc < s1->cg_gl_nalloc) + { +#pragma omp barrier + s2->cg_gl_nalloc = s1->cg_gl_nalloc; + srenew(s2->cg_gl, s2->cg_gl_nalloc); +#pragma omp barrier + } + s2->ncg_gl = s1->ncg_gl; +#pragma omp for schedule(static) nowait + for (i = 0; i < s2->ncg_gl; i++) + { + s2->cg_gl[i] = s1->cg_gl[i]; + } + s2->ddp_count_cg_gl = s1->ddp_count_cg_gl; + } + } + + if (constr) + { + wallcycle_start(wcycle, ewcCONSTR); + dvdl_constr = 0; + constrain(NULL, TRUE, TRUE, constr, &top->idef, + ir, NULL, cr, count, 0, md, + s1->x, s2->x, NULL, bMolPBC, s2->box, + s2->lambda[efptBONDED], &dvdl_constr, + NULL, NULL, nrnb, econqCoord, FALSE, 0, 0); + wallcycle_stop(wcycle, ewcCONSTR); + } +} + +static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr, + gmx_mtop_t *top_global, t_inputrec *ir, + em_state_t *ems, gmx_localtop_t *top, + t_mdatoms *mdatoms, t_forcerec *fr, + gmx_vsite_t *vsite, gmx_constr_t constr, + t_nrnb *nrnb, gmx_wallcycle_t wcycle) +{ + /* Repartition the domain decomposition */ + wallcycle_start(wcycle, ewcDOMDEC); + dd_partition_system(fplog, step, cr, FALSE, 1, + NULL, top_global, ir, + &ems->s, &ems->f, + mdatoms, top, fr, vsite, NULL, constr, + nrnb, wcycle, FALSE); + dd_store_state(cr->dd, &ems->s); + wallcycle_stop(wcycle, ewcDOMDEC); +} + +static void evaluate_energy(FILE *fplog, t_commrec *cr, + gmx_mtop_t *top_global, + em_state_t *ems, gmx_localtop_t *top, + t_inputrec *inputrec, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_global_stat_t gstat, + gmx_vsite_t *vsite, gmx_constr_t constr, + t_fcdata *fcd, + t_graph *graph, t_mdatoms *mdatoms, + t_forcerec *fr, rvec mu_tot, + gmx_enerdata_t *enerd, tensor vir, tensor pres, + gmx_large_int_t count, gmx_bool bFirst) +{ + real t; + gmx_bool bNS; + int nabnsb; + tensor force_vir, shake_vir, ekin; + real dvdl_constr, prescorr, enercorr, dvdlcorr; + real terminate = 0; + + /* Set the time to the initial time, the time does not change during EM */ + t = inputrec->init_t; + + if (bFirst || + (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count)) + { + /* This the first state or an old state used before the last ns */ + bNS = TRUE; + } + else + { + bNS = FALSE; + if (inputrec->nstlist > 0) + { + bNS = TRUE; + } + else if (inputrec->nstlist == -1) + { + nabnsb = natoms_beyond_ns_buffer(inputrec, fr, &top->cgs, NULL, ems->s.x); + if (PAR(cr)) + { + gmx_sumi(1, &nabnsb, cr); + } + bNS = (nabnsb > 0); + } + } + + if (vsite) + 
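+    /* Virtual sites are not integrated; rebuild their positions from
+     * the current coordinates before the force evaluation */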
{ + construct_vsites(vsite, ems->s.x, 1, NULL, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, graph, cr, ems->s.box); + } + + if (DOMAINDECOMP(cr)) + { + if (bNS) + { + /* Repartition the domain decomposition */ + em_dd_partition_system(fplog, count, cr, top_global, inputrec, + ems, top, mdatoms, fr, vsite, constr, + nrnb, wcycle); + } + } + + /* Calc force & energy on new trial position */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole in congrad.c + */ + do_force(fplog, cr, inputrec, + count, nrnb, wcycle, top, &top_global->groups, + ems->s.box, ems->s.x, &ems->s.hist, + ems->f, force_vir, mdatoms, enerd, fcd, + ems->s.lambda, graph, fr, vsite, mu_tot, t, NULL, NULL, TRUE, + GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | + GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | + (bNS ? GMX_FORCE_NS | GMX_FORCE_DO_LR : 0)); + + /* Clear the unused shake virial and pressure */ + clear_mat(shake_vir); + clear_mat(pres); + + /* Communicate stuff when parallel */ + if (PAR(cr) && inputrec->eI != eiNM) + { + wallcycle_start(wcycle, ewcMoveE); + + global_stat(fplog, gstat, cr, enerd, force_vir, shake_vir, mu_tot, + inputrec, NULL, NULL, NULL, 1, &terminate, + top_global, &ems->s, FALSE, + CGLO_ENERGY | + CGLO_PRESSURE | + CGLO_CONSTRAINT | + CGLO_FIRSTITERATE); + + wallcycle_stop(wcycle, ewcMoveE); + } + + /* Calculate long range corrections to pressure and energy */ + calc_dispcorr(fplog, inputrec, fr, count, top_global->natoms, ems->s.box, ems->s.lambda[efptVDW], + pres, force_vir, &prescorr, &enercorr, &dvdlcorr); + enerd->term[F_DISPCORR] = enercorr; + enerd->term[F_EPOT] += enercorr; + enerd->term[F_PRES] += prescorr; + enerd->term[F_DVDL] += dvdlcorr; + + ems->epot = enerd->term[F_EPOT]; + + if (constr) + { + /* Project out the constraint components of the force */ + wallcycle_start(wcycle, ewcCONSTR); + dvdl_constr = 0; + constrain(NULL, FALSE, FALSE, constr, &top->idef, + inputrec, NULL, cr, count, 0, mdatoms, + ems->s.x, ems->f, ems->f, fr->bMolPBC, ems->s.box, + ems->s.lambda[efptBONDED], &dvdl_constr, + NULL, &shake_vir, nrnb, econqForceDispl, FALSE, 0, 0); + if (fr->bSepDVDL && fplog) + { + gmx_print_sepdvdl(fplog, "Constraints", t, dvdl_constr); + } + enerd->term[F_DVDL_CONSTR] += dvdl_constr; + m_add(force_vir, shake_vir, vir); + wallcycle_stop(wcycle, ewcCONSTR); + } + else + { + copy_mat(force_vir, vir); + } + + clear_mat(ekin); + enerd->term[F_PRES] = + calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres); + + sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals); + + if (EI_ENERGY_MINIMIZATION(inputrec->eI)) + { + get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, ems); + } +} + +static double reorder_partsum(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, + gmx_mtop_t *mtop, + em_state_t *s_min, em_state_t *s_b) +{ + rvec *fm, *fb, *fmg; + t_block *cgs_gl; + int ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m; + double partsum; + unsigned char *grpnrFREEZE; + + if (debug) + { + fprintf(debug, "Doing reorder_partsum\n"); + } + + fm = s_min->f; + fb = s_b->f; + + cgs_gl = dd_charge_groups_global(cr->dd); + index = cgs_gl->index; + + /* Collect fm in a global vector fmg. + * This conflicts with the spirit of domain decomposition, + * but to fully optimize this a much more complicated algorithm is required. 
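+     * Gathering the s_min forces into a global array allows the
+     * Polak-Ribiere sum to be formed even though the local atom order
+     * in s_b differs from that in s_min.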
+ */ + snew(fmg, mtop->natoms); + + ncg = s_min->s.ncg_gl; + cg_gl = s_min->s.cg_gl; + i = 0; + for (c = 0; c < ncg; c++) + { + cg = cg_gl[c]; + a0 = index[cg]; + a1 = index[cg+1]; + for (a = a0; a < a1; a++) + { + copy_rvec(fm[i], fmg[a]); + i++; + } + } + gmx_sum(mtop->natoms*3, fmg[0], cr); + + /* Now we will determine the part of the sum for the cgs in state s_b */ + ncg = s_b->s.ncg_gl; + cg_gl = s_b->s.cg_gl; + partsum = 0; + i = 0; + gf = 0; + grpnrFREEZE = mtop->groups.grpnr[egcFREEZE]; + for (c = 0; c < ncg; c++) + { + cg = cg_gl[c]; + a0 = index[cg]; + a1 = index[cg+1]; + for (a = a0; a < a1; a++) + { + if (mdatoms->cFREEZE && grpnrFREEZE) + { + gf = grpnrFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + partsum += (fb[i][m] - fmg[a][m])*fb[i][m]; + } + } + i++; + } + } + + sfree(fmg); + + return partsum; +} + +static real pr_beta(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, + gmx_mtop_t *mtop, + em_state_t *s_min, em_state_t *s_b) +{ + rvec *fm, *fb; + double sum; + int gf, i, m; + + /* This is just the classical Polak-Ribiere calculation of beta; + * it looks a bit complicated since we take freeze groups into account, + * and might have to sum it in parallel runs. + */ + + if (!DOMAINDECOMP(cr) || + (s_min->s.ddp_count == cr->dd->ddp_count && + s_b->s.ddp_count == cr->dd->ddp_count)) + { + fm = s_min->f; + fb = s_b->f; + sum = 0; + gf = 0; + /* This part of code can be incorrect with DD, + * since the atom ordering in s_b and s_min might differ. + */ + for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + sum += (fb[i][m] - fm[i][m])*fb[i][m]; + } + } + } + } + else + { + /* We need to reorder cgs while summing */ + sum = reorder_partsum(cr, opts, mdatoms, mtop, s_min, s_b); + } + if (PAR(cr)) + { + gmx_sumd(1, &sum, cr); + } + + return sum/sqr(s_min->fnorm); +} + +double do_cg(FILE *fplog, t_commrec *cr, + int nfile, const t_filenm fnm[], + const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, + int gmx_unused nstglobalcomm, + gmx_vsite_t *vsite, gmx_constr_t constr, + int gmx_unused stepout, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + t_mdatoms *mdatoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_edsam_t gmx_unused ed, + t_forcerec *fr, + int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, + gmx_membed_t gmx_unused membed, + real gmx_unused cpt_period, real gmx_unused max_hours, + const char gmx_unused *deviceOptions, + unsigned long gmx_unused Flags, + gmx_runtime_t *runtime) +{ + const char *CG = "Polak-Ribiere Conjugate Gradients"; + + em_state_t *s_min, *s_a, *s_b, *s_c; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + rvec *f; + gmx_global_stat_t gstat; + t_graph *graph; + rvec *f_global, *p, *sf, *sfm; + double gpa, gpb, gpc, tmp, sum[2], minstep; + real fnormn; + real stepsize; + real a, b, c, beta = 0.0; + real epot_repl = 0; + real pnorm; + t_mdebin *mdebin; + gmx_bool converged, foundlower; + rvec mu_tot; + gmx_bool do_log = FALSE, do_ene = FALSE, do_x, do_f; + tensor vir, pres; + int number_steps, neval = 0, nstcg = inputrec->nstcgsteep; + gmx_mdoutf_t *outf; + int i, m, gf, step, nminstep; + real terminate = 0; + + step = 0; + + s_min = init_em_state(); + s_a = init_em_state(); + s_b = init_em_state(); + s_c = init_em_state(); + + /* Init em and store the local state in 
s_min */ + init_em(fplog, CG, cr, inputrec, + state_global, top_global, s_min, &top, &f, &f_global, + nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, + nfile, fnm, &outf, &mdebin); + + /* Print to log file */ + print_em_start(fplog, cr, runtime, wcycle, CG); + + /* Max number of steps */ + number_steps = inputrec->nsteps; + + if (MASTER(cr)) + { + sp_header(stderr, CG, inputrec->em_tol, number_steps); + } + if (fplog) + { + sp_header(fplog, CG, inputrec->em_tol, number_steps); + } + + /* Call the force routine and some auxiliary (neighboursearching etc.) */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole in congrad.c + */ + evaluate_energy(fplog, cr, + top_global, s_min, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdatoms, fr, + mu_tot, enerd, vir, pres, -1, TRUE); + where(); + + if (MASTER(cr)) + { + /* Copy stuff to the energy bin for easy printing etc. */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, + NULL, NULL, vir, pres, NULL, mu_tot, constr); + + print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); + print_ebin(outf->fp_ene, TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, + TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); + } + where(); + + /* Estimate/guess the initial stepsize */ + stepsize = inputrec->em_stepsize/s_min->fnorm; + + if (MASTER(cr)) + { + fprintf(stderr, " F-max = %12.5e on atom %d\n", + s_min->fmax, s_min->a_fmax+1); + fprintf(stderr, " F-Norm = %12.5e\n", + s_min->fnorm/sqrt(state_global->natoms)); + fprintf(stderr, "\n"); + /* and copy to the log file too... */ + fprintf(fplog, " F-max = %12.5e on atom %d\n", + s_min->fmax, s_min->a_fmax+1); + fprintf(fplog, " F-Norm = %12.5e\n", + s_min->fnorm/sqrt(state_global->natoms)); + fprintf(fplog, "\n"); + } + /* Start the loop over CG steps. + * Each successful step is counted, and we continue until + * we either converge or reach the max number of steps. + */ + converged = FALSE; + for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++) + { + + /* start taking steps in a new direction + * First time we enter the routine, beta=0, and the direction is + * simply the negative gradient. + */ + + /* Calculate the new direction in p, and the gradient in this direction, gpa */ + p = s_min->s.cg_p; + sf = s_min->f; + gpa = 0; + gf = 0; + for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (!inputrec->opts.nFreeze[gf][m]) + { + p[i][m] = sf[i][m] + beta*p[i][m]; + gpa -= p[i][m]*sf[i][m]; + /* f is negative gradient, thus the sign */ + } + else + { + p[i][m] = 0; + } + } + } + + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpa, cr); + } + + /* Calculate the norm of the search vector */ + get_f_norm_max(cr, &(inputrec->opts), mdatoms, p, &pnorm, NULL, NULL); + + /* Just in case stepsize reaches zero due to numerical precision... */ + if (stepsize <= 0) + { + stepsize = inputrec->em_stepsize/pnorm; + } + + /* + * Double check the value of the derivative in the search direction. + * If it is positive it must be due to the old information in the + * CG formula, so just remove that and start over with beta=0. + * This corresponds to a steepest descent step. 
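+ * (gpa was accumulated above as -sum_i p[i].f[i]; since f is the negative
+ * gradient, gpa is the derivative of the energy along p, and gpa > 0 thus
+ * means the current search direction points uphill.)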
+ */ + if (gpa > 0) + { + beta = 0; + step--; /* Don't count this step since we are restarting */ + continue; /* Go back to the beginning of the big for-loop */ + } + + /* Calculate minimum allowed stepsize, before the average (norm) + * relative change in coordinate is smaller than precision + */ + minstep = 0; + for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + tmp = fabs(s_min->s.x[i][m]); + if (tmp < 1.0) + { + tmp = 1.0; + } + tmp = p[i][m]/tmp; + minstep += tmp*tmp; + } + } + /* Add up from all CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &minstep, cr); + } + + minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms)); + + if (stepsize < minstep) + { + converged = TRUE; + break; + } + + /* Write coordinates if necessary */ + do_x = do_per_step(step, inputrec->nstxout); + do_f = do_per_step(step, inputrec->nstfout); + + write_em_traj(fplog, cr, outf, do_x, do_f, NULL, + top_global, inputrec, step, + s_min, state_global, f_global); + + /* Take a step downhill. + * In theory, we should minimize the function along this direction. + * That is quite possible, but it turns out to take 5-10 function evaluations + * for each line. However, we dont really need to find the exact minimum - + * it is much better to start a new CG step in a modified direction as soon + * as we are close to it. This will save a lot of energy evaluations. + * + * In practice, we just try to take a single step. + * If it worked (i.e. lowered the energy), we increase the stepsize but + * the continue straight to the next CG step without trying to find any minimum. + * If it didn't work (higher energy), there must be a minimum somewhere between + * the old position and the new one. + * + * Due to the finite numerical accuracy, it turns out that it is a good idea + * to even accept a SMALL increase in energy, if the derivative is still downhill. + * This leads to lower final energies in the tests I've done. / Erik + */ + s_a->epot = s_min->epot; + a = 0.0; + c = a + stepsize; /* reference position along line is zero */ + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count) + { + em_dd_partition_system(fplog, step, cr, top_global, inputrec, + s_min, top, mdatoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Take a trial step (new coords in s_c) */ + do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, c, s_min->s.cg_p, s_c, + constr, top, nrnb, wcycle, -1); + + neval++; + /* Calculate energy for the trial step */ + evaluate_energy(fplog, cr, + top_global, s_c, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdatoms, fr, + mu_tot, enerd, vir, pres, -1, FALSE); + + /* Calc derivative along line */ + p = s_c->s.cg_p; + sf = s_c->f; + gpc = 0; + for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + gpc -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */ + } + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpc, cr); + } + + /* This is the max amount of increase in energy we tolerate */ + tmp = sqrt(GMX_REAL_EPS)*fabs(s_a->epot); + + /* Accept the step if the energy is lower, or if it is not significantly higher + * and the line derivative is still negative. + */ + if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp))) + { + foundlower = TRUE; + /* Great, we found a better energy. 
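+ * The step-scaling factors 1.618034 and 0.618034 used just below are the
+ * golden ratio (1+sqrt(5))/2 and its inverse.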
Increase step for next iteration + * if we are still going down, decrease it otherwise + */ + if (gpc < 0) + { + stepsize *= 1.618034; /* The golden section */ + } + else + { + stepsize *= 0.618034; /* 1/golden section */ + } + } + else + { + /* New energy is the same or higher. We will have to do some work + * to find a smaller value in the interval. Take smaller step next time! + */ + foundlower = FALSE; + stepsize *= 0.618034; + } + + + + + /* OK, if we didn't find a lower value we will have to locate one now - there must + * be one in the interval [a=0,c]. + * The same thing is valid here, though: Don't spend dozens of iterations to find + * the line minimum. We try to interpolate based on the derivative at the endpoints, + * and only continue until we find a lower value. In most cases this means 1-2 iterations. + * + * I also have a safeguard for potentially really patological functions so we never + * take more than 20 steps before we give up ... + * + * If we already found a lower value we just skip this step and continue to the update. + */ + if (!foundlower) + { + nminstep = 0; + + do + { + /* Select a new trial point. + * If the derivatives at points a & c have different sign we interpolate to zero, + * otherwise just do a bisection. + */ + if (gpa < 0 && gpc > 0) + { + b = a + gpa*(a-c)/(gpc-gpa); + } + else + { + b = 0.5*(a+c); + } + + /* safeguard if interpolation close to machine accuracy causes errors: + * never go outside the interval + */ + if (b <= a || b >= c) + { + b = 0.5*(a+c); + } + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) + { + /* Reload the old state */ + em_dd_partition_system(fplog, -1, cr, top_global, inputrec, + s_min, top, mdatoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Take a trial step to this new point - new coords in s_b */ + do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, b, s_min->s.cg_p, s_b, + constr, top, nrnb, wcycle, -1); + + neval++; + /* Calculate energy for the trial step */ + evaluate_energy(fplog, cr, + top_global, s_b, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdatoms, fr, + mu_tot, enerd, vir, pres, -1, FALSE); + + /* p does not change within a step, but since the domain decomposition + * might change, we have to use cg_p of s_b here. + */ + p = s_b->s.cg_p; + sf = s_b->f; + gpb = 0; + for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + gpb -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */ + } + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpb, cr); + } + + if (debug) + { + fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n", + s_a->epot, s_b->epot, s_c->epot, gpb); + } + + epot_repl = s_b->epot; + + /* Keep one of the intervals based on the value of the derivative at the new point */ + if (gpb > 0) + { + /* Replace c endpoint with b */ + swap_em_state(s_b, s_c); + c = b; + gpc = gpb; + } + else + { + /* Replace a endpoint with b */ + swap_em_state(s_b, s_a); + a = b; + gpa = gpb; + } + + /* + * Stop search as soon as we find a value smaller than the endpoints. + * Never run more than 20 steps, no matter what. + */ + nminstep++; + } + while ((epot_repl > s_a->epot || epot_repl > s_c->epot) && + (nminstep < 20)); + + if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS || + nminstep >= 20) + { + /* OK. We couldn't find a significantly lower energy. + * If beta==0 this was steepest descent, and then we give up. 
+ * If not, set beta=0 and restart with steepest descent before quitting. + */ + if (beta == 0.0) + { + /* Converged */ + converged = TRUE; + break; + } + else + { + /* Reset memory before giving up */ + beta = 0.0; + continue; + } + } + + /* Select min energy state of A & C, put the best in B. + */ + if (s_c->epot < s_a->epot) + { + if (debug) + { + fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n", + s_c->epot, s_a->epot); + } + swap_em_state(s_b, s_c); + gpb = gpc; + b = c; + } + else + { + if (debug) + { + fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n", + s_a->epot, s_c->epot); + } + swap_em_state(s_b, s_a); + gpb = gpa; + b = a; + } + + } + else + { + if (debug) + { + fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n", + s_c->epot); + } + swap_em_state(s_b, s_c); + gpb = gpc; + b = c; + } + + /* new search direction */ + /* beta = 0 means forget all memory and restart with steepest descents. */ + if (nstcg && ((step % nstcg) == 0)) + { + beta = 0.0; + } + else + { + /* s_min->fnorm cannot be zero, because then we would have converged + * and broken out. + */ + + /* Polak-Ribiere update. + * Change to fnorm2/fnorm2_old for Fletcher-Reeves + */ + beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b); + } + /* Limit beta to prevent oscillations */ + if (fabs(beta) > 5.0) + { + beta = 0.0; + } + + + /* update positions */ + swap_em_state(s_min, s_b); + gpa = gpb; + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (bVerbose) + { + fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", + step, s_min->epot, s_min->fnorm/sqrt(state_global->natoms), + s_min->fmax, s_min->a_fmax+1); + } + /* Store the new (lower) energies */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, + NULL, NULL, vir, pres, NULL, mu_tot, constr); + + do_log = do_per_step(step, inputrec->nstlog); + do_ene = do_per_step(step, inputrec->nstenergy); + if (do_log) + { + print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); + } + print_ebin(outf->fp_ene, do_ene, FALSE, FALSE, + do_log ? fplog : NULL, step, step, eprNORMAL, + TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); + } + + /* Stop when the maximum force lies below tolerance. + * If we have reached machine precision, converged is already set to true. + */ + converged = converged || (s_min->fmax < inputrec->em_tol); + + } /* End of the loop */ + + if (converged) + { + step--; /* we never took that last step in this case */ + + } + if (s_min->fmax > inputrec->em_tol) + { + if (MASTER(cr)) + { + warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE); + warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE); + } + converged = FALSE; + } + + if (MASTER(cr)) + { + /* If we printed energy and/or logfile last step (which was the last step) + * we don't have to do it again, but otherwise print the final values. + */ + if (!do_log) + { + /* Write final value to log since we didn't do anything the last step */ + print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]); + } + if (!do_ene || !do_log) + { + /* Write final energy file entries */ + print_ebin(outf->fp_ene, !do_ene, FALSE, FALSE, + !do_log ? fplog : NULL, step, step, eprNORMAL, + TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); + } + } + + /* Print some stuff... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + + /* IMPORTANT! 
+ * For accurate normal mode calculation it is imperative that we + * store the last conformation into the full precision binary trajectory. + * + * However, we should only do it if we did NOT already write this step + * above (which we did if do_x or do_f was true). + */ + do_x = !do_per_step(step, inputrec->nstxout); + do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout)); + + write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, step, + s_min, state_global, f_global); + + fnormn = s_min->fnorm/sqrt(state_global->natoms); + + if (MASTER(cr)) + { + print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps, + s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); + print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps, + s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); + + fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); + } + + finish_em(cr, outf, runtime, wcycle); + + /* To print the actual number of steps we needed somewhere */ + runtime->nsteps_done = step; + + return 0; +} /* That's all folks */ + + +double do_lbfgs(FILE *fplog, t_commrec *cr, + int nfile, const t_filenm fnm[], + const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, + int gmx_unused nstglobalcomm, + gmx_vsite_t *vsite, gmx_constr_t constr, + int gmx_unused stepout, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state, + t_mdatoms *mdatoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_edsam_t gmx_unused ed, + t_forcerec *fr, + int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, + gmx_membed_t gmx_unused membed, + real gmx_unused cpt_period, real gmx_unused max_hours, + const char gmx_unused *deviceOptions, + unsigned long gmx_unused Flags, + gmx_runtime_t *runtime) +{ + static const char *LBFGS = "Low-Memory BFGS Minimizer"; + em_state_t ems; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + rvec *f; + gmx_global_stat_t gstat; + t_graph *graph; + rvec *f_global; + int ncorr, nmaxcorr, point, cp, neval, nminstep; + double stepsize, gpa, gpb, gpc, tmp, minstep; + real *rho, *alpha, *ff, *xx, *p, *s, *lastx, *lastf, **dx, **dg; + real *xa, *xb, *xc, *fa, *fb, *fc, *xtmp, *ftmp; + real a, b, c, maxdelta, delta; + real diag, Epot0, Epot, EpotA, EpotB, EpotC; + real dgdx, dgdg, sq, yr, beta; + t_mdebin *mdebin; + gmx_bool converged, first; + rvec mu_tot; + real fnorm, fmax; + gmx_bool do_log, do_ene, do_x, do_f, foundlower, *frozen; + tensor vir, pres; + int start, end, number_steps; + gmx_mdoutf_t *outf; + int i, k, m, n, nfmax, gf, step; + int mdof_flags; + /* not used */ + real terminate; + + if (PAR(cr)) + { + gmx_fatal(FARGS, "Cannot do parallel L-BFGS Minimization - yet.\n"); + } + + if (NULL != constr) + { + gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. steepest descent)."); + } + + n = 3*state->natoms; + nmaxcorr = inputrec->nbfgscorr; + + /* Allocate memory */ + /* Use pointers to real so we dont have to loop over both atoms and + * dimensions all the time... + * x/f are allocated as rvec *, so make new x0/f0 pointers-to-real + * that point to the same memory. 
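+ * (an rvec is simply real[3], so with n = 3*natoms the flat index 3*i+m
+ * in xx/ff refers to the same element as state->x[i][m] / f[i][m]).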
+ */ + snew(xa, n); + snew(xb, n); + snew(xc, n); + snew(fa, n); + snew(fb, n); + snew(fc, n); + snew(frozen, n); + + snew(p, n); + snew(lastx, n); + snew(lastf, n); + snew(rho, nmaxcorr); + snew(alpha, nmaxcorr); + + snew(dx, nmaxcorr); + for (i = 0; i < nmaxcorr; i++) + { + snew(dx[i], n); + } + + snew(dg, nmaxcorr); + for (i = 0; i < nmaxcorr; i++) + { + snew(dg[i], n); + } + + step = 0; + neval = 0; + + /* Init em */ + init_em(fplog, LBFGS, cr, inputrec, + state, top_global, &ems, &top, &f, &f_global, + nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, + nfile, fnm, &outf, &mdebin); + /* Do_lbfgs is not completely updated like do_steep and do_cg, + * so we free some memory again. + */ + sfree(ems.s.x); + sfree(ems.f); + + xx = (real *)state->x; + ff = (real *)f; + + start = mdatoms->start; + end = mdatoms->homenr + start; + + /* Print to log file */ + print_em_start(fplog, cr, runtime, wcycle, LBFGS); + + do_log = do_ene = do_x = do_f = TRUE; + + /* Max number of steps */ + number_steps = inputrec->nsteps; + + /* Create a 3*natoms index to tell whether each degree of freedom is frozen */ + gf = 0; + for (i = start; i < end; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + frozen[3*i+m] = inputrec->opts.nFreeze[gf][m]; + } + } + if (MASTER(cr)) + { + sp_header(stderr, LBFGS, inputrec->em_tol, number_steps); + } + if (fplog) + { + sp_header(fplog, LBFGS, inputrec->em_tol, number_steps); + } + + if (vsite) + { + construct_vsites(vsite, state->x, 1, NULL, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, graph, cr, state->box); + } + + /* Call the force routine and some auxiliary (neighboursearching etc.) */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole + */ + neval++; + ems.s.x = state->x; + ems.f = f; + evaluate_energy(fplog, cr, + top_global, &ems, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdatoms, fr, + mu_tot, enerd, vir, pres, -1, TRUE); + where(); + + if (MASTER(cr)) + { + /* Copy stuff to the energy bin for easy printing etc. */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box, + NULL, NULL, vir, pres, NULL, mu_tot, constr); + + print_ebin_header(fplog, step, step, state->lambda[efptFEP]); + print_ebin(outf->fp_ene, TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, + TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); + } + where(); + + /* This is the starting energy */ + Epot = enerd->term[F_EPOT]; + + fnorm = ems.fnorm; + fmax = ems.fmax; + nfmax = ems.a_fmax; + + /* Set the initial step. + * since it will be multiplied by the non-normalized search direction + * vector (force vector the first time), we scale it by the + * norm of the force. + */ + + if (MASTER(cr)) + { + fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr); + fprintf(stderr, " F-max = %12.5e on atom %d\n", fmax, nfmax+1); + fprintf(stderr, " F-Norm = %12.5e\n", fnorm/sqrt(state->natoms)); + fprintf(stderr, "\n"); + /* and copy to the log file too... 
*/ + fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr); + fprintf(fplog, " F-max = %12.5e on atom %d\n", fmax, nfmax+1); + fprintf(fplog, " F-Norm = %12.5e\n", fnorm/sqrt(state->natoms)); + fprintf(fplog, "\n"); + } + + point = 0; + for (i = 0; i < n; i++) + { + if (!frozen[i]) + { + dx[point][i] = ff[i]; /* Initial search direction */ + } + else + { + dx[point][i] = 0; + } + } + + stepsize = 1.0/fnorm; + converged = FALSE; + + /* Start the loop over BFGS steps. + * Each successful step is counted, and we continue until + * we either converge or reach the max number of steps. + */ + + ncorr = 0; + + /* Set the gradient from the force */ + converged = FALSE; + for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++) + { + + /* Write coordinates if necessary */ + do_x = do_per_step(step, inputrec->nstxout); + do_f = do_per_step(step, inputrec->nstfout); + + mdof_flags = 0; + if (do_x) + { + mdof_flags |= MDOF_X; + } + + if (do_f) + { + mdof_flags |= MDOF_F; + } + + write_traj(fplog, cr, outf, mdof_flags, + top_global, step, (real)step, state, state, f, f, NULL, NULL); + + /* Do the linesearching in the direction dx[point][0..(n-1)] */ + + /* pointer to current direction - point=0 first time here */ + s = dx[point]; + + /* calculate line gradient */ + for (gpa = 0, i = 0; i < n; i++) + { + gpa -= s[i]*ff[i]; + } + + /* Calculate minimum allowed stepsize, before the average (norm) + * relative change in coordinate is smaller than precision + */ + for (minstep = 0, i = 0; i < n; i++) + { + tmp = fabs(xx[i]); + if (tmp < 1.0) + { + tmp = 1.0; + } + tmp = s[i]/tmp; + minstep += tmp*tmp; + } + minstep = GMX_REAL_EPS/sqrt(minstep/n); + + if (stepsize < minstep) + { + converged = TRUE; + break; + } + + /* Store old forces and coordinates */ + for (i = 0; i < n; i++) + { + lastx[i] = xx[i]; + lastf[i] = ff[i]; + } + Epot0 = Epot; + + first = TRUE; + + for (i = 0; i < n; i++) + { + xa[i] = xx[i]; + } + + /* Take a step downhill. + * In theory, we should minimize the function along this direction. + * That is quite possible, but it turns out to take 5-10 function evaluations + * for each line. However, we dont really need to find the exact minimum - + * it is much better to start a new BFGS step in a modified direction as soon + * as we are close to it. This will save a lot of energy evaluations. + * + * In practice, we just try to take a single step. + * If it worked (i.e. lowered the energy), we increase the stepsize but + * the continue straight to the next BFGS step without trying to find any minimum. + * If it didn't work (higher energy), there must be a minimum somewhere between + * the old position and the new one. + * + * Due to the finite numerical accuracy, it turns out that it is a good idea + * to even accept a SMALL increase in energy, if the derivative is still downhill. + * This leads to lower final energies in the tests I've done. / Erik + */ + foundlower = FALSE; + EpotA = Epot0; + a = 0.0; + c = a + stepsize; /* reference position along line is zero */ + + /* Check stepsize first. We do not allow displacements + * larger than emstep. 
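+ * The do-loop below shrinks stepsize by a factor of 10 until the largest
+ * displacement component c*s[i] along the search direction no longer
+ * exceeds em_stepsize.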
+ */ + do + { + c = a + stepsize; + maxdelta = 0; + for (i = 0; i < n; i++) + { + delta = c*s[i]; + if (delta > maxdelta) + { + maxdelta = delta; + } + } + if (maxdelta > inputrec->em_stepsize) + { + stepsize *= 0.1; + } + } + while (maxdelta > inputrec->em_stepsize); + + /* Take a trial step */ + for (i = 0; i < n; i++) + { + xc[i] = lastx[i] + c*s[i]; + } + + neval++; + /* Calculate energy for the trial step */ + ems.s.x = (rvec *)xc; + ems.f = (rvec *)fc; + evaluate_energy(fplog, cr, + top_global, &ems, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdatoms, fr, + mu_tot, enerd, vir, pres, step, FALSE); + EpotC = ems.epot; + + /* Calc derivative along line */ + for (gpc = 0, i = 0; i < n; i++) + { + gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */ + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpc, cr); + } + + /* This is the max amount of increase in energy we tolerate */ + tmp = sqrt(GMX_REAL_EPS)*fabs(EpotA); + + /* Accept the step if the energy is lower, or if it is not significantly higher + * and the line derivative is still negative. + */ + if (EpotC < EpotA || (gpc < 0 && EpotC < (EpotA+tmp))) + { + foundlower = TRUE; + /* Great, we found a better energy. Increase step for next iteration + * if we are still going down, decrease it otherwise + */ + if (gpc < 0) + { + stepsize *= 1.618034; /* The golden section */ + } + else + { + stepsize *= 0.618034; /* 1/golden section */ + } + } + else + { + /* New energy is the same or higher. We will have to do some work + * to find a smaller value in the interval. Take smaller step next time! + */ + foundlower = FALSE; + stepsize *= 0.618034; + } + + /* OK, if we didn't find a lower value we will have to locate one now - there must + * be one in the interval [a=0,c]. + * The same thing is valid here, though: Don't spend dozens of iterations to find + * the line minimum. We try to interpolate based on the derivative at the endpoints, + * and only continue until we find a lower value. In most cases this means 1-2 iterations. + * + * I also have a safeguard for potentially really patological functions so we never + * take more than 20 steps before we give up ... + * + * If we already found a lower value we just skip this step and continue to the update. + */ + + if (!foundlower) + { + + nminstep = 0; + do + { + /* Select a new trial point. + * If the derivatives at points a & c have different sign we interpolate to zero, + * otherwise just do a bisection. 
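+ * The interpolation b = a + gpa*(a-c)/(gpc-gpa) is the zero of the straight
+ * line through (a,gpa) and (c,gpc), i.e. a secant step on the derivative.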
+ */ + + if (gpa < 0 && gpc > 0) + { + b = a + gpa*(a-c)/(gpc-gpa); + } + else + { + b = 0.5*(a+c); + } + + /* safeguard if interpolation close to machine accuracy causes errors: + * never go outside the interval + */ + if (b <= a || b >= c) + { + b = 0.5*(a+c); + } + + /* Take a trial step */ + for (i = 0; i < n; i++) + { + xb[i] = lastx[i] + b*s[i]; + } + + neval++; + /* Calculate energy for the trial step */ + ems.s.x = (rvec *)xb; + ems.f = (rvec *)fb; + evaluate_energy(fplog, cr, + top_global, &ems, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdatoms, fr, + mu_tot, enerd, vir, pres, step, FALSE); + EpotB = ems.epot; + + fnorm = ems.fnorm; + + for (gpb = 0, i = 0; i < n; i++) + { + gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */ + + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpb, cr); + } + + /* Keep one of the intervals based on the value of the derivative at the new point */ + if (gpb > 0) + { + /* Replace c endpoint with b */ + EpotC = EpotB; + c = b; + gpc = gpb; + /* swap coord pointers b/c */ + xtmp = xb; + ftmp = fb; + xb = xc; + fb = fc; + xc = xtmp; + fc = ftmp; + } + else + { + /* Replace a endpoint with b */ + EpotA = EpotB; + a = b; + gpa = gpb; + /* swap coord pointers a/b */ + xtmp = xb; + ftmp = fb; + xb = xa; + fb = fa; + xa = xtmp; + fa = ftmp; + } + + /* + * Stop search as soon as we find a value smaller than the endpoints, + * or if the tolerance is below machine precision. + * Never run more than 20 steps, no matter what. + */ + nminstep++; + } + while ((EpotB > EpotA || EpotB > EpotC) && (nminstep < 20)); + + if (fabs(EpotB-Epot0) < GMX_REAL_EPS || nminstep >= 20) + { + /* OK. We couldn't find a significantly lower energy. + * If ncorr==0 this was steepest descent, and then we give up. + * If not, reset memory to restart as steepest descent before quitting. + */ + if (ncorr == 0) + { + /* Converged */ + converged = TRUE; + break; + } + else + { + /* Reset memory */ + ncorr = 0; + /* Search in gradient direction */ + for (i = 0; i < n; i++) + { + dx[point][i] = ff[i]; + } + /* Reset stepsize */ + stepsize = 1.0/fnorm; + continue; + } + } + + /* Select min energy state of A & C, put the best in xx/ff/Epot + */ + if (EpotC < EpotA) + { + Epot = EpotC; + /* Use state C */ + for (i = 0; i < n; i++) + { + xx[i] = xc[i]; + ff[i] = fc[i]; + } + stepsize = c; + } + else + { + Epot = EpotA; + /* Use state A */ + for (i = 0; i < n; i++) + { + xx[i] = xa[i]; + ff[i] = fa[i]; + } + stepsize = a; + } + + } + else + { + /* found lower */ + Epot = EpotC; + /* Use state C */ + for (i = 0; i < n; i++) + { + xx[i] = xc[i]; + ff[i] = fc[i]; + } + stepsize = c; + } + + /* Update the memory information, and calculate a new + * approximation of the inverse hessian + */ + + /* Have new data in Epot, xx, ff */ + if (ncorr < nmaxcorr) + { + ncorr++; + } + + for (i = 0; i < n; i++) + { + dg[point][i] = lastf[i]-ff[i]; + dx[point][i] *= stepsize; + } + + dgdg = 0; + dgdx = 0; + for (i = 0; i < n; i++) + { + dgdg += dg[point][i]*dg[point][i]; + dgdx += dg[point][i]*dx[point][i]; + } + + diag = dgdx/dgdg; + + rho[point] = 1.0/dgdx; + point++; + + if (point >= nmaxcorr) + { + point = 0; + } + + /* Update */ + for (i = 0; i < n; i++) + { + p[i] = ff[i]; + } + + cp = point; + + /* Recursive update. 
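+ * This is the standard L-BFGS two-loop recursion over the stored dx/dg
+ * correction pairs; it produces the new search direction in p, scaled in
+ * between the two loops by the diagonal estimate diag = dgdx/dgdg.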
First go back over the memory points */ + for (k = 0; k < ncorr; k++) + { + cp--; + if (cp < 0) + { + cp = ncorr-1; + } + + sq = 0; + for (i = 0; i < n; i++) + { + sq += dx[cp][i]*p[i]; + } + + alpha[cp] = rho[cp]*sq; + + for (i = 0; i < n; i++) + { + p[i] -= alpha[cp]*dg[cp][i]; + } + } + + for (i = 0; i < n; i++) + { + p[i] *= diag; + } + + /* And then go forward again */ + for (k = 0; k < ncorr; k++) + { + yr = 0; + for (i = 0; i < n; i++) + { + yr += p[i]*dg[cp][i]; + } + + beta = rho[cp]*yr; + beta = alpha[cp]-beta; + + for (i = 0; i < n; i++) + { + p[i] += beta*dx[cp][i]; + } + + cp++; + if (cp >= ncorr) + { + cp = 0; + } + } + + for (i = 0; i < n; i++) + { + if (!frozen[i]) + { + dx[point][i] = p[i]; + } + else + { + dx[point][i] = 0; + } + } + + stepsize = 1.0; + + /* Test whether the convergence criterion is met */ + get_f_norm_max(cr, &(inputrec->opts), mdatoms, f, &fnorm, &fmax, &nfmax); + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (bVerbose) + { + fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", + step, Epot, fnorm/sqrt(state->natoms), fmax, nfmax+1); + } + /* Store the new (lower) energies */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box, + NULL, NULL, vir, pres, NULL, mu_tot, constr); + do_log = do_per_step(step, inputrec->nstlog); + do_ene = do_per_step(step, inputrec->nstenergy); + if (do_log) + { + print_ebin_header(fplog, step, step, state->lambda[efptFEP]); + } + print_ebin(outf->fp_ene, do_ene, FALSE, FALSE, + do_log ? fplog : NULL, step, step, eprNORMAL, + TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); + } + + /* Stop when the maximum force lies below tolerance. + * If we have reached machine precision, converged is already set to true. + */ + + converged = converged || (fmax < inputrec->em_tol); + + } /* End of the loop */ + + if (converged) + { + step--; /* we never took that last step in this case */ + + } + if (fmax > inputrec->em_tol) + { + if (MASTER(cr)) + { + warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE); + warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE); + } + converged = FALSE; + } + + /* If we printed energy and/or logfile last step (which was the last step) + * we don't have to do it again, but otherwise print the final values. + */ + if (!do_log) /* Write final value to log since we didn't do anythin last step */ + { + print_ebin_header(fplog, step, step, state->lambda[efptFEP]); + } + if (!do_ene || !do_log) /* Write final energy file entries */ + { + print_ebin(outf->fp_ene, !do_ene, FALSE, FALSE, + !do_log ? fplog : NULL, step, step, eprNORMAL, + TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts)); + } + + /* Print some stuff... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + + /* IMPORTANT! + * For accurate normal mode calculation it is imperative that we + * store the last conformation into the full precision binary trajectory. + * + * However, we should only do it if we did NOT already write this step + * above (which we did if do_x or do_f was true). 
+ */ + do_x = !do_per_step(step, inputrec->nstxout); + do_f = !do_per_step(step, inputrec->nstfout); + write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, step, + &ems, state, f); + + if (MASTER(cr)) + { + print_converged(stderr, LBFGS, inputrec->em_tol, step, converged, + number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms)); + print_converged(fplog, LBFGS, inputrec->em_tol, step, converged, + number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms)); + + fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); + } + + finish_em(cr, outf, runtime, wcycle); + + /* To print the actual number of steps we needed somewhere */ + runtime->nsteps_done = step; + + return 0; +} /* That's all folks */ + + +double do_steep(FILE *fplog, t_commrec *cr, + int nfile, const t_filenm fnm[], + const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, + int gmx_unused nstglobalcomm, + gmx_vsite_t *vsite, gmx_constr_t constr, + int gmx_unused stepout, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + t_mdatoms *mdatoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_edsam_t gmx_unused ed, + t_forcerec *fr, + int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, + gmx_membed_t gmx_unused membed, + real gmx_unused cpt_period, real gmx_unused max_hours, + const char gmx_unused *deviceOptions, + unsigned long gmx_unused Flags, + gmx_runtime_t *runtime) +{ + const char *SD = "Steepest Descents"; + em_state_t *s_min, *s_try; + rvec *f_global; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + rvec *f; + gmx_global_stat_t gstat; + t_graph *graph; + real stepsize, constepsize; + real ustep, fnormn; + gmx_mdoutf_t *outf; + t_mdebin *mdebin; + gmx_bool bDone, bAbort, do_x, do_f; + tensor vir, pres; + rvec mu_tot; + int nsteps; + int count = 0; + int steps_accepted = 0; + /* not used */ + real terminate = 0; + + s_min = init_em_state(); + s_try = init_em_state(); + + /* Init em and store the local state in s_try */ + init_em(fplog, SD, cr, inputrec, + state_global, top_global, s_try, &top, &f, &f_global, + nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, + nfile, fnm, &outf, &mdebin); + + /* Print to log file */ + print_em_start(fplog, cr, runtime, wcycle, SD); + + /* Set variables for stepsize (in nm). This is the largest + * step that we are going to make in any direction. 
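+ * (below, stepsize is set to ustep/fmax, so the atom experiencing the
+ * maximum force moves by roughly ustep in an accepted step.)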
+ */ + ustep = inputrec->em_stepsize; + stepsize = 0; + + /* Max number of steps */ + nsteps = inputrec->nsteps; + + if (MASTER(cr)) + { + /* Print to the screen */ + sp_header(stderr, SD, inputrec->em_tol, nsteps); + } + if (fplog) + { + sp_header(fplog, SD, inputrec->em_tol, nsteps); + } + + /**** HERE STARTS THE LOOP **** + * count is the counter for the number of steps + * bDone will be TRUE when the minimization has converged + * bAbort will be TRUE when nsteps steps have been performed or when + * the stepsize becomes smaller than is reasonable for machine precision + */ + count = 0; + bDone = FALSE; + bAbort = FALSE; + while (!bDone && !bAbort) + { + bAbort = (nsteps >= 0) && (count == nsteps); + + /* set new coordinates, except for first step */ + if (count > 0) + { + do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, + s_min, stepsize, s_min->f, s_try, + constr, top, nrnb, wcycle, count); + } + + evaluate_energy(fplog, cr, + top_global, s_try, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdatoms, fr, + mu_tot, enerd, vir, pres, count, count == 0); + + if (MASTER(cr)) + { + print_ebin_header(fplog, count, count, s_try->s.lambda[efptFEP]); + } + + if (count == 0) + { + s_min->epot = s_try->epot + 1; + } + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (bVerbose) + { + fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c", + count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1, + (s_try->epot < s_min->epot) ? '\n' : '\r'); + } + + if (s_try->epot < s_min->epot) + { + /* Store the new (lower) energies */ + upd_mdebin(mdebin, FALSE, FALSE, (double)count, + mdatoms->tmass, enerd, &s_try->s, inputrec->fepvals, inputrec->expandedvals, + s_try->s.box, NULL, NULL, vir, pres, NULL, mu_tot, constr); + print_ebin(outf->fp_ene, TRUE, + do_per_step(steps_accepted, inputrec->nstdisreout), + do_per_step(steps_accepted, inputrec->nstorireout), + fplog, count, count, eprNORMAL, TRUE, + mdebin, fcd, &(top_global->groups), &(inputrec->opts)); + fflush(fplog); + } + } + + /* Now if the new energy is smaller than the previous... + * or if this is the first step! + * or if we did random steps! + */ + + if ( (count == 0) || (s_try->epot < s_min->epot) ) + { + steps_accepted++; + + /* Test whether the convergence criterion is met... */ + bDone = (s_try->fmax < inputrec->em_tol); + + /* Copy the arrays for force, positions and energy */ + /* The 'Min' array always holds the coords and forces of the minimal + sampled energy */ + swap_em_state(s_min, s_try); + if (count > 0) + { + ustep *= 1.2; + } + + /* Write to trn, if necessary */ + do_x = do_per_step(steps_accepted, inputrec->nstxout); + do_f = do_per_step(steps_accepted, inputrec->nstfout); + write_em_traj(fplog, cr, outf, do_x, do_f, NULL, + top_global, inputrec, count, + s_min, state_global, f_global); + } + else + { + /* If energy is not smaller make the step smaller... 
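+ * (a rejected step halves ustep here, whereas accepted steps increase it
+ * by a factor of 1.2 above.)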
*/ + ustep *= 0.5; + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) + { + /* Reload the old state */ + em_dd_partition_system(fplog, count, cr, top_global, inputrec, + s_min, top, mdatoms, fr, vsite, constr, + nrnb, wcycle); + } + } + + /* Determine new step */ + stepsize = ustep/s_min->fmax; + + /* Check if stepsize is too small, with 1 nm as a characteristic length */ +#ifdef GMX_DOUBLE + if (count == nsteps || ustep < 1e-12) +#else + if (count == nsteps || ustep < 1e-6) +#endif + { + if (MASTER(cr)) + { + warn_step(stderr, inputrec->em_tol, count == nsteps, constr != NULL); + warn_step(fplog, inputrec->em_tol, count == nsteps, constr != NULL); + } + bAbort = TRUE; + } + + count++; + } /* End of the loop */ + + /* Print some shit... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, count, + s_min, state_global, f_global); + + fnormn = s_min->fnorm/sqrt(state_global->natoms); + + if (MASTER(cr)) + { + print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps, + s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); + print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps, + s_min->epot, s_min->fmax, s_min->a_fmax, fnormn); + } + + finish_em(cr, outf, runtime, wcycle); + + /* To print the actual number of steps we needed somewhere */ + inputrec->nsteps = count; + + runtime->nsteps_done = count; + + return 0; +} /* That's all folks */ + + +double do_nm(FILE *fplog, t_commrec *cr, + int nfile, const t_filenm fnm[], + const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact, + int gmx_unused nstglobalcomm, + gmx_vsite_t *vsite, gmx_constr_t constr, + int gmx_unused stepout, + t_inputrec *inputrec, + gmx_mtop_t *top_global, t_fcdata *fcd, + t_state *state_global, + t_mdatoms *mdatoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_edsam_t gmx_unused ed, + t_forcerec *fr, + int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed, + gmx_membed_t gmx_unused membed, + real gmx_unused cpt_period, real gmx_unused max_hours, + const char gmx_unused *deviceOptions, + unsigned long gmx_unused Flags, + gmx_runtime_t *runtime) +{ + const char *NM = "Normal Mode Analysis"; + gmx_mdoutf_t *outf; + int natoms, atom, d; + int nnodes, node; + rvec *f_global; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + rvec *f; + gmx_global_stat_t gstat; + t_graph *graph; + real t, t0, lambda, lam0; + gmx_bool bNS; + tensor vir, pres; + rvec mu_tot; + rvec *fneg, *dfdx; + gmx_bool bSparse; /* use sparse matrix storage format */ + size_t sz=0; + gmx_sparsematrix_t * sparse_matrix = NULL; + real * full_matrix = NULL; + em_state_t * state_work; + + /* added with respect to mdrun */ + int i, j, k, row, col; + real der_range = 10.0*sqrt(GMX_REAL_EPS); + real x_min; + real fnorm, fmax; + + if (constr != NULL) + { + gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported"); + } + + state_work = init_em_state(); + + /* Init em and store the local state in state_minimum */ + init_em(fplog, NM, cr, inputrec, + state_global, top_global, state_work, &top, + &f, &f_global, + nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr, + nfile, fnm, &outf, NULL); + + natoms = top_global->natoms; + snew(fneg, natoms); + snew(dfdx, natoms); + +#ifndef GMX_DOUBLE + if (MASTER(cr)) + { + fprintf(stderr, + "NOTE: This version of Gromacs has been compiled in 
single precision,\n" + " which MIGHT not be accurate enough for normal mode analysis.\n" + " Gromacs now uses sparse matrix storage, so the memory requirements\n" + " are fairly modest even if you recompile in double precision.\n\n"); + } +#endif + + /* Check if we can/should use sparse storage format. + * + * Sparse format is only useful when the Hessian itself is sparse, which it + * will be when we use a cutoff. + * For small systems (n<1000) it is easier to always use full matrix format, though. + */ + if (EEL_FULL(fr->eeltype) || fr->rlist == 0.0) + { + md_print_info(cr, fplog, "Non-cutoff electrostatics used, forcing full Hessian format.\n"); + bSparse = FALSE; + } + else if (top_global->natoms < 1000) + { + md_print_info(cr, fplog, "Small system size (N=%d), using full Hessian format.\n", top_global->natoms); + bSparse = FALSE; + } + else + { + md_print_info(cr, fplog, "Using compressed symmetric sparse Hessian format.\n"); + bSparse = TRUE; + } + + if (MASTER(cr)) + { + sz = DIM*top_global->natoms; + + fprintf(stderr, "Allocating Hessian memory...\n\n"); + + if (bSparse) + { + sparse_matrix = gmx_sparsematrix_init(sz); + sparse_matrix->compressed_symmetric = TRUE; + } + else + { + snew(full_matrix, sz*sz); + } + } + + /* Initial values */ + t0 = inputrec->init_t; + lam0 = inputrec->fepvals->init_lambda; + t = t0; + lambda = lam0; + + init_nrnb(nrnb); + + where(); + + /* Write start time and temperature */ + print_em_start(fplog, cr, runtime, wcycle, NM); + + /* fudge nr of steps to nr of atoms */ + inputrec->nsteps = natoms*2; + + if (MASTER(cr)) + { + fprintf(stderr, "starting normal mode calculation '%s'\n%d steps.\n\n", + *(top_global->name), (int)inputrec->nsteps); + } + + nnodes = cr->nnodes; + + /* Make evaluate_energy do a single node force calculation */ + cr->nnodes = 1; + evaluate_energy(fplog, cr, + top_global, state_work, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdatoms, fr, + mu_tot, enerd, vir, pres, -1, TRUE); + cr->nnodes = nnodes; + + /* if forces are not small, warn user */ + get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, state_work); + + md_print_info(cr, fplog, "Maximum force:%12.5e\n", state_work->fmax); + if (state_work->fmax > 1.0e-3) + { + md_print_info(cr, fplog, + "The force is probably not small enough to " + "ensure that you are at a minimum.\n" + "Be aware that negative eigenvalues may occur\n" + "when the resulting matrix is diagonalized.\n\n"); + } + + /*********************************************************** + * + * Loop over all pairs in matrix + * + * do_force called twice. 
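+ * Each Hessian row for coordinate (atom,d) is obtained from the central
+ * difference -(f(x+h) - f(x-h))/(2*h) with h = der_range.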
Once with positive and + * once with negative displacement + * + ************************************************************/ + + /* Steps are divided one by one over the nodes */ + for (atom = cr->nodeid; atom < natoms; atom += nnodes) + { + + for (d = 0; d < DIM; d++) + { + x_min = state_work->s.x[atom][d]; + + state_work->s.x[atom][d] = x_min - der_range; + + /* Make evaluate_energy do a single node force calculation */ + cr->nnodes = 1; + evaluate_energy(fplog, cr, + top_global, state_work, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdatoms, fr, + mu_tot, enerd, vir, pres, atom*2, FALSE); + + for (i = 0; i < natoms; i++) + { + copy_rvec(state_work->f[i], fneg[i]); + } + + state_work->s.x[atom][d] = x_min + der_range; + + evaluate_energy(fplog, cr, + top_global, state_work, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, mdatoms, fr, + mu_tot, enerd, vir, pres, atom*2+1, FALSE); + cr->nnodes = nnodes; + + /* x is restored to original */ + state_work->s.x[atom][d] = x_min; + + for (j = 0; j < natoms; j++) + { + for (k = 0; (k < DIM); k++) + { + dfdx[j][k] = + -(state_work->f[j][k] - fneg[j][k])/(2*der_range); + } + } + + if (!MASTER(cr)) + { +#ifdef GMX_MPI +#ifdef GMX_DOUBLE +#define mpi_type MPI_DOUBLE +#else +#define mpi_type MPI_FLOAT +#endif + MPI_Send(dfdx[0], natoms*DIM, mpi_type, MASTERNODE(cr), cr->nodeid, + cr->mpi_comm_mygroup); +#endif + } + else + { + for (node = 0; (node < nnodes && atom+node < natoms); node++) + { + if (node > 0) + { +#ifdef GMX_MPI + MPI_Status stat; + MPI_Recv(dfdx[0], natoms*DIM, mpi_type, node, node, + cr->mpi_comm_mygroup, &stat); +#undef mpi_type +#endif + } + + row = (atom + node)*DIM + d; + + for (j = 0; j < natoms; j++) + { + for (k = 0; k < DIM; k++) + { + col = j*DIM + k; + + if (bSparse) + { + if (col >= row && dfdx[j][k] != 0.0) + { + gmx_sparsematrix_increment_value(sparse_matrix, + row, col, dfdx[j][k]); + } + } + else + { + full_matrix[row*sz+col] = dfdx[j][k]; + } + } + } + } + } + + if (bVerbose && fplog) + { + fflush(fplog); + } + } + /* write progress */ + if (MASTER(cr) && bVerbose) + { + fprintf(stderr, "\rFinished step %d out of %d", + min(atom+nnodes, natoms), natoms); + fflush(stderr); + } + } + + if (MASTER(cr)) + { + fprintf(stderr, "\n\nWriting Hessian...\n"); + gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix); + } + + finish_em(cr, outf, runtime, wcycle); + + runtime->nsteps_done = natoms*2; + + return 0; +} diff --cc src/gromacs/mdlib/pme.c index 0f804b37db,0000000000..503e1b449e mode 100644,000000..100644 --- a/src/gromacs/mdlib/pme.c +++ b/src/gromacs/mdlib/pme.c @@@ -1,4626 -1,0 +1,4632 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. 
Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * GROwing Monsters And Cloning Shrimps + */ +/* IMPORTANT FOR DEVELOPERS: + * + * Triclinic pme stuff isn't entirely trivial, and we've experienced + * some bugs during development (many of them due to me). To avoid + * this in the future, please check the following things if you make + * changes in this file: + * + * 1. You should obtain identical (at least to the PME precision) + * energies, forces, and virial for + * a rectangular box and a triclinic one where the z (or y) axis is + * tilted a whole box side. For instance you could use these boxes: + * + * rectangular triclinic + * 2 0 0 2 0 0 + * 0 2 0 0 2 0 + * 0 0 6 2 2 6 + * + * 2. You should check the energy conservation in a triclinic box. + * + * It might seem an overkill, but better safe than sorry. + * /Erik 001109 + */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "gromacs/fft/parallel_3dfft.h" +#include "gromacs/utility/gmxmpi.h" + +#include +#include +#include +#include +#include "typedefs.h" +#include "txtdump.h" +#include "vec.h" +#include "gmxcomplex.h" +#include "smalloc.h" +#include "futil.h" +#include "coulomb.h" +#include "gmx_fatal.h" +#include "pme.h" +#include "network.h" +#include "physics.h" +#include "nrnb.h" +#include "gmx_wallcycle.h" +#include "pdbio.h" +#include "gmx_cyclecounter.h" +#include "gmx_omp.h" +#include "macros.h" + + +/* Include the SIMD macro file and then check for support */ +#include "gmx_simd_macros.h" +#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP +/* Turn on SIMD intrinsics for PME solve */ +#define PME_SIMD +#endif + +/* SIMD spread+gather only in single precision with SSE2 or higher available. + * We might want to switch to use gmx_simd_macros.h, but this is somewhat + * complicated, as we use unaligned and/or 4-wide only loads. + */ +#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE) +#define PME_SSE_SPREAD_GATHER +#include +/* Some old AMD processors could have problems with unaligned loads+stores */ +#ifndef GMX_FAHCORE +#define PME_SSE_UNALIGNED +#endif +#endif + +#define DFT_TOL 1e-7 +/* #define PRT_FORCE */ +/* conditions for on the fly time-measurement */ +/* #define TAKETIME (step > 1 && timesteps < 10) */ +#define TAKETIME FALSE + +/* #define PME_TIME_THREADS */ + +#ifdef GMX_DOUBLE +#define mpi_type MPI_DOUBLE +#else +#define mpi_type MPI_FLOAT +#endif + +/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */ +#define GMX_CACHE_SEP 64 + +/* We only define a maximum to be able to use local arrays without allocation. + * An order larger than 12 should never be needed, even for test cases. + * If needed it can be changed here. 
+ */ +#define PME_ORDER_MAX 12 + +/* Internal datastructures */ +typedef struct { + int send_index0; + int send_nindex; + int recv_index0; + int recv_nindex; + int recv_size; /* Receive buffer width, used with OpenMP */ +} pme_grid_comm_t; + +typedef struct { +#ifdef GMX_MPI + MPI_Comm mpi_comm; +#endif + int nnodes, nodeid; + int *s2g0; + int *s2g1; + int noverlap_nodes; + int *send_id, *recv_id; + int send_size; /* Send buffer width, used with OpenMP */ + pme_grid_comm_t *comm_data; + real *sendbuf; + real *recvbuf; +} pme_overlap_t; + +typedef struct { + int *n; /* Cumulative counts of the number of particles per thread */ + int nalloc; /* Allocation size of i */ + int *i; /* Particle indices ordered on thread index (n) */ +} thread_plist_t; + +typedef struct { + int *thread_one; + int n; + int *ind; + splinevec theta; + real *ptr_theta_z; + splinevec dtheta; + real *ptr_dtheta_z; +} splinedata_t; + +typedef struct { + int dimind; /* The index of the dimension, 0=x, 1=y */ + int nslab; + int nodeid; +#ifdef GMX_MPI + MPI_Comm mpi_comm; +#endif + + int *node_dest; /* The nodes to send x and q to with DD */ + int *node_src; /* The nodes to receive x and q from with DD */ + int *buf_index; /* Index for commnode into the buffers */ + + int maxshift; + + int npd; + int pd_nalloc; + int *pd; + int *count; /* The number of atoms to send to each node */ + int **count_thread; + int *rcount; /* The number of atoms to receive */ + + int n; + int nalloc; + rvec *x; + real *q; + rvec *f; + gmx_bool bSpread; /* These coordinates are used for spreading */ + int pme_order; + ivec *idx; + rvec *fractx; /* Fractional coordinate relative to the + * lower cell boundary + */ + int nthread; + int *thread_idx; /* Which thread should spread which charge */ + thread_plist_t *thread_plist; + splinedata_t *spline; +} pme_atomcomm_t; + +#define FLBS 3 +#define FLBSZ 4 + +typedef struct { + ivec ci; /* The spatial location of this grid */ + ivec n; /* The used size of *grid, including order-1 */ + ivec offset; /* The grid offset from the full node grid */ + int order; /* PME spreading order */ + ivec s; /* The allocated size of *grid, s >= n */ + real *grid; /* The grid local thread, size n */ +} pmegrid_t; + +typedef struct { + pmegrid_t grid; /* The full node grid (non thread-local) */ + int nthread; /* The number of threads operating on this grid */ + ivec nc; /* The local spatial decomposition over the threads */ + pmegrid_t *grid_th; /* Array of grids for each thread */ + real *grid_all; /* Allocated array for the grids in *grid_th */ + int **g2t; /* The grid to thread index */ + ivec nthread_comm; /* The number of threads to communicate with */ +} pmegrids_t; + + +typedef struct { +#ifdef PME_SSE_SPREAD_GATHER + /* Masks for SSE aligned spreading and gathering */ + __m128 mask_SSE0[6], mask_SSE1[6]; +#else + int dummy; /* C89 requires that struct has at least one member */ +#endif +} pme_spline_work_t; + +typedef struct { + /* work data for solve_pme */ + int nalloc; + real * mhx; + real * mhy; + real * mhz; + real * m2; + real * denom; + real * tmp1_alloc; + real * tmp1; + real * eterm; + real * m2inv; + + real energy; + matrix vir; +} pme_work_t; + +typedef struct gmx_pme { + int ndecompdim; /* The number of decomposition dimensions */ + int nodeid; /* Our nodeid in mpi->mpi_comm */ + int nodeid_major; + int nodeid_minor; + int nnodes; /* The number of nodes doing PME */ + int nnodes_major; + int nnodes_minor; + + MPI_Comm mpi_comm; + MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */ +#ifdef 
GMX_MPI + MPI_Datatype rvec_mpi; /* the pme vector's MPI type */ +#endif + + gmx_bool bUseThreads; /* Does any of the PME ranks have nthread>1 ? */ + int nthread; /* The number of threads doing PME on our rank */ + + gmx_bool bPPnode; /* Node also does particle-particle forces */ + gmx_bool bFEP; /* Compute Free energy contribution */ + int nkx, nky, nkz; /* Grid dimensions */ + gmx_bool bP3M; /* Do P3M: optimize the influence function */ + int pme_order; + real epsilon_r; + + pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */ + pmegrids_t pmegridB; + /* The PME charge spreading grid sizes/strides, includes pme_order-1 */ + int pmegrid_nx, pmegrid_ny, pmegrid_nz; + /* pmegrid_nz might be larger than strictly necessary to ensure + * memory alignment, pmegrid_nz_base gives the real base size. + */ + int pmegrid_nz_base; + /* The local PME grid starting indices */ + int pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz; + + /* Work data for spreading and gathering */ + pme_spline_work_t *spline_work; + + real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */ + real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */ + int fftgrid_nx, fftgrid_ny, fftgrid_nz; + + t_complex *cfftgridA; /* Grids for complex FFT data */ + t_complex *cfftgridB; + int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz; + + gmx_parallel_3dfft_t pfft_setupA; + gmx_parallel_3dfft_t pfft_setupB; + + int *nnx, *nny, *nnz; + real *fshx, *fshy, *fshz; + + pme_atomcomm_t atc[2]; /* Indexed on decomposition index */ + matrix recipbox; + splinevec bsp_mod; + + pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */ + + pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */ + + rvec *bufv; /* Communication buffer */ + real *bufr; /* Communication buffer */ + int buf_nalloc; /* The communication buffer size */ + + /* thread local work data for solve_pme */ + pme_work_t *work; + + /* Work data for PME_redist */ + gmx_bool redist_init; + int * scounts; + int * rcounts; + int * sdispls; + int * rdispls; + int * sidx; + int * idxa; + real * redist_buf; + int redist_buf_nalloc; + + /* Work data for sum_qgrid */ + real * sum_qgrid_tmp; + real * sum_qgrid_dd_tmp; +} t_gmx_pme; + + +static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc, + int start, int end, int thread) +{ + int i; + int *idxptr, tix, tiy, tiz; + real *xptr, *fptr, tx, ty, tz; + real rxx, ryx, ryy, rzx, rzy, rzz; + int nx, ny, nz; + int start_ix, start_iy, start_iz; + int *g2tx, *g2ty, *g2tz; + gmx_bool bThreads; + int *thread_idx = NULL; + thread_plist_t *tpl = NULL; + int *tpl_n = NULL; + int thread_i; + + nx = pme->nkx; + ny = pme->nky; + nz = pme->nkz; + + start_ix = pme->pmegrid_start_ix; + start_iy = pme->pmegrid_start_iy; + start_iz = pme->pmegrid_start_iz; + + rxx = pme->recipbox[XX][XX]; + ryx = pme->recipbox[YY][XX]; + ryy = pme->recipbox[YY][YY]; + rzx = pme->recipbox[ZZ][XX]; + rzy = pme->recipbox[ZZ][YY]; + rzz = pme->recipbox[ZZ][ZZ]; + + g2tx = pme->pmegridA.g2t[XX]; + g2ty = pme->pmegridA.g2t[YY]; + g2tz = pme->pmegridA.g2t[ZZ]; + + bThreads = (atc->nthread > 1); + if (bThreads) + { + thread_idx = atc->thread_idx; + + tpl = &atc->thread_plist[thread]; + tpl_n = tpl->n; + for (i = 0; i < atc->nthread; i++) + { + tpl_n[i] = 0; + } + } + + for (i = start; i < end; i++) + { + xptr = atc->x[i]; + idxptr = atc->idx[i]; + fptr = atc->fractx[i]; + + /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for 
triclinic boxes */ + tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 ); + ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 ); + tz = nz * ( xptr[ZZ] * rzz + 2.0 ); + + tix = (int)(tx); + tiy = (int)(ty); + tiz = (int)(tz); + + /* Because decomposition only occurs in x and y, + * we never have a fraction correction in z. + */ + fptr[XX] = tx - tix + pme->fshx[tix]; + fptr[YY] = ty - tiy + pme->fshy[tiy]; + fptr[ZZ] = tz - tiz; + + idxptr[XX] = pme->nnx[tix]; + idxptr[YY] = pme->nny[tiy]; + idxptr[ZZ] = pme->nnz[tiz]; + +#ifdef DEBUG + range_check(idxptr[XX], 0, pme->pmegrid_nx); + range_check(idxptr[YY], 0, pme->pmegrid_ny); + range_check(idxptr[ZZ], 0, pme->pmegrid_nz); +#endif + + if (bThreads) + { + thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]]; + thread_idx[i] = thread_i; + tpl_n[thread_i]++; + } + } + + if (bThreads) + { + /* Make a list of particle indices sorted on thread */ + + /* Get the cumulative count */ + for (i = 1; i < atc->nthread; i++) + { + tpl_n[i] += tpl_n[i-1]; + } + /* The current implementation distributes particles equally + * over the threads, so we could actually allocate for that + * in pme_realloc_atomcomm_things. + */ + if (tpl_n[atc->nthread-1] > tpl->nalloc) + { + tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]); + srenew(tpl->i, tpl->nalloc); + } + /* Set tpl_n to the cumulative start */ + for (i = atc->nthread-1; i >= 1; i--) + { + tpl_n[i] = tpl_n[i-1]; + } + tpl_n[0] = 0; + + /* Fill our thread local array with indices sorted on thread */ + for (i = start; i < end; i++) + { + tpl->i[tpl_n[atc->thread_idx[i]]++] = i; + } + /* Now tpl_n contains the cummulative count again */ + } +} + +static void make_thread_local_ind(pme_atomcomm_t *atc, + int thread, splinedata_t *spline) +{ + int n, t, i, start, end; + thread_plist_t *tpl; + + /* Combine the indices made by each thread into one index */ + + n = 0; + start = 0; + for (t = 0; t < atc->nthread; t++) + { + tpl = &atc->thread_plist[t]; + /* Copy our part (start - end) from the list of thread t */ + if (thread > 0) + { + start = tpl->n[thread-1]; + } + end = tpl->n[thread]; + for (i = start; i < end; i++) + { + spline->ind[n++] = tpl->i[i]; + } + } + + spline->n = n; +} + + +static void pme_calc_pidx(int start, int end, + matrix recipbox, rvec x[], + pme_atomcomm_t *atc, int *count) +{ + int nslab, i; + int si; + real *xptr, s; + real rxx, ryx, rzx, ryy, rzy; + int *pd; + + /* Calculate PME task index (pidx) for each grid index. + * Here we always assign equally sized slabs to each node + * for load balancing reasons (the PME grid spacing is not used). 
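+ *
+ * The mapping only uses the fractional coordinate along the decomposed
+ * dimension: scale it by the number of slabs and wrap it back into
+ * [0, nslab).  A minimal stand-alone sketch of the same idea (the
+ * function name is illustrative only):
+ *
+ *   static int slab_of_fraction(real frac, int nslab)
+ *   {
+ *       real s = nslab*frac;
+ *
+ *       return ((int)(s + 2*nslab)) % nslab;
+ *   }
+ *
+ * Adding 2*nslab before the int conversion keeps the argument positive
+ * for coordinates slightly outside the unit cell, so truncation and the
+ * modulo give the intended slab.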
+ */ + + nslab = atc->nslab; + pd = atc->pd; + + /* Reset the count */ + for (i = 0; i < nslab; i++) + { + count[i] = 0; + } + + if (atc->dimind == 0) + { + rxx = recipbox[XX][XX]; + ryx = recipbox[YY][XX]; + rzx = recipbox[ZZ][XX]; + /* Calculate the node index in x-dimension */ + for (i = start; i < end; i++) + { + xptr = x[i]; + /* Fractional coordinates along box vectors */ + s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx); + si = (int)(s + 2*nslab) % nslab; + pd[i] = si; + count[si]++; + } + } + else + { + ryy = recipbox[YY][YY]; + rzy = recipbox[ZZ][YY]; + /* Calculate the node index in y-dimension */ + for (i = start; i < end; i++) + { + xptr = x[i]; + /* Fractional coordinates along box vectors */ + s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy); + si = (int)(s + 2*nslab) % nslab; + pd[i] = si; + count[si]++; + } + } +} + +static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[], + pme_atomcomm_t *atc) +{ + int nthread, thread, slab; + + nthread = atc->nthread; + +#pragma omp parallel for num_threads(nthread) schedule(static) + for (thread = 0; thread < nthread; thread++) + { + pme_calc_pidx(natoms* thread /nthread, + natoms*(thread+1)/nthread, + recipbox, x, atc, atc->count_thread[thread]); + } + /* Non-parallel reduction, since nslab is small */ + + for (thread = 1; thread < nthread; thread++) + { + for (slab = 0; slab < atc->nslab; slab++) + { + atc->count_thread[0][slab] += atc->count_thread[thread][slab]; + } + } +} + +static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc) +{ + const int padding = 4; + int i; + + srenew(th[XX], nalloc); + srenew(th[YY], nalloc); + /* In z we add padding, this is only required for the aligned SSE code */ + srenew(*ptr_z, nalloc+2*padding); + th[ZZ] = *ptr_z + padding; + + for (i = 0; i < padding; i++) + { + (*ptr_z)[ i] = 0; + (*ptr_z)[padding+nalloc+i] = 0; + } +} + +static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc) +{ + int i, d; + + srenew(spline->ind, atc->nalloc); + /* Initialize the index to identity so it works without threads */ + for (i = 0; i < atc->nalloc; i++) + { + spline->ind[i] = i; + } + + realloc_splinevec(spline->theta, &spline->ptr_theta_z, + atc->pme_order*atc->nalloc); + realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z, + atc->pme_order*atc->nalloc); +} + +static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc) +{ + int nalloc_old, i, j, nalloc_tpl; + + /* We have to avoid a NULL pointer for atc->x to avoid + * possible fatal errors in MPI routines. 
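+ *
+ * The reallocation below is the usual grow-only pattern: nothing ever
+ * shrinks, and even for atc->n == 0 at least one element is allocated,
+ * so the buffers handed to MPI are never NULL.  Roughly (a sketch;
+ * over_alloc_dd() provides the amortizing over-allocation):
+ *
+ *   if (n > nalloc || nalloc == 0)
+ *   {
+ *       nalloc = over_alloc_dd(max(n, 1));
+ *       srenew(x, nalloc);
+ *   }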
+ */ + if (atc->n > atc->nalloc || atc->nalloc == 0) + { + nalloc_old = atc->nalloc; + atc->nalloc = over_alloc_dd(max(atc->n, 1)); + + if (atc->nslab > 1) + { + srenew(atc->x, atc->nalloc); + srenew(atc->q, atc->nalloc); + srenew(atc->f, atc->nalloc); + for (i = nalloc_old; i < atc->nalloc; i++) + { + clear_rvec(atc->f[i]); + } + } + if (atc->bSpread) + { + srenew(atc->fractx, atc->nalloc); + srenew(atc->idx, atc->nalloc); + + if (atc->nthread > 1) + { + srenew(atc->thread_idx, atc->nalloc); + } + + for (i = 0; i < atc->nthread; i++) + { + pme_realloc_splinedata(&atc->spline[i], atc); + } + } + } +} + +static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw, + int n, gmx_bool bXF, rvec *x_f, real *charge, + pme_atomcomm_t *atc) +/* Redistribute particle data for PME calculation */ +/* domain decomposition by x coordinate */ +{ + int *idxa; + int i, ii; + + if (FALSE == pme->redist_init) + { + snew(pme->scounts, atc->nslab); + snew(pme->rcounts, atc->nslab); + snew(pme->sdispls, atc->nslab); + snew(pme->rdispls, atc->nslab); + snew(pme->sidx, atc->nslab); + pme->redist_init = TRUE; + } + if (n > pme->redist_buf_nalloc) + { + pme->redist_buf_nalloc = over_alloc_dd(n); + srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM); + } + + pme->idxa = atc->pd; + +#ifdef GMX_MPI + if (forw && bXF) + { + /* forward, redistribution from pp to pme */ + + /* Calculate send counts and exchange them with other nodes */ + for (i = 0; (i < atc->nslab); i++) + { + pme->scounts[i] = 0; + } + for (i = 0; (i < n); i++) + { + pme->scounts[pme->idxa[i]]++; + } + MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm); + + /* Calculate send and receive displacements and index into send + buffer */ + pme->sdispls[0] = 0; + pme->rdispls[0] = 0; + pme->sidx[0] = 0; + for (i = 1; i < atc->nslab; i++) + { + pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1]; + pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1]; + pme->sidx[i] = pme->sdispls[i]; + } + /* Total # of particles to be received */ + atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1]; + + pme_realloc_atomcomm_things(atc); + + /* Copy particle coordinates into send buffer and exchange*/ + for (i = 0; (i < n); i++) + { + ii = DIM*pme->sidx[pme->idxa[i]]; + pme->sidx[pme->idxa[i]]++; + pme->redist_buf[ii+XX] = x_f[i][XX]; + pme->redist_buf[ii+YY] = x_f[i][YY]; + pme->redist_buf[ii+ZZ] = x_f[i][ZZ]; + } + MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, + pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls, + pme->rvec_mpi, atc->mpi_comm); + } + if (forw) + { + /* Copy charge into send buffer and exchange*/ + for (i = 0; i < atc->nslab; i++) + { + pme->sidx[i] = pme->sdispls[i]; + } + for (i = 0; (i < n); i++) + { + ii = pme->sidx[pme->idxa[i]]; + pme->sidx[pme->idxa[i]]++; + pme->redist_buf[ii] = charge[i]; + } + MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type, + atc->q, pme->rcounts, pme->rdispls, mpi_type, + atc->mpi_comm); + } + else /* backward, redistribution from pme to pp */ + { + MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi, + pme->redist_buf, pme->scounts, pme->sdispls, + pme->rvec_mpi, atc->mpi_comm); + + /* Copy data from receive buffer */ + for (i = 0; i < atc->nslab; i++) + { + pme->sidx[i] = pme->sdispls[i]; + } + for (i = 0; (i < n); i++) + { + ii = DIM*pme->sidx[pme->idxa[i]]; + x_f[i][XX] += pme->redist_buf[ii+XX]; + x_f[i][YY] += pme->redist_buf[ii+YY]; + x_f[i][ZZ] += pme->redist_buf[ii+ZZ]; + pme->sidx[pme->idxa[i]]++; + } + } +#endif +} + +static void 
pme_dd_sendrecv(pme_atomcomm_t *atc, + gmx_bool bBackward, int shift, + void *buf_s, int nbyte_s, + void *buf_r, int nbyte_r) +{ +#ifdef GMX_MPI + int dest, src; + MPI_Status stat; + + if (bBackward == FALSE) + { + dest = atc->node_dest[shift]; + src = atc->node_src[shift]; + } + else + { + dest = atc->node_src[shift]; + src = atc->node_dest[shift]; + } + + if (nbyte_s > 0 && nbyte_r > 0) + { + MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE, + dest, shift, + buf_r, nbyte_r, MPI_BYTE, + src, shift, + atc->mpi_comm, &stat); + } + else if (nbyte_s > 0) + { + MPI_Send(buf_s, nbyte_s, MPI_BYTE, + dest, shift, + atc->mpi_comm); + } + else if (nbyte_r > 0) + { + MPI_Recv(buf_r, nbyte_r, MPI_BYTE, + src, shift, + atc->mpi_comm, &stat); + } +#endif +} + +static void dd_pmeredist_x_q(gmx_pme_t pme, + int n, gmx_bool bX, rvec *x, real *charge, + pme_atomcomm_t *atc) +{ + int *commnode, *buf_index; + int nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount; + + commnode = atc->node_dest; + buf_index = atc->buf_index; + + nnodes_comm = min(2*atc->maxshift, atc->nslab-1); + + nsend = 0; + for (i = 0; i < nnodes_comm; i++) + { + buf_index[commnode[i]] = nsend; + nsend += atc->count[commnode[i]]; + } + if (bX) + { + if (atc->count[atc->nodeid] + nsend != n) + { + gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n" + "This usually means that your system is not well equilibrated.", + n - (atc->count[atc->nodeid] + nsend), + pme->nodeid, 'x'+atc->dimind); + } + + if (nsend > pme->buf_nalloc) + { + pme->buf_nalloc = over_alloc_dd(nsend); + srenew(pme->bufv, pme->buf_nalloc); + srenew(pme->bufr, pme->buf_nalloc); + } + + atc->n = atc->count[atc->nodeid]; + for (i = 0; i < nnodes_comm; i++) + { + scount = atc->count[commnode[i]]; + /* Communicate the count */ + if (debug) + { + fprintf(debug, "dimind %d PME node %d send to node %d: %d\n", + atc->dimind, atc->nodeid, commnode[i], scount); + } + pme_dd_sendrecv(atc, FALSE, i, + &scount, sizeof(int), + &atc->rcount[i], sizeof(int)); + atc->n += atc->rcount[i]; + } + + pme_realloc_atomcomm_things(atc); + } + + local_pos = 0; + for (i = 0; i < n; i++) + { + node = atc->pd[i]; + if (node == atc->nodeid) + { + /* Copy direct to the receive buffer */ + if (bX) + { + copy_rvec(x[i], atc->x[local_pos]); + } + atc->q[local_pos] = charge[i]; + local_pos++; + } + else + { + /* Copy to the send buffer */ + if (bX) + { + copy_rvec(x[i], pme->bufv[buf_index[node]]); + } + pme->bufr[buf_index[node]] = charge[i]; + buf_index[node]++; + } + } + + buf_pos = 0; + for (i = 0; i < nnodes_comm; i++) + { + scount = atc->count[commnode[i]]; + rcount = atc->rcount[i]; + if (scount > 0 || rcount > 0) + { + if (bX) + { + /* Communicate the coordinates */ + pme_dd_sendrecv(atc, FALSE, i, + pme->bufv[buf_pos], scount*sizeof(rvec), + atc->x[local_pos], rcount*sizeof(rvec)); + } + /* Communicate the charges */ + pme_dd_sendrecv(atc, FALSE, i, + pme->bufr+buf_pos, scount*sizeof(real), + atc->q+local_pos, rcount*sizeof(real)); + buf_pos += scount; + local_pos += atc->rcount[i]; + } + } +} + +static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc, + int n, rvec *f, + gmx_bool bAddF) +{ + int *commnode, *buf_index; + int nnodes_comm, local_pos, buf_pos, i, scount, rcount, node; + + commnode = atc->node_dest; + buf_index = atc->buf_index; + + nnodes_comm = min(2*atc->maxshift, atc->nslab-1); + + local_pos = atc->count[atc->nodeid]; + buf_pos = 0; + for (i = 0; i < 
nnodes_comm; i++) + { + scount = atc->rcount[i]; + rcount = atc->count[commnode[i]]; + if (scount > 0 || rcount > 0) + { + /* Communicate the forces */ + pme_dd_sendrecv(atc, TRUE, i, + atc->f[local_pos], scount*sizeof(rvec), + pme->bufv[buf_pos], rcount*sizeof(rvec)); + local_pos += scount; + } + buf_index[commnode[i]] = buf_pos; + buf_pos += rcount; + } + + local_pos = 0; + if (bAddF) + { + for (i = 0; i < n; i++) + { + node = atc->pd[i]; + if (node == atc->nodeid) + { + /* Add from the local force array */ + rvec_inc(f[i], atc->f[local_pos]); + local_pos++; + } + else + { + /* Add from the receive buffer */ + rvec_inc(f[i], pme->bufv[buf_index[node]]); + buf_index[node]++; + } + } + } + else + { + for (i = 0; i < n; i++) + { + node = atc->pd[i]; + if (node == atc->nodeid) + { + /* Copy from the local force array */ + copy_rvec(atc->f[local_pos], f[i]); + local_pos++; + } + else + { + /* Copy from the receive buffer */ + copy_rvec(pme->bufv[buf_index[node]], f[i]); + buf_index[node]++; + } + } + } +} + +#ifdef GMX_MPI +static void +gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction) +{ + pme_overlap_t *overlap; + int send_index0, send_nindex; + int recv_index0, recv_nindex; + MPI_Status stat; + int i, j, k, ix, iy, iz, icnt; + int ipulse, send_id, recv_id, datasize; + real *p; + real *sendptr, *recvptr; + + /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */ + overlap = &pme->overlap[1]; + + for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++) + { + /* Since we have already (un)wrapped the overlap in the z-dimension, + * we only have to communicate 0 to nkz (not pmegrid_nz). + */ + if (direction == GMX_SUM_QGRID_FORWARD) + { + send_id = overlap->send_id[ipulse]; + recv_id = overlap->recv_id[ipulse]; + send_index0 = overlap->comm_data[ipulse].send_index0; + send_nindex = overlap->comm_data[ipulse].send_nindex; + recv_index0 = overlap->comm_data[ipulse].recv_index0; + recv_nindex = overlap->comm_data[ipulse].recv_nindex; + } + else + { + send_id = overlap->recv_id[ipulse]; + recv_id = overlap->send_id[ipulse]; + send_index0 = overlap->comm_data[ipulse].recv_index0; + send_nindex = overlap->comm_data[ipulse].recv_nindex; + recv_index0 = overlap->comm_data[ipulse].send_index0; + recv_nindex = overlap->comm_data[ipulse].send_nindex; + } + + /* Copy data to contiguous send buffer */ + if (debug) + { + fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n", + pme->nodeid, overlap->nodeid, send_id, + pme->pmegrid_start_iy, + send_index0-pme->pmegrid_start_iy, + send_index0-pme->pmegrid_start_iy+send_nindex); + } + icnt = 0; + for (i = 0; i < pme->pmegrid_nx; i++) + { + ix = i; + for (j = 0; j < send_nindex; j++) + { + iy = j + send_index0 - pme->pmegrid_start_iy; + for (k = 0; k < pme->nkz; k++) + { + iz = k; + overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]; + } + } + } + + datasize = pme->pmegrid_nx * pme->nkz; + + MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL, + send_id, ipulse, + overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL, + recv_id, ipulse, + overlap->mpi_comm, &stat); + + /* Get data from contiguous recv buffer */ + if (debug) + { + fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n", + pme->nodeid, overlap->nodeid, recv_id, + pme->pmegrid_start_iy, + recv_index0-pme->pmegrid_start_iy, + recv_index0-pme->pmegrid_start_iy+recv_nindex); + } + icnt = 0; + for (i = 0; i < pme->pmegrid_nx; i++) + { + ix = i; 
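+            /* The grid is stored row-major with z fastest, so element
+             * (ix,iy,iz) lives at
+             *     ix*(pmegrid_ny*pmegrid_nz) + iy*pmegrid_nz + iz,
+             * and the receive buffer is unpacked here in the same
+             * (j,k) order in which the send buffer was filled above,
+             * so icnt can simply run linearly over it.
+             */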
+ for (j = 0; j < recv_nindex; j++) + { + iy = j + recv_index0 - pme->pmegrid_start_iy; + for (k = 0; k < pme->nkz; k++) + { + iz = k; + if (direction == GMX_SUM_QGRID_FORWARD) + { + grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++]; + } + else + { + grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++]; + } + } + } + } + } + + /* Major dimension is easier, no copying required, + * but we might have to sum to separate array. + * Since we don't copy, we have to communicate up to pmegrid_nz, + * not nkz as for the minor direction. + */ + overlap = &pme->overlap[0]; + + for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++) + { + if (direction == GMX_SUM_QGRID_FORWARD) + { + send_id = overlap->send_id[ipulse]; + recv_id = overlap->recv_id[ipulse]; + send_index0 = overlap->comm_data[ipulse].send_index0; + send_nindex = overlap->comm_data[ipulse].send_nindex; + recv_index0 = overlap->comm_data[ipulse].recv_index0; + recv_nindex = overlap->comm_data[ipulse].recv_nindex; + recvptr = overlap->recvbuf; + } + else + { + send_id = overlap->recv_id[ipulse]; + recv_id = overlap->send_id[ipulse]; + send_index0 = overlap->comm_data[ipulse].recv_index0; + send_nindex = overlap->comm_data[ipulse].recv_nindex; + recv_index0 = overlap->comm_data[ipulse].send_index0; + recv_nindex = overlap->comm_data[ipulse].send_nindex; + recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz); + } + + sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz); + datasize = pme->pmegrid_ny * pme->pmegrid_nz; + + if (debug) + { + fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n", + pme->nodeid, overlap->nodeid, send_id, + pme->pmegrid_start_ix, + send_index0-pme->pmegrid_start_ix, + send_index0-pme->pmegrid_start_ix+send_nindex); + fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n", + pme->nodeid, overlap->nodeid, recv_id, + pme->pmegrid_start_ix, + recv_index0-pme->pmegrid_start_ix, + recv_index0-pme->pmegrid_start_ix+recv_nindex); + } + + MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL, + send_id, ipulse, + recvptr, recv_nindex*datasize, GMX_MPI_REAL, + recv_id, ipulse, + overlap->mpi_comm, &stat); + + /* ADD data from contiguous recv buffer */ + if (direction == GMX_SUM_QGRID_FORWARD) + { + p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz); + for (i = 0; i < recv_nindex*datasize; i++) + { + p[i] += overlap->recvbuf[i]; + } + } + } +} +#endif + + +static int +copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid) +{ + ivec local_fft_ndata, local_fft_offset, local_fft_size; + ivec local_pme_size; + int i, ix, iy, iz; + int pmeidx, fftidx; + + /* Dimensions should be identical for A/B grid, so we just use A here */ + gmx_parallel_3dfft_real_limits(pme->pfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + + local_pme_size[0] = pme->pmegrid_nx; + local_pme_size[1] = pme->pmegrid_ny; + local_pme_size[2] = pme->pmegrid_nz; + + /* The fftgrid is always 'justified' to the lower-left corner of the PME grid, + the offset is identical, and the PME grid always has more data (due to overlap) + */ + { +#ifdef DEBUG_PME + FILE *fp, *fp2; + char fn[STRLEN], format[STRLEN]; + real val; + sprintf(fn, "pmegrid%d.pdb", pme->nodeid); + fp = ffopen(fn, "w"); + sprintf(fn, "pmegrid%d.txt", pme->nodeid); + fp2 = ffopen(fn, "w"); + sprintf(format, "%s%s\n", 
pdbformat, "%6.2f%6.2f"); +#endif + + for (ix = 0; ix < local_fft_ndata[XX]; ix++) + { + for (iy = 0; iy < local_fft_ndata[YY]; iy++) + { + for (iz = 0; iz < local_fft_ndata[ZZ]; iz++) + { + pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz; + fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz; + fftgrid[fftidx] = pmegrid[pmeidx]; +#ifdef DEBUG_PME + val = 100*pmegrid[pmeidx]; + if (pmegrid[pmeidx] != 0) + { + fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ', + 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val); + } + if (pmegrid[pmeidx] != 0) + { + fprintf(fp2, "%-12s %5d %5d %5d %12.5e\n", + "qgrid", + pme->pmegrid_start_ix + ix, + pme->pmegrid_start_iy + iy, + pme->pmegrid_start_iz + iz, + pmegrid[pmeidx]); + } +#endif + } + } + } +#ifdef DEBUG_PME + ffclose(fp); + ffclose(fp2); +#endif + } + return 0; +} + + +static gmx_cycles_t omp_cyc_start() +{ + return gmx_cycles_read(); +} + +static gmx_cycles_t omp_cyc_end(gmx_cycles_t c) +{ + return gmx_cycles_read() - c; +} + + +static int +copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, + int nthread, int thread) +{ + ivec local_fft_ndata, local_fft_offset, local_fft_size; + ivec local_pme_size; + int ixy0, ixy1, ixy, ix, iy, iz; + int pmeidx, fftidx; +#ifdef PME_TIME_THREADS + gmx_cycles_t c1; + static double cs1 = 0; + static int cnt = 0; +#endif + +#ifdef PME_TIME_THREADS + c1 = omp_cyc_start(); +#endif + /* Dimensions should be identical for A/B grid, so we just use A here */ + gmx_parallel_3dfft_real_limits(pme->pfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + + local_pme_size[0] = pme->pmegrid_nx; + local_pme_size[1] = pme->pmegrid_ny; + local_pme_size[2] = pme->pmegrid_nz; + + /* The fftgrid is always 'justified' to the lower-left corner of the PME grid, + the offset is identical, and the PME grid always has more data (due to overlap) + */ + ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread; + ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread; + + for (ixy = ixy0; ixy < ixy1; ixy++) + { + ix = ixy/local_fft_ndata[YY]; + iy = ixy - ix*local_fft_ndata[YY]; + + pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ]; + fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ]; + for (iz = 0; iz < local_fft_ndata[ZZ]; iz++) + { + pmegrid[pmeidx+iz] = fftgrid[fftidx+iz]; + } + } + +#ifdef PME_TIME_THREADS + c1 = omp_cyc_end(c1); + cs1 += (double)c1; + cnt++; + if (cnt % 20 == 0) + { + printf("copy %.2f\n", cs1*1e-9); + } +#endif + + return 0; +} + + +static void +wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid) +{ + int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz; + + nx = pme->nkx; + ny = pme->nky; + nz = pme->nkz; + + pnx = pme->pmegrid_nx; + pny = pme->pmegrid_ny; + pnz = pme->pmegrid_nz; + + overlap = pme->pme_order - 1; + + /* Add periodic overlap in z */ + for (ix = 0; ix < pme->pmegrid_nx; ix++) + { + for (iy = 0; iy < pme->pmegrid_ny; iy++) + { + for (iz = 0; iz < overlap; iz++) + { + pmegrid[(ix*pny+iy)*pnz+iz] += + pmegrid[(ix*pny+iy)*pnz+nz+iz]; + } + } + } + + if (pme->nnodes_minor == 1) + { + for (ix = 0; ix < pme->pmegrid_nx; ix++) + { + for (iy = 0; iy < overlap; iy++) + { + for (iz = 0; iz < nz; iz++) + { + pmegrid[(ix*pny+iy)*pnz+iz] += + pmegrid[(ix*pny+ny+iy)*pnz+iz]; + } + } + } + } + + if (pme->nnodes_major == 1) + { + ny_x = (pme->nnodes_minor == 1 ? 
ny : pme->pmegrid_ny); + + for (ix = 0; ix < overlap; ix++) + { + for (iy = 0; iy < ny_x; iy++) + { + for (iz = 0; iz < nz; iz++) + { + pmegrid[(ix*pny+iy)*pnz+iz] += + pmegrid[((nx+ix)*pny+iy)*pnz+iz]; + } + } + } + } +} + + +static void +unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid) +{ + int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix; + + nx = pme->nkx; + ny = pme->nky; + nz = pme->nkz; + + pnx = pme->pmegrid_nx; + pny = pme->pmegrid_ny; + pnz = pme->pmegrid_nz; + + overlap = pme->pme_order - 1; + + if (pme->nnodes_major == 1) + { + ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny); + + for (ix = 0; ix < overlap; ix++) + { + int iy, iz; + + for (iy = 0; iy < ny_x; iy++) + { + for (iz = 0; iz < nz; iz++) + { + pmegrid[((nx+ix)*pny+iy)*pnz+iz] = + pmegrid[(ix*pny+iy)*pnz+iz]; + } + } + } + } + + if (pme->nnodes_minor == 1) + { +#pragma omp parallel for num_threads(pme->nthread) schedule(static) + for (ix = 0; ix < pme->pmegrid_nx; ix++) + { + int iy, iz; + + for (iy = 0; iy < overlap; iy++) + { + for (iz = 0; iz < nz; iz++) + { + pmegrid[(ix*pny+ny+iy)*pnz+iz] = + pmegrid[(ix*pny+iy)*pnz+iz]; + } + } + } + } + + /* Copy periodic overlap in z */ +#pragma omp parallel for num_threads(pme->nthread) schedule(static) + for (ix = 0; ix < pme->pmegrid_nx; ix++) + { + int iy, iz; + + for (iy = 0; iy < pme->pmegrid_ny; iy++) + { + for (iz = 0; iz < overlap; iz++) + { + pmegrid[(ix*pny+iy)*pnz+nz+iz] = + pmegrid[(ix*pny+iy)*pnz+iz]; + } + } + } +} + + +/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */ +#define DO_BSPLINE(order) \ + for (ithx = 0; (ithx < order); ithx++) \ + { \ + index_x = (i0+ithx)*pny*pnz; \ + valx = qn*thx[ithx]; \ + \ + for (ithy = 0; (ithy < order); ithy++) \ + { \ + valxy = valx*thy[ithy]; \ + index_xy = index_x+(j0+ithy)*pnz; \ + \ + for (ithz = 0; (ithz < order); ithz++) \ + { \ + index_xyz = index_xy+(k0+ithz); \ + grid[index_xyz] += valxy*thz[ithz]; \ + } \ + } \ + } + + +static void spread_q_bsplines_thread(pmegrid_t *pmegrid, + pme_atomcomm_t *atc, splinedata_t *spline, + pme_spline_work_t *work) +{ + + /* spread charges from home atoms to local grid */ + real *grid; + pme_overlap_t *ol; + int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0; + int * idxptr; + int order, norder, index_x, index_xy, index_xyz; + real valx, valxy, qn; + real *thx, *thy, *thz; + int localsize, bndsize; + int pnx, pny, pnz, ndatatot; + int offx, offy, offz; + + pnx = pmegrid->s[XX]; + pny = pmegrid->s[YY]; + pnz = pmegrid->s[ZZ]; + + offx = pmegrid->offset[XX]; + offy = pmegrid->offset[YY]; + offz = pmegrid->offset[ZZ]; + + ndatatot = pnx*pny*pnz; + grid = pmegrid->grid; + for (i = 0; i < ndatatot; i++) + { + grid[i] = 0; + } + + order = pmegrid->order; + + for (nn = 0; nn < spline->n; nn++) + { + n = spline->ind[nn]; + qn = atc->q[n]; + + if (qn != 0) + { + idxptr = atc->idx[n]; + norder = nn*order; + + i0 = idxptr[XX] - offx; + j0 = idxptr[YY] - offy; + k0 = idxptr[ZZ] - offz; + + thx = spline->theta[XX] + norder; + thy = spline->theta[YY] + norder; + thz = spline->theta[ZZ] + norder; + + switch (order) + { + case 4: +#ifdef PME_SSE_SPREAD_GATHER +#ifdef PME_SSE_UNALIGNED +#define PME_SPREAD_SSE_ORDER4 +#else +#define PME_SPREAD_SSE_ALIGNED +#define PME_ORDER 4 +#endif +#include "pme_sse_single.h" +#else + DO_BSPLINE(4); +#endif + break; + case 5: +#ifdef PME_SSE_SPREAD_GATHER +#define PME_SPREAD_SSE_ALIGNED +#define PME_ORDER 5 +#include "pme_sse_single.h" +#else + DO_BSPLINE(5); +#endif + break; + default: + DO_BSPLINE(order); + 
break; + } + } + } +} + +static void set_grid_alignment(int *pmegrid_nz, int pme_order) +{ +#ifdef PME_SSE_SPREAD_GATHER + if (pme_order == 5 +#ifndef PME_SSE_UNALIGNED + || pme_order == 4 +#endif + ) + { + /* Round nz up to a multiple of 4 to ensure alignment */ + *pmegrid_nz = ((*pmegrid_nz + 3) & ~3); + } +#endif +} + +static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order) +{ +#ifdef PME_SSE_SPREAD_GATHER +#ifndef PME_SSE_UNALIGNED + if (pme_order == 4) + { + /* Add extra elements to ensured aligned operations do not go + * beyond the allocated grid size. + * Note that for pme_order=5, the pme grid z-size alignment + * ensures that we will not go beyond the grid size. + */ + *gridsize += 4; + } +#endif +#endif +} + +static void pmegrid_init(pmegrid_t *grid, + int cx, int cy, int cz, + int x0, int y0, int z0, + int x1, int y1, int z1, + gmx_bool set_alignment, + int pme_order, + real *ptr) +{ + int nz, gridsize; + + grid->ci[XX] = cx; + grid->ci[YY] = cy; + grid->ci[ZZ] = cz; + grid->offset[XX] = x0; + grid->offset[YY] = y0; + grid->offset[ZZ] = z0; + grid->n[XX] = x1 - x0 + pme_order - 1; + grid->n[YY] = y1 - y0 + pme_order - 1; + grid->n[ZZ] = z1 - z0 + pme_order - 1; + copy_ivec(grid->n, grid->s); + + nz = grid->s[ZZ]; + set_grid_alignment(&nz, pme_order); + if (set_alignment) + { + grid->s[ZZ] = nz; + } + else if (nz != grid->s[ZZ]) + { + gmx_incons("pmegrid_init call with an unaligned z size"); + } + + grid->order = pme_order; + if (ptr == NULL) + { + gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ]; + set_gridsize_alignment(&gridsize, pme_order); + snew_aligned(grid->grid, gridsize, 16); + } + else + { + grid->grid = ptr; + } +} + +static int div_round_up(int enumerator, int denominator) +{ + return (enumerator + denominator - 1)/denominator; +} + +static void make_subgrid_division(const ivec n, int ovl, int nthread, + ivec nsub) +{ + int gsize_opt, gsize; + int nsx, nsy, nsz; + char *env; + + gsize_opt = -1; + for (nsx = 1; nsx <= nthread; nsx++) + { + if (nthread % nsx == 0) + { + for (nsy = 1; nsy <= nthread; nsy++) + { + if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0) + { + nsz = nthread/(nsx*nsy); + + /* Determine the number of grid points per thread */ + gsize = + (div_round_up(n[XX], nsx) + ovl)* + (div_round_up(n[YY], nsy) + ovl)* + (div_round_up(n[ZZ], nsz) + ovl); + + /* Minimize the number of grids points per thread + * and, secondarily, the number of cuts in minor dimensions. 
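+ *
+ * The quantity minimized is the per-thread grid volume including the
+ * spreading overlap, i.e. for a candidate division (nsx,nsy,nsz):
+ *
+ *   gsize = (ceil(n[XX]/nsx) + ovl)
+ *         * (ceil(n[YY]/nsy) + ovl)
+ *         * (ceil(n[ZZ]/nsz) + ovl)
+ *
+ * with ovl = pme_order - 1.  Ties are broken in favour of fewer cuts
+ * along z, then y, i.e. along the minor dimensions.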
+ */ + if (gsize_opt == -1 || + gsize < gsize_opt || + (gsize == gsize_opt && + (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY])))) + { + nsub[XX] = nsx; + nsub[YY] = nsy; + nsub[ZZ] = nsz; + gsize_opt = gsize; + } + } + } + } + } + + env = getenv("GMX_PME_THREAD_DIVISION"); + if (env != NULL) + { + sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]); + } + + if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread) + { + gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread); + } +} + +static void pmegrids_init(pmegrids_t *grids, + int nx, int ny, int nz, int nz_base, + int pme_order, + gmx_bool bUseThreads, + int nthread, + int overlap_x, + int overlap_y) +{ + ivec n, n_base, g0, g1; + int t, x, y, z, d, i, tfac; + int max_comm_lines = -1; + + n[XX] = nx - (pme_order - 1); + n[YY] = ny - (pme_order - 1); + n[ZZ] = nz - (pme_order - 1); + + copy_ivec(n, n_base); + n_base[ZZ] = nz_base; + + pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order, + NULL); + + grids->nthread = nthread; + + make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc); + + if (bUseThreads) + { + ivec nst; + int gridsize; + + for (d = 0; d < DIM; d++) + { + nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1; + } + set_grid_alignment(&nst[ZZ], pme_order); + + if (debug) + { + fprintf(debug, "pmegrid thread local division: %d x %d x %d\n", + grids->nc[XX], grids->nc[YY], grids->nc[ZZ]); + fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n", + nx, ny, nz, + nst[XX], nst[YY], nst[ZZ]); + } + + snew(grids->grid_th, grids->nthread); + t = 0; + gridsize = nst[XX]*nst[YY]*nst[ZZ]; + set_gridsize_alignment(&gridsize, pme_order); + snew_aligned(grids->grid_all, + grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP, + 16); + + for (x = 0; x < grids->nc[XX]; x++) + { + for (y = 0; y < grids->nc[YY]; y++) + { + for (z = 0; z < grids->nc[ZZ]; z++) + { + pmegrid_init(&grids->grid_th[t], + x, y, z, + (n[XX]*(x ))/grids->nc[XX], + (n[YY]*(y ))/grids->nc[YY], + (n[ZZ]*(z ))/grids->nc[ZZ], + (n[XX]*(x+1))/grids->nc[XX], + (n[YY]*(y+1))/grids->nc[YY], + (n[ZZ]*(z+1))/grids->nc[ZZ], + TRUE, + pme_order, + grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP)); + t++; + } + } + } + } + else + { + grids->grid_th = NULL; + } + + snew(grids->g2t, DIM); + tfac = 1; + for (d = DIM-1; d >= 0; d--) + { + snew(grids->g2t[d], n[d]); + t = 0; + for (i = 0; i < n[d]; i++) + { + /* The second check should match the parameters + * of the pmegrid_init call above. + */ + while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d]) + { + t++; + } + grids->g2t[d][i] = t*tfac; + } + + tfac *= grids->nc[d]; + + switch (d) + { + case XX: max_comm_lines = overlap_x; break; + case YY: max_comm_lines = overlap_y; break; + case ZZ: max_comm_lines = pme_order - 1; break; + } + grids->nthread_comm[d] = 0; + while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines && + grids->nthread_comm[d] < grids->nc[d]) + { + grids->nthread_comm[d]++; + } + if (debug != NULL) + { + fprintf(debug, "pmegrid thread grid communication range in %c: %d\n", + 'x'+d, grids->nthread_comm[d]); + } + /* It should be possible to make grids->nthread_comm[d]==grids->nc[d] + * work, but this is not a problematic restriction. 
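+ *
+ * With g2t set up this way, the thread owning grid cell (ix,iy,iz) is
+ * found by summing the three per-dimension table entries, exactly as
+ * calc_interpolation_idx() does:
+ *
+ *   thread = g2t[XX][ix] + g2t[YY][iy] + g2t[ZZ][iz];
+ *
+ * because g2t[d][] already includes the stride factor tfac for its
+ * dimension (1 for z, nc[ZZ] for y, nc[YY]*nc[ZZ] for x).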
+ */ + if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d]) + { + gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread); + } + } +} + + +static void pmegrids_destroy(pmegrids_t *grids) +{ + int t; + + if (grids->grid.grid != NULL) + { + sfree(grids->grid.grid); + + if (grids->nthread > 0) + { + for (t = 0; t < grids->nthread; t++) + { + sfree(grids->grid_th[t].grid); + } + sfree(grids->grid_th); + } + } +} + + +static void realloc_work(pme_work_t *work, int nkx) +{ + if (nkx > work->nalloc) + { + work->nalloc = nkx; + srenew(work->mhx, work->nalloc); + srenew(work->mhy, work->nalloc); + srenew(work->mhz, work->nalloc); + srenew(work->m2, work->nalloc); + /* Allocate an aligned pointer for SIMD operations, including extra + * elements at the end for padding. + */ +#ifdef PME_SIMD +#define ALIGN_HERE GMX_SIMD_WIDTH_HERE +#else +/* We can use any alignment, apart from 0, so we use 4 */ +#define ALIGN_HERE 4 +#endif + sfree_aligned(work->denom); + sfree_aligned(work->tmp1); + sfree_aligned(work->eterm); + snew_aligned(work->denom, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real)); + snew_aligned(work->tmp1, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real)); + snew_aligned(work->eterm, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real)); + srenew(work->m2inv, work->nalloc); + } +} + + +static void free_work(pme_work_t *work) +{ + sfree(work->mhx); + sfree(work->mhy); + sfree(work->mhz); + sfree(work->m2); + sfree_aligned(work->denom); + sfree_aligned(work->tmp1); + sfree_aligned(work->eterm); + sfree(work->m2inv); +} + + +#ifdef PME_SIMD +/* Calculate exponentials through SIMD */ +inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned) +{ + { + const gmx_mm_pr two = gmx_set1_pr(2.0); + gmx_mm_pr f_simd; + gmx_mm_pr lu; + gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e; + int kx; + f_simd = gmx_load1_pr(&f); + for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE) + { + tmp_d1 = gmx_load_pr(d_aligned+kx); + d_inv = gmx_inv_pr(tmp_d1); + tmp_r = gmx_load_pr(r_aligned+kx); + tmp_r = gmx_exp_pr(tmp_r); + tmp_e = gmx_mul_pr(f_simd, d_inv); + tmp_e = gmx_mul_pr(tmp_e, tmp_r); + gmx_store_pr(e_aligned+kx, tmp_e); + } + } +} +#else +inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e) +{ + int kx; + for (kx = start; kx < end; kx++) + { + d[kx] = 1.0/d[kx]; + } + for (kx = start; kx < end; kx++) + { + r[kx] = exp(r[kx]); + } + for (kx = start; kx < end; kx++) + { + e[kx] = f*r[kx]*d[kx]; + } +} +#endif + + +static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid, + real ewaldcoeff, real vol, + gmx_bool bEnerVir, + int nthread, int thread) +{ + /* do recip sum over local cells in grid */ + /* y major, z middle, x minor or continuous */ + t_complex *p0; + int kx, ky, kz, maxkx, maxky, maxkz; + int nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend; + real mx, my, mz; + real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff); + real ets2, struct2, vfactor, ets2vf; + real d1, d2, energy = 0; + real by, bz; + real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0; + real rxx, ryx, ryy, rzx, rzy, rzz; + pme_work_t *work; + real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv; + real mhxk, mhyk, mhzk, m2k; + real corner_fac; + ivec complex_order; + ivec local_ndata, local_offset, local_size; + real elfac; + + elfac = ONE_4PI_EPS0/pme->epsilon_r; + + nx = pme->nkx; + ny = pme->nky; + nz = pme->nkz; + + /* Dimensions 
should be identical for A/B grid, so we just use A here */ + gmx_parallel_3dfft_complex_limits(pme->pfft_setupA, + complex_order, + local_ndata, + local_offset, + local_size); + + rxx = pme->recipbox[XX][XX]; + ryx = pme->recipbox[YY][XX]; + ryy = pme->recipbox[YY][YY]; + rzx = pme->recipbox[ZZ][XX]; + rzy = pme->recipbox[ZZ][YY]; + rzz = pme->recipbox[ZZ][ZZ]; + + maxkx = (nx+1)/2; + maxky = (ny+1)/2; + maxkz = nz/2+1; + + work = &pme->work[thread]; + mhx = work->mhx; + mhy = work->mhy; + mhz = work->mhz; + m2 = work->m2; + denom = work->denom; + tmp1 = work->tmp1; + eterm = work->eterm; + m2inv = work->m2inv; + + iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread; + iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread; + + for (iyz = iyz0; iyz < iyz1; iyz++) + { + iy = iyz/local_ndata[ZZ]; + iz = iyz - iy*local_ndata[ZZ]; + + ky = iy + local_offset[YY]; + + if (ky < maxky) + { + my = ky; + } + else + { + my = (ky - ny); + } + + by = M_PI*vol*pme->bsp_mod[YY][ky]; + + kz = iz + local_offset[ZZ]; + + mz = kz; + + bz = pme->bsp_mod[ZZ][kz]; + + /* 0.5 correction for corner points */ + corner_fac = 1; + if (kz == 0 || kz == (nz+1)/2) + { + corner_fac = 0.5; + } + + p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX]; + + /* We should skip the k-space point (0,0,0) */ + if (local_offset[XX] > 0 || ky > 0 || kz > 0) + { + kxstart = local_offset[XX]; + } + else + { + kxstart = local_offset[XX] + 1; + p0++; + } + kxend = local_offset[XX] + local_ndata[XX]; + + if (bEnerVir) + { + /* More expensive inner loop, especially because of the storage + * of the mh elements in array's. + * Because x is the minor grid index, all mh elements + * depend on kx for triclinic unit cells. + */ + + /* Two explicit loops to avoid a conditional inside the loop */ + for (kx = kxstart; kx < maxkx; kx++) + { + mx = kx; + + mhxk = mx * rxx; + mhyk = mx * ryx + my * ryy; + mhzk = mx * rzx + my * rzy + mz * rzz; + m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk; + mhx[kx] = mhxk; + mhy[kx] = mhyk; + mhz[kx] = mhzk; + m2[kx] = m2k; + denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx]; + tmp1[kx] = -factor*m2k; + } + + for (kx = maxkx; kx < kxend; kx++) + { + mx = (kx - nx); + + mhxk = mx * rxx; + mhyk = mx * ryx + my * ryy; + mhzk = mx * rzx + my * rzy + mz * rzz; + m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk; + mhx[kx] = mhxk; + mhy[kx] = mhyk; + mhz[kx] = mhzk; + m2[kx] = m2k; + denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx]; + tmp1[kx] = -factor*m2k; + } + + for (kx = kxstart; kx < kxend; kx++) + { + m2inv[kx] = 1.0/m2[kx]; + } + + calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm); + + for (kx = kxstart; kx < kxend; kx++, p0++) + { + d1 = p0->re; + d2 = p0->im; + + p0->re = d1*eterm[kx]; + p0->im = d2*eterm[kx]; + + struct2 = 2.0*(d1*d1+d2*d2); + + tmp1[kx] = eterm[kx]*struct2; + } + + for (kx = kxstart; kx < kxend; kx++) + { + ets2 = corner_fac*tmp1[kx]; + vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx]; + energy += ets2; + + ets2vf = ets2*vfactor; + virxx += ets2vf*mhx[kx]*mhx[kx] - ets2; + virxy += ets2vf*mhx[kx]*mhy[kx]; + virxz += ets2vf*mhx[kx]*mhz[kx]; + viryy += ets2vf*mhy[kx]*mhy[kx] - ets2; + viryz += ets2vf*mhy[kx]*mhz[kx]; + virzz += ets2vf*mhz[kx]*mhz[kx] - ets2; + } + } + else + { + /* We don't need to calculate the energy and the virial. + * In this case the triclinic overhead is small. 
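+ *
+ * In both branches every complex grid element is scaled by the same
+ * scalar influence function.  Written out per reciprocal vector m (a
+ * worked form of the loops below, with beta the Ewald coefficient, V
+ * the box volume and b_x, b_y, b_z the B-spline moduli):
+ *
+ *   eterm(m) = 1/(4 pi eps0 eps_r)
+ *              * exp(-pi^2 |m|^2 / beta^2)
+ *              / (pi V |m|^2 b_x(mx) b_y(my) b_z(mz))
+ *
+ * In the code, factor = pi^2/beta^2 enters through tmp1, pi*V and the
+ * moduli enter through denom (via by and bz), and calc_exponentials()
+ * combines them as eterm = elfac * exp(tmp1) / denom.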
+ */ + + /* Two explicit loops to avoid a conditional inside the loop */ + + for (kx = kxstart; kx < maxkx; kx++) + { + mx = kx; + + mhxk = mx * rxx; + mhyk = mx * ryx + my * ryy; + mhzk = mx * rzx + my * rzy + mz * rzz; + m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk; + denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx]; + tmp1[kx] = -factor*m2k; + } + + for (kx = maxkx; kx < kxend; kx++) + { + mx = (kx - nx); + + mhxk = mx * rxx; + mhyk = mx * ryx + my * ryy; + mhzk = mx * rzx + my * rzy + mz * rzz; + m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk; + denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx]; + tmp1[kx] = -factor*m2k; + } + + calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm); + + for (kx = kxstart; kx < kxend; kx++, p0++) + { + d1 = p0->re; + d2 = p0->im; + + p0->re = d1*eterm[kx]; + p0->im = d2*eterm[kx]; + } + } + } + + if (bEnerVir) + { + /* Update virial with local values. + * The virial is symmetric by definition. + * this virial seems ok for isotropic scaling, but I'm + * experiencing problems on semiisotropic membranes. + * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07). + */ + work->vir[XX][XX] = 0.25*virxx; + work->vir[YY][YY] = 0.25*viryy; + work->vir[ZZ][ZZ] = 0.25*virzz; + work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy; + work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz; + work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz; + + /* This energy should be corrected for a charged system */ + work->energy = 0.5*energy; + } + + /* Return the loop count */ + return local_ndata[YY]*local_ndata[XX]; +} + +static void get_pme_ener_vir(const gmx_pme_t pme, int nthread, + real *mesh_energy, matrix vir) +{ + /* This function sums output over threads + * and should therefore only be called after thread synchronization. + */ + int thread; + + *mesh_energy = pme->work[0].energy; + copy_mat(pme->work[0].vir, vir); + + for (thread = 1; thread < nthread; thread++) + { + *mesh_energy += pme->work[thread].energy; + m_add(vir, pme->work[thread].vir, vir); + } +} + +#define DO_FSPLINE(order) \ + for (ithx = 0; (ithx < order); ithx++) \ + { \ + index_x = (i0+ithx)*pny*pnz; \ + tx = thx[ithx]; \ + dx = dthx[ithx]; \ + \ + for (ithy = 0; (ithy < order); ithy++) \ + { \ + index_xy = index_x+(j0+ithy)*pnz; \ + ty = thy[ithy]; \ + dy = dthy[ithy]; \ + fxy1 = fz1 = 0; \ + \ + for (ithz = 0; (ithz < order); ithz++) \ + { \ + gval = grid[index_xy+(k0+ithz)]; \ + fxy1 += thz[ithz]*gval; \ + fz1 += dthz[ithz]*gval; \ + } \ + fx += dx*ty*fxy1; \ + fy += tx*dy*fxy1; \ + fz += tx*ty*fz1; \ + } \ + } + + +static void gather_f_bsplines(gmx_pme_t pme, real *grid, + gmx_bool bClearF, pme_atomcomm_t *atc, + splinedata_t *spline, + real scale) +{ + /* sum forces for local particles */ + int nn, n, ithx, ithy, ithz, i0, j0, k0; + int index_x, index_xy; + int nx, ny, nz, pnx, pny, pnz; + int * idxptr; + real tx, ty, dx, dy, qn; + real fx, fy, fz, gval; + real fxy1, fz1; + real *thx, *thy, *thz, *dthx, *dthy, *dthz; + int norder; + real rxx, ryx, ryy, rzx, rzy, rzz; + int order; + + pme_spline_work_t *work; + + work = pme->spline_work; + + order = pme->pme_order; + thx = spline->theta[XX]; + thy = spline->theta[YY]; + thz = spline->theta[ZZ]; + dthx = spline->dtheta[XX]; + dthy = spline->dtheta[YY]; + dthz = spline->dtheta[ZZ]; + nx = pme->nkx; + ny = pme->nky; + nz = pme->nkz; + pnx = pme->pmegrid_nx; + pny = pme->pmegrid_ny; + pnz = pme->pmegrid_nz; + + rxx = pme->recipbox[XX][XX]; + ryx = pme->recipbox[YY][XX]; + ryy = pme->recipbox[YY][YY]; + rzx = pme->recipbox[ZZ][XX]; + rzy = pme->recipbox[ZZ][YY]; + rzz = 
pme->recipbox[ZZ][ZZ]; + + for (nn = 0; nn < spline->n; nn++) + { + n = spline->ind[nn]; + qn = scale*atc->q[n]; + + if (bClearF) + { + atc->f[n][XX] = 0; + atc->f[n][YY] = 0; + atc->f[n][ZZ] = 0; + } + if (qn != 0) + { + fx = 0; + fy = 0; + fz = 0; + idxptr = atc->idx[n]; + norder = nn*order; + + i0 = idxptr[XX]; + j0 = idxptr[YY]; + k0 = idxptr[ZZ]; + + /* Pointer arithmetic alert, next six statements */ + thx = spline->theta[XX] + norder; + thy = spline->theta[YY] + norder; + thz = spline->theta[ZZ] + norder; + dthx = spline->dtheta[XX] + norder; + dthy = spline->dtheta[YY] + norder; + dthz = spline->dtheta[ZZ] + norder; + + switch (order) + { + case 4: +#ifdef PME_SSE_SPREAD_GATHER +#ifdef PME_SSE_UNALIGNED +#define PME_GATHER_F_SSE_ORDER4 +#else +#define PME_GATHER_F_SSE_ALIGNED +#define PME_ORDER 4 +#endif +#include "pme_sse_single.h" +#else + DO_FSPLINE(4); +#endif + break; + case 5: +#ifdef PME_SSE_SPREAD_GATHER +#define PME_GATHER_F_SSE_ALIGNED +#define PME_ORDER 5 +#include "pme_sse_single.h" +#else + DO_FSPLINE(5); +#endif + break; + default: + DO_FSPLINE(order); + break; + } + + atc->f[n][XX] += -qn*( fx*nx*rxx ); + atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy ); + atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz ); + } + } + /* Since the energy and not forces are interpolated + * the net force might not be exactly zero. + * This can be solved by also interpolating F, but + * that comes at a cost. + * A better hack is to remove the net force every + * step, but that must be done at a higher level + * since this routine doesn't see all atoms if running + * in parallel. Don't know how important it is? EL 990726 + */ +} + + +static real gather_energy_bsplines(gmx_pme_t pme, real *grid, + pme_atomcomm_t *atc) +{ + splinedata_t *spline; + int n, ithx, ithy, ithz, i0, j0, k0; + int index_x, index_xy; + int * idxptr; + real energy, pot, tx, ty, qn, gval; + real *thx, *thy, *thz; + int norder; + int order; + + spline = &atc->spline[0]; + + order = pme->pme_order; + + energy = 0; + for (n = 0; (n < atc->n); n++) + { + qn = atc->q[n]; + + if (qn != 0) + { + idxptr = atc->idx[n]; + norder = n*order; + + i0 = idxptr[XX]; + j0 = idxptr[YY]; + k0 = idxptr[ZZ]; + + /* Pointer arithmetic alert, next three statements */ + thx = spline->theta[XX] + norder; + thy = spline->theta[YY] + norder; + thz = spline->theta[ZZ] + norder; + + pot = 0; + for (ithx = 0; (ithx < order); ithx++) + { + index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz; + tx = thx[ithx]; + + for (ithy = 0; (ithy < order); ithy++) + { + index_xy = index_x+(j0+ithy)*pme->pmegrid_nz; + ty = thy[ithy]; + + for (ithz = 0; (ithz < order); ithz++) + { + gval = grid[index_xy+(k0+ithz)]; + pot += tx*ty*thz[ithz]*gval; + } + + } + } + + energy += pot*qn; + } + } + + return energy; +} + +/* Macro to force loop unrolling by fixing order. + * This gives a significant performance gain. 
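+ *
+ * The point is that callers dispatch on the run-time order once, so
+ * each macro expansion sees a compile-time constant trip count and can
+ * be fully unrolled, as in make_bsplines() below:
+ *
+ *   switch (order)
+ *   {
+ *       case 4:  CALC_SPLINE(4);     break;
+ *       case 5:  CALC_SPLINE(5);     break;
+ *       default: CALC_SPLINE(order); break;
+ *   }
+ *
+ * Only the default branch keeps the generic, non-unrolled loops.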
+ */ +#define CALC_SPLINE(order) \ + { \ + int j, k, l; \ + real dr, div; \ + real data[PME_ORDER_MAX]; \ + real ddata[PME_ORDER_MAX]; \ + \ + for (j = 0; (j < DIM); j++) \ + { \ + dr = xptr[j]; \ + \ + /* dr is relative offset from lower cell limit */ \ + data[order-1] = 0; \ + data[1] = dr; \ + data[0] = 1 - dr; \ + \ + for (k = 3; (k < order); k++) \ + { \ + div = 1.0/(k - 1.0); \ + data[k-1] = div*dr*data[k-2]; \ + for (l = 1; (l < (k-1)); l++) \ + { \ + data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \ + data[k-l-1]); \ + } \ + data[0] = div*(1-dr)*data[0]; \ + } \ + /* differentiate */ \ + ddata[0] = -data[0]; \ + for (k = 1; (k < order); k++) \ + { \ + ddata[k] = data[k-1] - data[k]; \ + } \ + \ + div = 1.0/(order - 1); \ + data[order-1] = div*dr*data[order-2]; \ + for (l = 1; (l < (order-1)); l++) \ + { \ + data[order-l-1] = div*((dr+l)*data[order-l-2]+ \ + (order-l-dr)*data[order-l-1]); \ + } \ + data[0] = div*(1 - dr)*data[0]; \ + \ + for (k = 0; k < order; k++) \ + { \ + theta[j][i*order+k] = data[k]; \ + dtheta[j][i*order+k] = ddata[k]; \ + } \ + } \ + } + +void make_bsplines(splinevec theta, splinevec dtheta, int order, + rvec fractx[], int nr, int ind[], real charge[], + gmx_bool bFreeEnergy) +{ + /* construct splines for local atoms */ + int i, ii; + real *xptr; + + for (i = 0; i < nr; i++) + { + /* With free energy we do not use the charge check. + * In most cases this will be more efficient than calling make_bsplines + * twice, since usually more than half the particles have charges. + */ + ii = ind[i]; + if (bFreeEnergy || charge[ii] != 0.0) + { + xptr = fractx[ii]; + switch (order) + { + case 4: CALC_SPLINE(4); break; + case 5: CALC_SPLINE(5); break; + default: CALC_SPLINE(order); break; + } + } + } +} + + +void make_dft_mod(real *mod, real *data, int ndata) +{ + int i, j; + real sc, ss, arg; + + for (i = 0; i < ndata; i++) + { + sc = ss = 0; + for (j = 0; j < ndata; j++) + { + arg = (2.0*M_PI*i*j)/ndata; + sc += data[j]*cos(arg); + ss += data[j]*sin(arg); + } + mod[i] = sc*sc+ss*ss; + } + for (i = 0; i < ndata; i++) + { + if (mod[i] < 1e-7) + { + mod[i] = (mod[i-1]+mod[i+1])*0.5; + } + } +} + + +static void make_bspline_moduli(splinevec bsp_mod, + int nx, int ny, int nz, int order) +{ + int nmax = max(nx, max(ny, nz)); + real *data, *ddata, *bsp_data; + int i, k, l; + real div; + + snew(data, order); + snew(ddata, order); + snew(bsp_data, nmax); + + data[order-1] = 0; + data[1] = 0; + data[0] = 1; + + for (k = 3; k < order; k++) + { + div = 1.0/(k-1.0); + data[k-1] = 0; + for (l = 1; l < (k-1); l++) + { + data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]); + } + data[0] = div*data[0]; + } + /* differentiate */ + ddata[0] = -data[0]; + for (k = 1; k < order; k++) + { + ddata[k] = data[k-1]-data[k]; + } + div = 1.0/(order-1); + data[order-1] = 0; + for (l = 1; l < (order-1); l++) + { + data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]); + } + data[0] = div*data[0]; + + for (i = 0; i < nmax; i++) + { + bsp_data[i] = 0; + } + for (i = 1; i <= order; i++) + { + bsp_data[i] = data[i-1]; + } + + make_dft_mod(bsp_mod[XX], bsp_data, nx); + make_dft_mod(bsp_mod[YY], bsp_data, ny); + make_dft_mod(bsp_mod[ZZ], bsp_data, nz); + + sfree(data); + sfree(ddata); + sfree(bsp_data); +} + + +/* Return the P3M optimal influence function */ +static double do_p3m_influence(double z, int order) +{ + double z2, z4; + + z2 = z*z; + z4 = z2*z2; + + /* The formula and most constants can be found in: + * Ballenegger et al., JCTC 8, 936 (2012) + */ + switch (order) + { + case 2: + 
return 1.0 - 2.0*z2/3.0; + break; + case 3: + return 1.0 - z2 + 2.0*z4/15.0; + break; + case 4: + return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0; + break; + case 5: + return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0; + break; + case 6: + return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0; + break; + case 7: + return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0; + case 8: + return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0; + break; + } + + return 0.0; +} + +/* Calculate the P3M B-spline moduli for one dimension */ +static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order) +{ + double zarg, zai, sinzai, infl; + int maxk, i; + + if (order > 8) + { + gmx_fatal(FARGS, "The current P3M code only supports orders up to 8"); + } + + zarg = M_PI/n; + + maxk = (n + 1)/2; + + for (i = -maxk; i < 0; i++) + { + zai = zarg*i; + sinzai = sin(zai); + infl = do_p3m_influence(sinzai, order); + bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order); + } + bsp_mod[0] = 1.0; + for (i = 1; i < maxk; i++) + { + zai = zarg*i; + sinzai = sin(zai); + infl = do_p3m_influence(sinzai, order); + bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order); + } +} + +/* Calculate the P3M B-spline moduli */ +static void make_p3m_bspline_moduli(splinevec bsp_mod, + int nx, int ny, int nz, int order) +{ + make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order); + make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order); + make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order); +} + + +static void setup_coordinate_communication(pme_atomcomm_t *atc) +{ + int nslab, n, i; + int fw, bw; + + nslab = atc->nslab; + + n = 0; + for (i = 1; i <= nslab/2; i++) + { + fw = (atc->nodeid + i) % nslab; + bw = (atc->nodeid - i + nslab) % nslab; + if (n < nslab - 1) + { + atc->node_dest[n] = fw; + atc->node_src[n] = bw; + n++; + } + if (n < nslab - 1) + { + atc->node_dest[n] = bw; + atc->node_src[n] = fw; + n++; + } + } +} + +int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata) +{ + int thread; + + if (NULL != log) + { + fprintf(log, "Destroying PME data structures.\n"); + } + + sfree((*pmedata)->nnx); + sfree((*pmedata)->nny); + sfree((*pmedata)->nnz); + + pmegrids_destroy(&(*pmedata)->pmegridA); + + sfree((*pmedata)->fftgridA); + sfree((*pmedata)->cfftgridA); + gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA); + + if ((*pmedata)->pmegridB.grid.grid != NULL) + { + pmegrids_destroy(&(*pmedata)->pmegridB); + sfree((*pmedata)->fftgridB); + sfree((*pmedata)->cfftgridB); + gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB); + } + for (thread = 0; thread < (*pmedata)->nthread; thread++) + { + free_work(&(*pmedata)->work[thread]); + } + sfree((*pmedata)->work); + + sfree(*pmedata); + *pmedata = NULL; + + return 0; +} + +static int mult_up(int n, int f) +{ + return ((n + f - 1)/f)*f; +} + + +static double pme_load_imbalance(gmx_pme_t pme) +{ + int nma, nmi; + double n1, n2, n3; + + nma = pme->nnodes_major; + nmi = pme->nnodes_minor; + + n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz; + n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky; + n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx; + + /* pme_solve is roughly double the cost of an fft */ + + return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz); +} + +static void init_atomcomm(gmx_pme_t pme, 
pme_atomcomm_t *atc, + int dimind, gmx_bool bSpread) +{ + int nk, k, s, thread; + + atc->dimind = dimind; + atc->nslab = 1; + atc->nodeid = 0; + atc->pd_nalloc = 0; +#ifdef GMX_MPI + if (pme->nnodes > 1) + { + atc->mpi_comm = pme->mpi_comm_d[dimind]; + MPI_Comm_size(atc->mpi_comm, &atc->nslab); + MPI_Comm_rank(atc->mpi_comm, &atc->nodeid); + } + if (debug) + { + fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid); + } +#endif + + atc->bSpread = bSpread; + atc->pme_order = pme->pme_order; + + if (atc->nslab > 1) + { + /* These three allocations are not required for particle decomp. */ + snew(atc->node_dest, atc->nslab); + snew(atc->node_src, atc->nslab); + setup_coordinate_communication(atc); + + snew(atc->count_thread, pme->nthread); + for (thread = 0; thread < pme->nthread; thread++) + { + snew(atc->count_thread[thread], atc->nslab); + } + atc->count = atc->count_thread[0]; + snew(atc->rcount, atc->nslab); + snew(atc->buf_index, atc->nslab); + } + + atc->nthread = pme->nthread; + if (atc->nthread > 1) + { + snew(atc->thread_plist, atc->nthread); + } + snew(atc->spline, atc->nthread); + for (thread = 0; thread < atc->nthread; thread++) + { + if (atc->nthread > 1) + { + snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP); + atc->thread_plist[thread].n += GMX_CACHE_SEP; + } + snew(atc->spline[thread].thread_one, pme->nthread); + atc->spline[thread].thread_one[thread] = 1; + } +} + +static void +init_overlap_comm(pme_overlap_t * ol, + int norder, +#ifdef GMX_MPI + MPI_Comm comm, +#endif + int nnodes, + int nodeid, + int ndata, + int commplainsize) +{ + int lbnd, rbnd, maxlr, b, i; + int exten; + int nn, nk; + pme_grid_comm_t *pgc; + gmx_bool bCont; + int fft_start, fft_end, send_index1, recv_index1; +#ifdef GMX_MPI + MPI_Status stat; + + ol->mpi_comm = comm; +#endif + + ol->nnodes = nnodes; + ol->nodeid = nodeid; + + /* Linear translation of the PME grid won't affect reciprocal space + * calculations, so to optimize we only interpolate "upwards", + * which also means we only have to consider overlap in one direction. + * I.e., particles on this node might also be spread to grid indices + * that belong to higher nodes (modulo nnodes) + */ + + snew(ol->s2g0, ol->nnodes+1); + snew(ol->s2g1, ol->nnodes); + if (debug) + { + fprintf(debug, "PME slab boundaries:"); + } + for (i = 0; i < nnodes; i++) + { + /* s2g0 the local interpolation grid start. + * s2g1 the local interpolation grid end. + * Because grid overlap communication only goes forward, + * the grid the slabs for fft's should be rounded down. 
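+ *
+ * Concretely, for slab i of nnodes slabs over ndata grid lines and a
+ * spline of order norder (the two assignments just below):
+ *
+ *   s2g0[i] = ( i   *ndata             )/nnodes;
+ *   s2g1[i] = ((i+1)*ndata + nnodes-1  )/nnodes + norder - 1;
+ *
+ * The start is rounded down, the end is rounded up and extended by the
+ * norder-1 lines that spreading can reach beyond the slab, so the s2g
+ * range of a slab overlaps with that of the following slab(s).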
+ */ + ol->s2g0[i] = ( i *ndata + 0 )/nnodes; + ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1; + + if (debug) + { + fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]); + } + } + ol->s2g0[nnodes] = ndata; + if (debug) + { + fprintf(debug, "\n"); + } + + /* Determine with how many nodes we need to communicate the grid overlap */ + b = 0; + do + { + b++; + bCont = FALSE; + for (i = 0; i < nnodes; i++) + { + if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) || + (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata)) + { + bCont = TRUE; + } + } + } + while (bCont && b < nnodes); + ol->noverlap_nodes = b - 1; + + snew(ol->send_id, ol->noverlap_nodes); + snew(ol->recv_id, ol->noverlap_nodes); + for (b = 0; b < ol->noverlap_nodes; b++) + { + ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes; + ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes; + } + snew(ol->comm_data, ol->noverlap_nodes); + + ol->send_size = 0; + for (b = 0; b < ol->noverlap_nodes; b++) + { + pgc = &ol->comm_data[b]; + /* Send */ + fft_start = ol->s2g0[ol->send_id[b]]; + fft_end = ol->s2g0[ol->send_id[b]+1]; + if (ol->send_id[b] < nodeid) + { + fft_start += ndata; + fft_end += ndata; + } + send_index1 = ol->s2g1[nodeid]; + send_index1 = min(send_index1, fft_end); + pgc->send_index0 = fft_start; + pgc->send_nindex = max(0, send_index1 - pgc->send_index0); + ol->send_size += pgc->send_nindex; + + /* We always start receiving to the first index of our slab */ + fft_start = ol->s2g0[ol->nodeid]; + fft_end = ol->s2g0[ol->nodeid+1]; + recv_index1 = ol->s2g1[ol->recv_id[b]]; + if (ol->recv_id[b] > nodeid) + { + recv_index1 -= ndata; + } + recv_index1 = min(recv_index1, fft_end); + pgc->recv_index0 = fft_start; + pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0); + } + +#ifdef GMX_MPI + /* Communicate the buffer sizes to receive */ + for (b = 0; b < ol->noverlap_nodes; b++) + { + MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b, + &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b, + ol->mpi_comm, &stat); + } +#endif + + /* For non-divisible grid we need pme_order iso pme_order-1 */ + snew(ol->sendbuf, norder*commplainsize); + snew(ol->recvbuf, norder*commplainsize); +} + +static void +make_gridindex5_to_localindex(int n, int local_start, int local_range, + int **global_to_local, + real **fraction_shift) +{ + int i; + int * gtl; + real * fsh; + + snew(gtl, 5*n); + snew(fsh, 5*n); + for (i = 0; (i < 5*n); i++) + { + /* Determine the global to local grid index */ + gtl[i] = (i - local_start + n) % n; + /* For coordinates that fall within the local grid the fraction + * is correct, we don't need to shift it. + */ + fsh[i] = 0; + if (local_range < n) + { + /* Due to rounding issues i could be 1 beyond the lower or + * upper boundary of the local grid. Correct the index for this. + * If we shift the index, we need to shift the fraction by + * the same amount in the other direction to not affect + * the weights. + * Note that due to this shifting the weights at the end of + * the spline might change, but that will only involve values + * between zero and values close to the precision of a real, + * which is anyhow the accuracy of the whole mesh calculation. 
+ */ + /* With local_range=0 we should not change i=local_start */ + if (i % n != local_start) + { + if (gtl[i] == n-1) + { + gtl[i] = 0; + fsh[i] = -1; + } + else if (gtl[i] == local_range) + { + gtl[i] = local_range - 1; + fsh[i] = 1; + } + } + } + } + + *global_to_local = gtl; + *fraction_shift = fsh; +} + +static pme_spline_work_t *make_pme_spline_work(int order) +{ + pme_spline_work_t *work; + +#ifdef PME_SSE_SPREAD_GATHER + float tmp[8]; + __m128 zero_SSE; + int of, i; + + snew_aligned(work, 1, 16); + + zero_SSE = _mm_setzero_ps(); + + /* Generate bit masks to mask out the unused grid entries, + * as we only operate on order of the 8 grid entries that are + * load into 2 SSE float registers. + */ + for (of = 0; of < 8-(order-1); of++) + { + for (i = 0; i < 8; i++) + { + tmp[i] = (i >= of && i < of+order ? 1 : 0); + } + work->mask_SSE0[of] = _mm_loadu_ps(tmp); + work->mask_SSE1[of] = _mm_loadu_ps(tmp+4); + work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE); + work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE); + } +#else + work = NULL; +#endif + + return work; +} + +void gmx_pme_check_restrictions(int pme_order, + int nkx, int nky, int nkz, + int nnodes_major, + int nnodes_minor, + gmx_bool bUseThreads, + gmx_bool bFatal, + gmx_bool *bValidSettings) +{ + if (pme_order > PME_ORDER_MAX) + { + if (!bFatal) + { + *bValidSettings = FALSE; + return; + } + gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.", + pme_order, PME_ORDER_MAX); + } + + if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) || + nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) || + nkz <= pme_order) + { + if (!bFatal) + { + *bValidSettings = FALSE; + return; + } + gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order", + pme_order); + } + + /* Check for a limitation of the (current) sum_fftgrid_dd code. + * We only allow multiple communication pulses in dim 1, not in dim 0. + */ + if (bUseThreads && (nkx < nnodes_major*pme_order && + nkx != nnodes_major*(pme_order - 1))) + { + if (!bFatal) + { + *bValidSettings = FALSE; + return; + } + gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. 
To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.", + nkx/(double)nnodes_major, pme_order); + } + + if (bValidSettings != NULL) + { + *bValidSettings = TRUE; + } + + return; +} + +int gmx_pme_init(gmx_pme_t * pmedata, + t_commrec * cr, + int nnodes_major, + int nnodes_minor, + t_inputrec * ir, + int homenr, + gmx_bool bFreeEnergy, + gmx_bool bReproducible, + int nthread) +{ + gmx_pme_t pme = NULL; + + int use_threads, sum_use_threads; + ivec ndata; + + if (debug) + { + fprintf(debug, "Creating PME data structures.\n"); + } + snew(pme, 1); + + pme->redist_init = FALSE; + pme->sum_qgrid_tmp = NULL; + pme->sum_qgrid_dd_tmp = NULL; + pme->buf_nalloc = 0; + pme->redist_buf_nalloc = 0; + + pme->nnodes = 1; + pme->bPPnode = TRUE; + + pme->nnodes_major = nnodes_major; + pme->nnodes_minor = nnodes_minor; + +#ifdef GMX_MPI + if (nnodes_major*nnodes_minor > 1) + { + pme->mpi_comm = cr->mpi_comm_mygroup; + + MPI_Comm_rank(pme->mpi_comm, &pme->nodeid); + MPI_Comm_size(pme->mpi_comm, &pme->nnodes); + if (pme->nnodes != nnodes_major*nnodes_minor) + { + gmx_incons("PME node count mismatch"); + } + } + else + { + pme->mpi_comm = MPI_COMM_NULL; + } +#endif + + if (pme->nnodes == 1) + { +#ifdef GMX_MPI + pme->mpi_comm_d[0] = MPI_COMM_NULL; + pme->mpi_comm_d[1] = MPI_COMM_NULL; +#endif + pme->ndecompdim = 0; + pme->nodeid_major = 0; + pme->nodeid_minor = 0; +#ifdef GMX_MPI + pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL; +#endif + } + else + { + if (nnodes_minor == 1) + { +#ifdef GMX_MPI + pme->mpi_comm_d[0] = pme->mpi_comm; + pme->mpi_comm_d[1] = MPI_COMM_NULL; +#endif + pme->ndecompdim = 1; + pme->nodeid_major = pme->nodeid; + pme->nodeid_minor = 0; + + } + else if (nnodes_major == 1) + { +#ifdef GMX_MPI + pme->mpi_comm_d[0] = MPI_COMM_NULL; + pme->mpi_comm_d[1] = pme->mpi_comm; +#endif + pme->ndecompdim = 1; + pme->nodeid_major = 0; + pme->nodeid_minor = pme->nodeid; + } + else + { + if (pme->nnodes % nnodes_major != 0) + { + gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension"); + } + pme->ndecompdim = 2; + +#ifdef GMX_MPI + MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor, + pme->nodeid, &pme->mpi_comm_d[0]); /* My communicator along major dimension */ + MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor, + pme->nodeid, &pme->mpi_comm_d[1]); /* My communicator along minor dimension */ + + MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major); + MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major); + MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor); + MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor); +#endif + } + pme->bPPnode = (cr->duty & DUTY_PP); + } + + pme->nthread = nthread; + + /* Check if any of the PME MPI ranks uses threads */ + use_threads = (pme->nthread > 1 ? 
1 : 0); +#ifdef GMX_MPI + if (pme->nnodes > 1) + { + MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT, + MPI_SUM, pme->mpi_comm); + } + else +#endif + { + sum_use_threads = use_threads; + } + pme->bUseThreads = (sum_use_threads > 0); + + if (ir->ePBC == epbcSCREW) + { + gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw"); + } + + pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy); + pme->nkx = ir->nkx; + pme->nky = ir->nky; + pme->nkz = ir->nkz; + pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL); + pme->pme_order = ir->pme_order; + pme->epsilon_r = ir->epsilon_r; + + /* If we violate restrictions, generate a fatal error here */ + gmx_pme_check_restrictions(pme->pme_order, + pme->nkx, pme->nky, pme->nkz, + pme->nnodes_major, + pme->nnodes_minor, + pme->bUseThreads, + TRUE, + NULL); + + if (pme->nnodes > 1) + { + double imbal; + +#ifdef GMX_MPI + MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi)); + MPI_Type_commit(&(pme->rvec_mpi)); +#endif + + /* Note that the charge spreading and force gathering, which usually + * takes about the same amount of time as FFT+solve_pme, + * is always fully load balanced + * (unless the charge distribution is inhomogeneous). + */ + + imbal = pme_load_imbalance(pme); + if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0) + { + fprintf(stderr, + "\n" + "NOTE: The load imbalance in PME FFT and solve is %d%%.\n" + " For optimal PME load balancing\n" + " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n" + " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n" + "\n", + (int)((imbal-1)*100 + 0.5), + pme->nkx, pme->nky, pme->nnodes_major, + pme->nky, pme->nkz, pme->nnodes_minor); + } + } + + /* For non-divisible grid we need pme_order iso pme_order-1 */ + /* In sum_qgrid_dd x overlap is copied in place: take padding into account. + * y is always copied through a buffer: we don't need padding in z, + * but we do need the overlap in x because of the communication order. + */ + init_overlap_comm(&pme->overlap[0], pme->pme_order, +#ifdef GMX_MPI + pme->mpi_comm_d[0], +#endif + pme->nnodes_major, pme->nodeid_major, + pme->nkx, + (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1)); + + /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd. + * We do this with an offset buffer of equal size, so we need to allocate + * extra for the offset. That's what the (+1)*pme->nkz is for. + */ + init_overlap_comm(&pme->overlap[1], pme->pme_order, +#ifdef GMX_MPI + pme->mpi_comm_d[1], +#endif + pme->nnodes_minor, pme->nodeid_minor, + pme->nky, + (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz); + + /* Double-check for a limitation of the (current) sum_fftgrid_dd code. + * Note that gmx_pme_check_restrictions checked for this already. + */ + if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1) + { + gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads"); + } + + snew(pme->bsp_mod[XX], pme->nkx); + snew(pme->bsp_mod[YY], pme->nky); + snew(pme->bsp_mod[ZZ], pme->nkz); + + /* The required size of the interpolation grid, including overlap. + * The allocated size (pmegrid_n?) might be slightly larger. 
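The gmx_pme_check_restrictions() call above (with bFatal = TRUE) turns the grid restrictions defined earlier in this file into hard errors. Written out for one hypothetical decomposition (every value here is invented for illustration), the conditions it requires amount to:

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical settings, not taken from the patch */
        int pme_order = 4, nkx = 20, nky = 20, nkz = 20;
        int nnodes_major = 4, nnodes_minor = 2;
        int bUseThreads = 1;

        int ok =
            nkx > pme_order*(nnodes_major > 1 ? 2 : 1) &&   /* 20 >  8, x is decomposed  */
            nky > pme_order*(nnodes_minor > 1 ? 2 : 1) &&   /* 20 >  8, y is decomposed  */
            nkz > pme_order &&                              /* 20 >  4                   */
            (!bUseThreads ||
             nkx >= nnodes_major*pme_order ||               /* 20 >= 16                  */
             nkx == nnodes_major*(pme_order - 1));          /* single x pulse w/ threads */

        printf("grid acceptable: %s\n", ok ? "yes" : "no"); /* prints: yes */
        return 0;
    }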
+ */ + pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] - + pme->overlap[0].s2g0[pme->nodeid_major]; + pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] - + pme->overlap[1].s2g0[pme->nodeid_minor]; + pme->pmegrid_nz_base = pme->nkz; + pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1; + set_grid_alignment(&pme->pmegrid_nz, pme->pme_order); + + pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major]; + pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor]; + pme->pmegrid_start_iz = 0; + + make_gridindex5_to_localindex(pme->nkx, + pme->pmegrid_start_ix, + pme->pmegrid_nx - (pme->pme_order-1), + &pme->nnx, &pme->fshx); + make_gridindex5_to_localindex(pme->nky, + pme->pmegrid_start_iy, + pme->pmegrid_ny - (pme->pme_order-1), + &pme->nny, &pme->fshy); + make_gridindex5_to_localindex(pme->nkz, + pme->pmegrid_start_iz, + pme->pmegrid_nz_base, + &pme->nnz, &pme->fshz); + + pmegrids_init(&pme->pmegridA, + pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz, + pme->pmegrid_nz_base, + pme->pme_order, + pme->bUseThreads, + pme->nthread, + pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1], + pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]); + + pme->spline_work = make_pme_spline_work(pme->pme_order); + + ndata[0] = pme->nkx; + ndata[1] = pme->nky; + ndata[2] = pme->nkz; + + /* This routine will allocate the grid data to fit the FFTs */ + gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata, + &pme->fftgridA, &pme->cfftgridA, + pme->mpi_comm_d, + bReproducible, pme->nthread); + + if (bFreeEnergy) + { + pmegrids_init(&pme->pmegridB, + pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz, + pme->pmegrid_nz_base, + pme->pme_order, + pme->bUseThreads, + pme->nthread, + pme->nkx % pme->nnodes_major != 0, + pme->nky % pme->nnodes_minor != 0); + + gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata, + &pme->fftgridB, &pme->cfftgridB, + pme->mpi_comm_d, + bReproducible, pme->nthread); + } + else + { + pme->pmegridB.grid.grid = NULL; + pme->fftgridB = NULL; + pme->cfftgridB = NULL; + } + + if (!pme->bP3M) + { + /* Use plain SPME B-spline interpolation */ + make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order); + } + else + { + /* Use the P3M grid-optimized influence function */ + make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order); + } + + /* Use atc[0] for spreading */ + init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 
0 : 1, TRUE); + if (pme->ndecompdim >= 2) + { + init_atomcomm(pme, &pme->atc[1], 1, FALSE); + } + + if (pme->nnodes == 1) + { + pme->atc[0].n = homenr; + pme_realloc_atomcomm_things(&pme->atc[0]); + } + + { + int thread; + + /* Use fft5d, order after FFT is y major, z, x minor */ + + snew(pme->work, pme->nthread); + for (thread = 0; thread < pme->nthread; thread++) + { + realloc_work(&pme->work[thread], pme->nkx); + } + } + + *pmedata = pme; + + return 0; +} + +static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new) +{ + int d, t; + + for (d = 0; d < DIM; d++) + { + if (new->grid.n[d] > old->grid.n[d]) + { + return; + } + } + + sfree_aligned(new->grid.grid); + new->grid.grid = old->grid.grid; + + if (new->grid_th != NULL && new->nthread == old->nthread) + { + sfree_aligned(new->grid_all); + for (t = 0; t < new->nthread; t++) + { + new->grid_th[t].grid = old->grid_th[t].grid; + } + } +} + +int gmx_pme_reinit(gmx_pme_t * pmedata, + t_commrec * cr, + gmx_pme_t pme_src, + const t_inputrec * ir, + ivec grid_size) +{ + t_inputrec irc; + int homenr; + int ret; + + irc = *ir; + irc.nkx = grid_size[XX]; + irc.nky = grid_size[YY]; + irc.nkz = grid_size[ZZ]; + + if (pme_src->nnodes == 1) + { + homenr = pme_src->atc[0].n; + } + else + { + homenr = -1; + } + + ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor, + &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread); + + if (ret == 0) + { + /* We can easily reuse the allocated pme grids in pme_src */ + reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA); + /* We would like to reuse the fft grids, but that's harder */ + } + + return ret; +} + + +static void copy_local_grid(gmx_pme_t pme, + pmegrids_t *pmegrids, int thread, real *fftgrid) +{ + ivec local_fft_ndata, local_fft_offset, local_fft_size; + int fft_my, fft_mz; + int nsx, nsy, nsz; + ivec nf; + int offx, offy, offz, x, y, z, i0, i0t; + int d; + pmegrid_t *pmegrid; + real *grid_th; + + gmx_parallel_3dfft_real_limits(pme->pfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + fft_my = local_fft_size[YY]; + fft_mz = local_fft_size[ZZ]; + + pmegrid = &pmegrids->grid_th[thread]; + + nsx = pmegrid->s[XX]; + nsy = pmegrid->s[YY]; + nsz = pmegrid->s[ZZ]; + + for (d = 0; d < DIM; d++) + { + nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1), + local_fft_ndata[d] - pmegrid->offset[d]); + } + + offx = pmegrid->offset[XX]; + offy = pmegrid->offset[YY]; + offz = pmegrid->offset[ZZ]; + + /* Directly copy the non-overlapping parts of the local grids. + * This also initializes the full grid. 
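The copy loop that follows flattens an (x, y, z) triple into a single index, using the padded FFT row sizes fft_my and fft_mz on one side and the thread-grid pitches nsy and nsz on the other. This is ordinary row-major layout; a minimal stand-alone check of that arithmetic, with invented array sizes:

    #include <assert.h>

    #define NX 3
    #define MY 4
    #define MZ 5

    int main(void)
    {
        static float g3[NX][MY][MZ];
        float       *g1 = &g3[0][0][0];
        int          ix, iy, iz;

        for (ix = 0; ix < NX; ix++)
        {
            for (iy = 0; iy < MY; iy++)
            {
                for (iz = 0; iz < MZ; iz++)
                {
                    /* element (ix,iy,iz) lives at (ix*MY + iy)*MZ + iz in the flat array */
                    assert(&g3[ix][iy][iz] == g1 + (ix*MY + iy)*MZ + iz);
                }
            }
        }
        return 0;
    }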
+ */ + grid_th = pmegrid->grid; + for (x = 0; x < nf[XX]; x++) + { + for (y = 0; y < nf[YY]; y++) + { + i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz; + i0t = (x*nsy + y)*nsz; + for (z = 0; z < nf[ZZ]; z++) + { + fftgrid[i0+z] = grid_th[i0t+z]; + } + } + } +} + +static void +reduce_threadgrid_overlap(gmx_pme_t pme, + const pmegrids_t *pmegrids, int thread, + real *fftgrid, real *commbuf_x, real *commbuf_y) +{ + ivec local_fft_ndata, local_fft_offset, local_fft_size; + int fft_nx, fft_ny, fft_nz; + int fft_my, fft_mz; + int buf_my = -1; + int nsx, nsy, nsz; + ivec ne; + int offx, offy, offz, x, y, z, i0, i0t; + int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz; + gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf; + gmx_bool bCommX, bCommY; + int d; + int thread_f; + const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f; + const real *grid_th; + real *commbuf = NULL; + + gmx_parallel_3dfft_real_limits(pme->pfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + fft_nx = local_fft_ndata[XX]; + fft_ny = local_fft_ndata[YY]; + fft_nz = local_fft_ndata[ZZ]; + + fft_my = local_fft_size[YY]; + fft_mz = local_fft_size[ZZ]; + + /* This routine is called when all thread have finished spreading. + * Here each thread sums grid contributions calculated by other threads + * to the thread local grid volume. + * To minimize the number of grid copying operations, + * this routines sums immediately from the pmegrid to the fftgrid. + */ + + /* Determine which part of the full node grid we should operate on, + * this is our thread local part of the full grid. + */ + pmegrid = &pmegrids->grid_th[thread]; + + for (d = 0; d < DIM; d++) + { + ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1), + local_fft_ndata[d]); + } + + offx = pmegrid->offset[XX]; + offy = pmegrid->offset[YY]; + offz = pmegrid->offset[ZZ]; + + + bClearBufX = TRUE; + bClearBufY = TRUE; + bClearBufXY = TRUE; + + /* Now loop over all the thread data blocks that contribute + * to the grid region we (our thread) are operating on. + */ + /* Note that ffy_nx/y is equal to the number of grid points + * between the first point of our node grid and the one of the next node. + */ + for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--) + { + fx = pmegrid->ci[XX] + sx; + ox = 0; + bCommX = FALSE; + if (fx < 0) + { + fx += pmegrids->nc[XX]; + ox -= fft_nx; + bCommX = (pme->nnodes_major > 1); + } + pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]]; + ox += pmegrid_g->offset[XX]; + if (!bCommX) + { + tx1 = min(ox + pmegrid_g->n[XX], ne[XX]); + } + else + { + tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order); + } + + for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--) + { + fy = pmegrid->ci[YY] + sy; + oy = 0; + bCommY = FALSE; + if (fy < 0) + { + fy += pmegrids->nc[YY]; + oy -= fft_ny; + bCommY = (pme->nnodes_minor > 1); + } + pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]]; + oy += pmegrid_g->offset[YY]; + if (!bCommY) + { + ty1 = min(oy + pmegrid_g->n[YY], ne[YY]); + } + else + { + ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order); + } + + for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--) + { + fz = pmegrid->ci[ZZ] + sz; + oz = 0; + if (fz < 0) + { + fz += pmegrids->nc[ZZ]; + oz -= fft_nz; + } + pmegrid_g = &pmegrids->grid_th[fz]; + oz += pmegrid_g->offset[ZZ]; + tz1 = min(oz + pmegrid_g->n[ZZ], ne[ZZ]); + + if (sx == 0 && sy == 0 && sz == 0) + { + /* We have already added our local contribution + * before calling this routine, so skip it here. 
+ */ + continue; + } + + thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz; + + pmegrid_f = &pmegrids->grid_th[thread_f]; + + grid_th = pmegrid_f->grid; + + nsx = pmegrid_f->s[XX]; + nsy = pmegrid_f->s[YY]; + nsz = pmegrid_f->s[ZZ]; + +#ifdef DEBUG_PME_REDUCE + printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n", + pme->nodeid, thread, thread_f, + pme->pmegrid_start_ix, + pme->pmegrid_start_iy, + pme->pmegrid_start_iz, + sx, sy, sz, + offx-ox, tx1-ox, offx, tx1, + offy-oy, ty1-oy, offy, ty1, + offz-oz, tz1-oz, offz, tz1); +#endif + + if (!(bCommX || bCommY)) + { + /* Copy from the thread local grid to the node grid */ + for (x = offx; x < tx1; x++) + { + for (y = offy; y < ty1; y++) + { + i0 = (x*fft_my + y)*fft_mz; + i0t = ((x - ox)*nsy + (y - oy))*nsz - oz; + for (z = offz; z < tz1; z++) + { + fftgrid[i0+z] += grid_th[i0t+z]; + } + } + } + } + else + { + /* The order of this conditional decides + * where the corner volume gets stored with x+y decomp. + */ + if (bCommY) + { + commbuf = commbuf_y; + buf_my = ty1 - offy; + if (bCommX) + { + /* We index commbuf modulo the local grid size */ + commbuf += buf_my*fft_nx*fft_nz; + + bClearBuf = bClearBufXY; + bClearBufXY = FALSE; + } + else + { + bClearBuf = bClearBufY; + bClearBufY = FALSE; + } + } + else + { + commbuf = commbuf_x; + buf_my = fft_ny; + bClearBuf = bClearBufX; + bClearBufX = FALSE; + } + + /* Copy to the communication buffer */ + for (x = offx; x < tx1; x++) + { + for (y = offy; y < ty1; y++) + { + i0 = (x*buf_my + y)*fft_nz; + i0t = ((x - ox)*nsy + (y - oy))*nsz - oz; + + if (bClearBuf) + { + /* First access of commbuf, initialize it */ + for (z = offz; z < tz1; z++) + { + commbuf[i0+z] = grid_th[i0t+z]; + } + } + else + { + for (z = offz; z < tz1; z++) + { + commbuf[i0+z] += grid_th[i0t+z]; + } + } + } + } + } + } + } + } +} + + +static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid) +{ + ivec local_fft_ndata, local_fft_offset, local_fft_size; + pme_overlap_t *overlap; + int send_index0, send_nindex; + int recv_nindex; +#ifdef GMX_MPI + MPI_Status stat; +#endif + int send_size_y, recv_size_y; + int ipulse, send_id, recv_id, datasize, gridsize, size_yx; + real *sendptr, *recvptr; + int x, y, z, indg, indb; + + /* Note that this routine is only used for forward communication. + * Since the force gathering, unlike the charge spreading, + * can be trivially parallelized over the particles, + * the backwards process is much simpler and can use the "old" + * communication setup. 
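The forward exchange below reuses the pulse partners chosen in init_overlap_comm(): pulse b sends to the rank b+1 positions ahead in the slab ring and receives from the rank b+1 positions behind. For a hypothetical four-rank decomposition with two pulses (numbers invented for illustration):

    #include <stdio.h>

    int main(void)
    {
        int nnodes = 4, nodeid = 1, b;

        for (b = 0; b < 2; b++)
        {
            /* same formulas as the send_id/recv_id setup in init_overlap_comm() */
            int send_id = (nodeid + (b + 1)) % nnodes;
            int recv_id = (nodeid - (b + 1) + nnodes) % nnodes;

            printf("pulse %d: send to %d, receive from %d\n", b, send_id, recv_id);
            /* prints: pulse 0: send to 2, receive from 0
             *         pulse 1: send to 3, receive from 3 */
        }
        return 0;
    }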
+ */ + + gmx_parallel_3dfft_real_limits(pme->pfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + + if (pme->nnodes_minor > 1) + { + /* Major dimension */ + overlap = &pme->overlap[1]; + + if (pme->nnodes_major > 1) + { + size_yx = pme->overlap[0].comm_data[0].send_nindex; + } + else + { + size_yx = 0; + } + datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ]; + + send_size_y = overlap->send_size; + + for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++) + { + send_id = overlap->send_id[ipulse]; + recv_id = overlap->recv_id[ipulse]; + send_index0 = + overlap->comm_data[ipulse].send_index0 - + overlap->comm_data[0].send_index0; + send_nindex = overlap->comm_data[ipulse].send_nindex; + /* We don't use recv_index0, as we always receive starting at 0 */ + recv_nindex = overlap->comm_data[ipulse].recv_nindex; + recv_size_y = overlap->comm_data[ipulse].recv_size; + + sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ]; + recvptr = overlap->recvbuf; + +#ifdef GMX_MPI + MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL, + send_id, ipulse, + recvptr, recv_size_y*datasize, GMX_MPI_REAL, + recv_id, ipulse, + overlap->mpi_comm, &stat); +#endif + + for (x = 0; x < local_fft_ndata[XX]; x++) + { + for (y = 0; y < recv_nindex; y++) + { + indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ]; + indb = (x*recv_size_y + y)*local_fft_ndata[ZZ]; + for (z = 0; z < local_fft_ndata[ZZ]; z++) + { + fftgrid[indg+z] += recvptr[indb+z]; + } + } + } + + if (pme->nnodes_major > 1) + { + /* Copy from the received buffer to the send buffer for dim 0 */ + sendptr = pme->overlap[0].sendbuf; + for (x = 0; x < size_yx; x++) + { + for (y = 0; y < recv_nindex; y++) + { + indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ]; + indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ]; + for (z = 0; z < local_fft_ndata[ZZ]; z++) + { + sendptr[indg+z] += recvptr[indb+z]; + } + } + } + } + } + } + + /* We only support a single pulse here. + * This is not a severe limitation, as this code is only used + * with OpenMP and with OpenMP the (PME) domains can be larger. 
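This single-pulse assumption is the same one enforced by gmx_pme_check_restrictions() and by the gmx_incons() check after init_overlap_comm() above. In round numbers, chosen only for illustration: with pme_order = 4 on 8 PME ranks along x, the grid has to satisfy

    nkx >= nnodes_major*pme_order = 8*4 = 32    (or be exactly 8*(4-1) = 24)

so that, roughly speaking, each rank's forward spreading overlap of pme_order - 1 = 3 grid lines ends inside the neighbouring slab and a single forward pulse per dimension suffices.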
+ */ + if (pme->nnodes_major > 1) + { + /* Major dimension */ + overlap = &pme->overlap[0]; + + datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ]; + gridsize = local_fft_size[YY] *local_fft_size[ZZ]; + + ipulse = 0; + + send_id = overlap->send_id[ipulse]; + recv_id = overlap->recv_id[ipulse]; + send_nindex = overlap->comm_data[ipulse].send_nindex; + /* We don't use recv_index0, as we always receive starting at 0 */ + recv_nindex = overlap->comm_data[ipulse].recv_nindex; + + sendptr = overlap->sendbuf; + recvptr = overlap->recvbuf; + + if (debug != NULL) + { + fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n", + send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]); + } + +#ifdef GMX_MPI + MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL, + send_id, ipulse, + recvptr, recv_nindex*datasize, GMX_MPI_REAL, + recv_id, ipulse, + overlap->mpi_comm, &stat); +#endif + + for (x = 0; x < recv_nindex; x++) + { + for (y = 0; y < local_fft_ndata[YY]; y++) + { + indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ]; + indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ]; + for (z = 0; z < local_fft_ndata[ZZ]; z++) + { + fftgrid[indg+z] += recvptr[indb+z]; + } + } + } + } +} + + +static void spread_on_grid(gmx_pme_t pme, + pme_atomcomm_t *atc, pmegrids_t *grids, + gmx_bool bCalcSplines, gmx_bool bSpread, + real *fftgrid) +{ + int nthread, thread; +#ifdef PME_TIME_THREADS + gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c; + static double cs1 = 0, cs2 = 0, cs3 = 0; + static double cs1a[6] = {0, 0, 0, 0, 0, 0}; + static int cnt = 0; +#endif + + nthread = pme->nthread; + assert(nthread > 0); + +#ifdef PME_TIME_THREADS + c1 = omp_cyc_start(); +#endif + if (bCalcSplines) + { +#pragma omp parallel for num_threads(nthread) schedule(static) + for (thread = 0; thread < nthread; thread++) + { + int start, end; + + start = atc->n* thread /nthread; + end = atc->n*(thread+1)/nthread; + + /* Compute fftgrid index for all atoms, + * with help of some extra variables. + */ + calc_interpolation_idx(pme, atc, start, end, thread); + } + } +#ifdef PME_TIME_THREADS + c1 = omp_cyc_end(c1); + cs1 += (double)c1; +#endif + +#ifdef PME_TIME_THREADS + c2 = omp_cyc_start(); +#endif +#pragma omp parallel for num_threads(nthread) schedule(static) + for (thread = 0; thread < nthread; thread++) + { + splinedata_t *spline; + pmegrid_t *grid = NULL; + + /* make local bsplines */ + if (grids == NULL || !pme->bUseThreads) + { + spline = &atc->spline[0]; + + spline->n = atc->n; + + if (bSpread) + { + grid = &grids->grid; + } + } + else + { + spline = &atc->spline[thread]; + + if (grids->nthread == 1) + { + /* One thread, we operate on all charges */ + spline->n = atc->n; + } + else + { + /* Get the indices our thread should operate on */ + make_thread_local_ind(atc, thread, spline); + } + + grid = &grids->grid_th[thread]; + } + + if (bCalcSplines) + { + make_bsplines(spline->theta, spline->dtheta, pme->pme_order, + atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP); + } + + if (bSpread) + { + /* put local atoms on grid. 
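The interpolation-index loop above hands each OpenMP thread a contiguous block of atoms with start = n*thread/nthread and end = n*(thread+1)/nthread, which spreads any remainder evenly over the threads. A stand-alone illustration with invented sizes:

    #include <stdio.h>

    int main(void)
    {
        int n = 10, nthread = 4, thread;  /* hypothetical atom and thread counts */

        for (thread = 0; thread < nthread; thread++)
        {
            int start = n*thread/nthread;
            int end   = n*(thread + 1)/nthread;

            printf("thread %d: atoms [%d, %d)\n", thread, start, end);
            /* thread 0: [0, 2)   thread 1: [2, 5)
             * thread 2: [5, 7)   thread 3: [7, 10) */
        }
        return 0;
    }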
*/ +#ifdef PME_TIME_SPREAD + ct1a = omp_cyc_start(); +#endif + spread_q_bsplines_thread(grid, atc, spline, pme->spline_work); + + if (pme->bUseThreads) + { + copy_local_grid(pme, grids, thread, fftgrid); + } +#ifdef PME_TIME_SPREAD + ct1a = omp_cyc_end(ct1a); + cs1a[thread] += (double)ct1a; +#endif + } + } +#ifdef PME_TIME_THREADS + c2 = omp_cyc_end(c2); + cs2 += (double)c2; +#endif + + if (bSpread && pme->bUseThreads) + { +#ifdef PME_TIME_THREADS + c3 = omp_cyc_start(); +#endif +#pragma omp parallel for num_threads(grids->nthread) schedule(static) + for (thread = 0; thread < grids->nthread; thread++) + { + reduce_threadgrid_overlap(pme, grids, thread, + fftgrid, + pme->overlap[0].sendbuf, + pme->overlap[1].sendbuf); + } +#ifdef PME_TIME_THREADS + c3 = omp_cyc_end(c3); + cs3 += (double)c3; +#endif + + if (pme->nnodes > 1) + { + /* Communicate the overlapping part of the fftgrid. + * For this communication call we need to check pme->bUseThreads + * to have all ranks communicate here, regardless of pme->nthread. + */ + sum_fftgrid_dd(pme, fftgrid); + } + } + +#ifdef PME_TIME_THREADS + cnt++; + if (cnt % 20 == 0) + { + printf("idx %.2f spread %.2f red %.2f", + cs1*1e-9, cs2*1e-9, cs3*1e-9); +#ifdef PME_TIME_SPREAD + for (thread = 0; thread < nthread; thread++) + { + printf(" %.2f", cs1a[thread]*1e-9); + } +#endif + printf("\n"); + } +#endif +} + + +static void dump_grid(FILE *fp, + int sx, int sy, int sz, int nx, int ny, int nz, + int my, int mz, const real *g) +{ + int x, y, z; + + for (x = 0; x < nx; x++) + { + for (y = 0; y < ny; y++) + { + for (z = 0; z < nz; z++) + { + fprintf(fp, "%2d %2d %2d %6.3f\n", + sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]); + } + } + } +} + +static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid) +{ + ivec local_fft_ndata, local_fft_offset, local_fft_size; + + gmx_parallel_3dfft_real_limits(pme->pfft_setupA, + local_fft_ndata, + local_fft_offset, + local_fft_size); + + dump_grid(stderr, + pme->pmegrid_start_ix, + pme->pmegrid_start_iy, + pme->pmegrid_start_iz, + pme->pmegrid_nx-pme->pme_order+1, + pme->pmegrid_ny-pme->pme_order+1, + pme->pmegrid_nz-pme->pme_order+1, + local_fft_size[YY], + local_fft_size[ZZ], + fftgrid); +} + + +void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V) +{ + pme_atomcomm_t *atc; + pmegrids_t *grid; + + if (pme->nnodes > 1) + { + gmx_incons("gmx_pme_calc_energy called in parallel"); + } + if (pme->bFEP > 1) + { + gmx_incons("gmx_pme_calc_energy with free energy"); + } + + atc = &pme->atc_energy; + atc->nthread = 1; + if (atc->spline == NULL) + { + snew(atc->spline, atc->nthread); + } + atc->nslab = 1; + atc->bSpread = TRUE; + atc->pme_order = pme->pme_order; + atc->n = n; + pme_realloc_atomcomm_things(atc); + atc->x = x; + atc->q = q; + + /* We only use the A-charges grid */ + grid = &pme->pmegridA; + + /* Only calculate the spline coefficients, don't actually spread */ + spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA); + + *V = gather_energy_bsplines(pme, grid->grid.grid, atc); +} + + +static void reset_pmeonly_counters(gmx_wallcycle_t wcycle, ++ gmx_runtime_t *runtime, + t_nrnb *nrnb, t_inputrec *ir, + gmx_large_int_t step) +{ + /* Reset all the counters related to performance over the run */ + wallcycle_stop(wcycle, ewcRUN); + wallcycle_reset_all(wcycle); + init_nrnb(nrnb); + if (ir->nsteps >= 0) + { + /* ir->nsteps is not used here, but we update it for consistency */ + ir->nsteps -= step - ir->init_step; + } + ir->init_step = step; + wallcycle_start(wcycle, ewcRUN); ++ runtime_start(runtime); 
+} + + +static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata, + ivec grid_size, + t_commrec *cr, t_inputrec *ir, + gmx_pme_t *pme_ret) +{ + int ind; + gmx_pme_t pme = NULL; + + ind = 0; + while (ind < *npmedata) + { + pme = (*pmedata)[ind]; + if (pme->nkx == grid_size[XX] && + pme->nky == grid_size[YY] && + pme->nkz == grid_size[ZZ]) + { + *pme_ret = pme; + + return; + } + + ind++; + } + + (*npmedata)++; + srenew(*pmedata, *npmedata); + + /* Generate a new PME data structure, copying part of the old pointers */ + gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size); + + *pme_ret = (*pmedata)[ind]; +} + + +int gmx_pmeonly(gmx_pme_t pme, + t_commrec *cr, t_nrnb *nrnb, + gmx_wallcycle_t wcycle, ++ gmx_runtime_t *runtime, + real ewaldcoeff, + t_inputrec *ir) +{ + int npmedata; + gmx_pme_t *pmedata; + gmx_pme_pp_t pme_pp; + int ret; + int natoms; + matrix box; + rvec *x_pp = NULL, *f_pp = NULL; + real *chargeA = NULL, *chargeB = NULL; + real lambda = 0; + int maxshift_x = 0, maxshift_y = 0; + real energy, dvdlambda; + matrix vir; + float cycles; + int count; + gmx_bool bEnerVir; + gmx_large_int_t step, step_rel; + ivec grid_switch; + + /* This data will only use with PME tuning, i.e. switching PME grids */ + npmedata = 1; + snew(pmedata, npmedata); + pmedata[0] = pme; + + pme_pp = gmx_pme_pp_init(cr); + + init_nrnb(nrnb); + + count = 0; + do /****** this is a quasi-loop over time steps! */ + { + /* The reason for having a loop here is PME grid tuning/switching */ + do + { + /* Domain decomposition */ + ret = gmx_pme_recv_q_x(pme_pp, + &natoms, + &chargeA, &chargeB, box, &x_pp, &f_pp, + &maxshift_x, &maxshift_y, + &pme->bFEP, &lambda, + &bEnerVir, + &step, + grid_switch, &ewaldcoeff); + + if (ret == pmerecvqxSWITCHGRID) + { + /* Switch the PME grid to grid_switch */ + gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme); + } + + if (ret == pmerecvqxRESETCOUNTERS) + { + /* Reset the cycle and flop counters */ - reset_pmeonly_counters(wcycle, nrnb, ir, step); ++ reset_pmeonly_counters(wcycle, runtime, nrnb, ir, step); + } + } + while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS); + + if (ret == pmerecvqxFINISH) + { + /* We should stop: break out of the loop */ + break; + } + + step_rel = step - ir->init_step; + + if (count == 0) + { + wallcycle_start(wcycle, ewcRUN); ++ runtime_start(runtime); + } + + wallcycle_start(wcycle, ewcPMEMESH); + + dvdlambda = 0; + clear_mat(vir); + gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box, + cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff, + &energy, lambda, &dvdlambda, + GMX_PME_DO_ALL_F | (bEnerVir ? 
GMX_PME_CALC_ENER_VIR : 0)); + + cycles = wallcycle_stop(wcycle, ewcPMEMESH); + + gmx_pme_send_force_vir_ener(pme_pp, + f_pp, vir, energy, dvdlambda, + cycles); + + count++; + } /***** end of quasi-loop, we stop with the break above */ + while (TRUE); + ++ runtime_end(runtime); ++ + return 0; +} + +int gmx_pme_do(gmx_pme_t pme, + int start, int homenr, + rvec x[], rvec f[], + real *chargeA, real *chargeB, + matrix box, t_commrec *cr, + int maxshift_x, int maxshift_y, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + matrix vir, real ewaldcoeff, + real *energy, real lambda, + real *dvdlambda, int flags) +{ + int q, d, i, j, ntot, npme; + int nx, ny, nz; + int n_d, local_ny; + pme_atomcomm_t *atc = NULL; + pmegrids_t *pmegrid = NULL; + real *grid = NULL; + real *ptr; + rvec *x_d, *f_d; + real *charge = NULL, *q_d; + real energy_AB[2]; + matrix vir_AB[2]; + gmx_bool bClearF; + gmx_parallel_3dfft_t pfft_setup; + real * fftgrid; + t_complex * cfftgrid; + int thread; + const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR; + const gmx_bool bCalcF = flags & GMX_PME_CALC_F; + + assert(pme->nnodes > 0); + assert(pme->nnodes == 1 || pme->ndecompdim > 0); + + if (pme->nnodes > 1) + { + atc = &pme->atc[0]; + atc->npd = homenr; + if (atc->npd > atc->pd_nalloc) + { + atc->pd_nalloc = over_alloc_dd(atc->npd); + srenew(atc->pd, atc->pd_nalloc); + } + atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y); + } + else + { + /* This could be necessary for TPI */ + pme->atc[0].n = homenr; + } + + for (q = 0; q < (pme->bFEP ? 2 : 1); q++) + { + if (q == 0) + { + pmegrid = &pme->pmegridA; + fftgrid = pme->fftgridA; + cfftgrid = pme->cfftgridA; + pfft_setup = pme->pfft_setupA; + charge = chargeA+start; + } + else + { + pmegrid = &pme->pmegridB; + fftgrid = pme->fftgridB; + cfftgrid = pme->cfftgridB; + pfft_setup = pme->pfft_setupB; + charge = chargeB+start; + } + grid = pmegrid->grid.grid; + /* Unpack structure */ + if (debug) + { + fprintf(debug, "PME: nnodes = %d, nodeid = %d\n", + cr->nnodes, cr->nodeid); + fprintf(debug, "Grid = %p\n", (void*)grid); + if (grid == NULL) + { + gmx_fatal(FARGS, "No grid!"); + } + } + where(); + + m_inv_ur0(box, pme->recipbox); + + if (pme->nnodes == 1) + { + atc = &pme->atc[0]; + if (DOMAINDECOMP(cr)) + { + atc->n = homenr; + pme_realloc_atomcomm_things(atc); + } + atc->x = x; + atc->q = charge; + atc->f = f; + } + else + { + wallcycle_start(wcycle, ewcPME_REDISTXF); + for (d = pme->ndecompdim-1; d >= 0; d--) + { + if (d == pme->ndecompdim-1) + { + n_d = homenr; + x_d = x + start; + q_d = charge; + } + else + { + n_d = pme->atc[d+1].n; + x_d = atc->x; + q_d = atc->q; + } + atc = &pme->atc[d]; + atc->npd = n_d; + if (atc->npd > atc->pd_nalloc) + { + atc->pd_nalloc = over_alloc_dd(atc->npd); + srenew(atc->pd, atc->pd_nalloc); + } + atc->maxshift = (atc->dimind == 0 ? 
maxshift_x : maxshift_y); + pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc); + where(); + + /* Redistribute x (only once) and qA or qB */ + if (DOMAINDECOMP(cr)) + { + dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc); + } + else + { + pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc); + } + } + where(); + + wallcycle_stop(wcycle, ewcPME_REDISTXF); + } + + if (debug) + { + fprintf(debug, "Node= %6d, pme local particles=%6d\n", + cr->nodeid, atc->n); + } + + if (flags & GMX_PME_SPREAD_Q) + { + wallcycle_start(wcycle, ewcPME_SPREADGATHER); + + /* Spread the charges on a grid */ + spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid); + + if (q == 0) + { + inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n); + } + inc_nrnb(nrnb, eNR_SPREADQBSP, + pme->pme_order*pme->pme_order*pme->pme_order*atc->n); + + if (!pme->bUseThreads) + { + wrap_periodic_pmegrid(pme, grid); + + /* sum contributions to local grid from other nodes */ +#ifdef GMX_MPI + if (pme->nnodes > 1) + { + gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD); + where(); + } +#endif + + copy_pmegrid_to_fftgrid(pme, grid, fftgrid); + } + + wallcycle_stop(wcycle, ewcPME_SPREADGATHER); + + /* + dump_local_fftgrid(pme,fftgrid); + exit(0); + */ + } + + /* Here we start a large thread parallel region */ +#pragma omp parallel num_threads(pme->nthread) private(thread) + { + thread = gmx_omp_get_thread_num(); + if (flags & GMX_PME_SOLVE) + { + int loop_count; + + /* do 3d-fft */ + if (thread == 0) + { + wallcycle_start(wcycle, ewcPME_FFT); + } + gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX, + thread, wcycle); + if (thread == 0) + { + wallcycle_stop(wcycle, ewcPME_FFT); + } + where(); + + /* solve in k-space for our local cells */ + if (thread == 0) + { + wallcycle_start(wcycle, ewcPME_SOLVE); + } + loop_count = + solve_pme_yzx(pme, cfftgrid, ewaldcoeff, + box[XX][XX]*box[YY][YY]*box[ZZ][ZZ], + bCalcEnerVir, + pme->nthread, thread); + if (thread == 0) + { + wallcycle_stop(wcycle, ewcPME_SOLVE); + where(); + inc_nrnb(nrnb, eNR_SOLVEPME, loop_count); + } + } + + if (bCalcF) + { + /* do 3d-invfft */ + if (thread == 0) + { + where(); + wallcycle_start(wcycle, ewcPME_FFT); + } + gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL, + thread, wcycle); + if (thread == 0) + { + wallcycle_stop(wcycle, ewcPME_FFT); + + where(); + + if (pme->nodeid == 0) + { + ntot = pme->nkx*pme->nky*pme->nkz; + npme = ntot*log((real)ntot)/log(2.0); + inc_nrnb(nrnb, eNR_FFT, 2*npme); + } + + wallcycle_start(wcycle, ewcPME_SPREADGATHER); + } + + copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread); + } + } + /* End of thread parallel section. + * With MPI we have to synchronize here before gmx_sum_qgrid_dd. + */ + + if (bCalcF) + { + /* distribute local grid to all nodes */ +#ifdef GMX_MPI + if (pme->nnodes > 1) + { + gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD); + } +#endif + where(); + + unwrap_periodic_pmegrid(pme, grid); + + /* interpolate forces for our local atoms */ + + where(); + + /* If we are running without parallelization, + * atc->f is the actual force array, not a buffer, + * therefore we should not clear it. + */ + bClearF = (q == 0 && PAR(cr)); +#pragma omp parallel for num_threads(pme->nthread) schedule(static) + for (thread = 0; thread < pme->nthread; thread++) + { + gather_f_bsplines(pme, grid, bClearF, atc, + &atc->spline[thread], + pme->bFEP ? (q == 0 ? 
1.0-lambda : lambda) : 1.0); + } + + where(); + + inc_nrnb(nrnb, eNR_GATHERFBSP, + pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n); + wallcycle_stop(wcycle, ewcPME_SPREADGATHER); + } + + if (bCalcEnerVir) + { + /* This should only be called on the master thread + * and after the threads have synchronized. + */ + get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]); + } + } /* of q-loop */ + + if (bCalcF && pme->nnodes > 1) + { + wallcycle_start(wcycle, ewcPME_REDISTXF); + for (d = 0; d < pme->ndecompdim; d++) + { + atc = &pme->atc[d]; + if (d == pme->ndecompdim - 1) + { + n_d = homenr; + f_d = f + start; + } + else + { + n_d = pme->atc[d+1].n; + f_d = pme->atc[d+1].f; + } + if (DOMAINDECOMP(cr)) + { + dd_pmeredist_f(pme, atc, n_d, f_d, + d == pme->ndecompdim-1 && pme->bPPnode); + } + else + { + pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc); + } + } + + wallcycle_stop(wcycle, ewcPME_REDISTXF); + } + where(); + + if (bCalcEnerVir) + { + if (!pme->bFEP) + { + *energy = energy_AB[0]; + m_add(vir, vir_AB[0], vir); + } + else + { + *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1]; + *dvdlambda += energy_AB[1] - energy_AB[0]; + for (i = 0; i < DIM; i++) + { + for (j = 0; j < DIM; j++) + { + vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + + lambda*vir_AB[1][i][j]; + } + } + } + } + else + { + *energy = 0; + } + + if (debug) + { + fprintf(debug, "PME mesh energy: %g\n", *energy); + } + + return 0; +} diff --cc src/programs/mdrun/md.c index a5e558431c,0000000000..4c48b7266b mode 100644,000000..100644 --- a/src/programs/mdrun/md.c +++ b/src/programs/mdrun/md.c @@@ -1,2190 -1,0 +1,2190 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. 
+ * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "typedefs.h" +#include "smalloc.h" +#include "sysstuff.h" +#include "vec.h" +#include "statutil.h" +#include "vcm.h" +#include "mdebin.h" +#include "nrnb.h" +#include "calcmu.h" +#include "index.h" +#include "vsite.h" +#include "update.h" +#include "ns.h" +#include "trnio.h" +#include "xtcio.h" +#include "mdrun.h" +#include "md_support.h" +#include "md_logging.h" +#include "confio.h" +#include "network.h" +#include "pull.h" +#include "xvgr.h" +#include "physics.h" +#include "names.h" +#include "xmdrun.h" +#include "ionize.h" +#include "disre.h" +#include "orires.h" +#include "pme.h" +#include "mdatoms.h" +#include "repl_ex.h" +#include "qmmm.h" +#include "domdec.h" +#include "domdec_network.h" +#include "partdec.h" +#include "topsort.h" +#include "coulomb.h" +#include "constr.h" +#include "shellfc.h" +#include "compute_io.h" +#include "mvdata.h" +#include "checkpoint.h" +#include "mtop_util.h" +#include "sighandler.h" +#include "txtdump.h" +#include "string2.h" +#include "pme_loadbal.h" +#include "bondf.h" +#include "membed.h" +#include "types/nlistheuristics.h" +#include "types/iteratedconstraints.h" +#include "nbnxn_cuda_data_mgmt.h" + +#include "gromacs/utility/gmxmpi.h" + +#ifdef GMX_FAHCORE +#include "corewrap.h" +#endif + +static void reset_all_counters(FILE *fplog, t_commrec *cr, + gmx_large_int_t step, + gmx_large_int_t *step_rel, t_inputrec *ir, + gmx_wallcycle_t wcycle, t_nrnb *nrnb, + gmx_runtime_t *runtime, + nbnxn_cuda_ptr_t cu_nbv) +{ + char sbuf[STEPSTRSIZE]; + + /* Reset all the counters related to performance over the run */ + md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n", + gmx_step_str(step, sbuf)); + + if (cu_nbv) + { + nbnxn_cuda_reset_timings(cu_nbv); + } + + wallcycle_stop(wcycle, ewcRUN); + wallcycle_reset_all(wcycle); + if (DOMAINDECOMP(cr)) + { + reset_dd_statistics_counters(cr->dd); + } + init_nrnb(nrnb); + ir->init_step += *step_rel; + ir->nsteps -= *step_rel; + *step_rel = 0; + wallcycle_start(wcycle, ewcRUN); + runtime_start(runtime); + print_date_and_time(fplog, cr->nodeid, "Restarted time", runtime); +} + +double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], + const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact, + int nstglobalcomm, + gmx_vsite_t *vsite, gmx_constr_t constr, + int stepout, t_inputrec *ir, + gmx_mtop_t *top_global, + t_fcdata *fcd, + t_state *state_global, + t_mdatoms *mdatoms, + t_nrnb *nrnb, gmx_wallcycle_t wcycle, + gmx_edsam_t ed, t_forcerec *fr, + int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed, + real cpt_period, real max_hours, + const char gmx_unused *deviceOptions, + unsigned long Flags, + gmx_runtime_t *runtime) +{ + gmx_mdoutf_t *outf; + gmx_large_int_t step, step_rel; + double run_time; + double t, t0, lam0[efptNR]; + gmx_bool bGStatEveryStep, bGStat, bCalcVir, bCalcEner; + gmx_bool bNS, bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE, + bFirstStep, bStateFromCP, bStateFromTPX, bInitStep, bLastStep, + bBornRadii, bStartingFromCpt; + gmx_bool bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE; + gmx_bool do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE, + bForceUpdate = FALSE, bCPT; + int mdof_flags; + gmx_bool bMasterState; + int force_flags, cglo_flags; + tensor force_vir, shake_vir, total_vir, tmp_vir, pres; + int i, m; + t_trxstatus 
*status; + rvec mu_tot; + t_vcm *vcm; + t_state *bufstate = NULL; + matrix *scale_tot, pcoupl_mu, M, ebox; + gmx_nlheur_t nlh; + t_trxframe rerun_fr; + gmx_repl_ex_t repl_ex = NULL; + int nchkpt = 1; + gmx_localtop_t *top; + t_mdebin *mdebin = NULL; + df_history_t df_history; + t_state *state = NULL; + rvec *f_global = NULL; + int n_xtc = -1; + rvec *x_xtc = NULL; + gmx_enerdata_t *enerd; + rvec *f = NULL; + gmx_global_stat_t gstat; + gmx_update_t upd = NULL; + t_graph *graph = NULL; + globsig_t gs; + gmx_rng_t mcrng = NULL; + gmx_groups_t *groups; + gmx_ekindata_t *ekind, *ekind_save; + gmx_shellfc_t shellfc; + int count, nconverged = 0; + real timestep = 0; + double tcount = 0; + gmx_bool bIonize = FALSE; + gmx_bool bTCR = FALSE, bConverged = TRUE, bOK, bSumEkinhOld, bExchanged; + gmx_bool bAppend; + gmx_bool bResetCountersHalfMaxH = FALSE; + gmx_bool bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter; + gmx_bool bUpdateDoLR; + real mu_aver = 0, dvdl_constr; + int a0, a1, gnx = 0, ii; + atom_id *grpindex = NULL; + char *grpname; + t_coupl_rec *tcr = NULL; + rvec *xcopy = NULL, *vcopy = NULL, *cbuf = NULL; + matrix boxcopy = {{0}}, lastbox; + tensor tmpvir; + real fom, oldfom, veta_save, pcurr, scalevir, tracevir; + real vetanew = 0; + int lamnew = 0; + /* for FEP */ + int nstfep; + real rate; + double cycles; + real saved_conserved_quantity = 0; + real last_ekin = 0; + int iter_i; + t_extmass MassQ; + int **trotter_seq; + char sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE]; + int handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/ + gmx_iterate_t iterate; + gmx_large_int_t multisim_nsteps = -1; /* number of steps to do before first multisim + simulation stops. If equal to zero, don't + communicate any more between multisims.*/ + /* PME load balancing data for GPU kernels */ + pme_load_balancing_t pme_loadbal = NULL; + double cycles_pmes; + gmx_bool bPMETuneTry = FALSE, bPMETuneRunning = FALSE; + +#ifdef GMX_FAHCORE + /* Temporary addition for FAHCORE checkpointing */ + int chkpt_ret; +#endif + + /* Check for special mdrun options */ + bRerunMD = (Flags & MD_RERUN); + bIonize = (Flags & MD_IONIZE); + bAppend = (Flags & MD_APPENDFILES); + if (Flags & MD_RESETCOUNTERSHALFWAY) + { + if (ir->nsteps > 0) + { + /* Signal to reset the counters half the simulation steps. */ + wcycle_set_reset_counters(wcycle, ir->nsteps/2); + } + /* Signal to reset the counters halfway the simulation time. */ + bResetCountersHalfMaxH = (max_hours > 0); + } + + /* md-vv uses averaged full step velocities for T-control + md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control) + md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */ + bVV = EI_VV(ir->eI); + if (bVV) /* to store the initial velocities while computing virial */ + { + snew(cbuf, top_global->natoms); + } + /* all the iteratative cases - only if there are constraints */ + bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD)); + gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to + false in this step. The correct value, true or false, + is set at each step, as it depends on the frequency of temperature + and pressure control.*/ + bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir))); + + if (bRerunMD) + { + /* Since we don't know if the frames read are related in any way, + * rebuild the neighborlist at every step. 
+ */ + ir->nstlist = 1; + ir->nstcalcenergy = 1; + nstglobalcomm = 1; + } + + check_ir_old_tpx_versions(cr, fplog, ir, top_global); + + nstglobalcomm = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir); + bGStatEveryStep = (nstglobalcomm == 1); + + if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL) + { + fprintf(fplog, + "To reduce the energy communication with nstlist = -1\n" + "the neighbor list validity should not be checked at every step,\n" + "this means that exact integration is not guaranteed.\n" + "The neighbor list validity is checked after:\n" + " - 2*std.dev.(n.list life time) steps.\n" + "In most cases this will result in exact integration.\n" + "This reduces the energy communication by a factor of 2 to 3.\n" + "If you want less energy communication, set nstlist > 3.\n\n"); + } + + if (bRerunMD) + { + ir->nstxtcout = 0; + } + groups = &top_global->groups; + + /* Initial values */ + init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda, + &(state_global->fep_state), lam0, + nrnb, top_global, &upd, + nfile, fnm, &outf, &mdebin, + force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, Flags); + + clear_mat(total_vir); + clear_mat(pres); + /* Energy terms and groups */ + snew(enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, + enerd); + if (DOMAINDECOMP(cr)) + { + f = NULL; + } + else + { + snew(f, top_global->natoms); + } + + /* lambda Monte carlo random number generator */ + if (ir->bExpanded) + { + mcrng = gmx_rng_init(ir->expandedvals->lmc_seed); + } + /* copy the state into df_history */ + copy_df_history(&df_history, &state_global->dfhist); + + /* Kinetic energy data */ + snew(ekind, 1); + init_ekindata(fplog, top_global, &(ir->opts), ekind); + /* needed for iteration of constraints */ + snew(ekind_save, 1); + init_ekindata(fplog, top_global, &(ir->opts), ekind_save); + /* Copy the cos acceleration to the groups struct */ + ekind->cosacc.cos_accel = ir->cos_accel; + + gstat = global_stat_init(ir); + debug_gmx(); + + /* Check for polarizable models and flexible constraints */ + shellfc = init_shell_flexcon(fplog, + top_global, n_flexible_constraints(constr), + (ir->bContinuation || + (DOMAINDECOMP(cr) && !MASTER(cr))) ? 
+ NULL : state_global->x); + + if (DEFORM(*ir)) + { +#ifdef GMX_THREAD_MPI + tMPI_Thread_mutex_lock(&deform_init_box_mutex); +#endif + set_deform_reference_box(upd, + deform_init_init_step_tpx, + deform_init_box_tpx); +#ifdef GMX_THREAD_MPI + tMPI_Thread_mutex_unlock(&deform_init_box_mutex); +#endif + } + + { + double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1); + if ((io > 2000) && MASTER(cr)) + { + fprintf(stderr, + "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", + io); + } + } + + if (DOMAINDECOMP(cr)) + { + top = dd_init_local_top(top_global); + + snew(state, 1); + dd_init_local_state(cr->dd, state_global, state); + + if (DDMASTER(cr->dd) && ir->nstfout) + { + snew(f_global, state_global->natoms); + } + } + else + { + if (PAR(cr)) + { + /* Initialize the particle decomposition and split the topology */ + top = split_system(fplog, top_global, ir, cr); + + pd_cg_range(cr, &fr->cg0, &fr->hcg); + pd_at_range(cr, &a0, &a1); + } + else + { + top = gmx_mtop_generate_local_top(top_global, ir); + + a0 = 0; + a1 = top_global->natoms; + } + + forcerec_set_excl_load(fr, top, cr); + + state = partdec_init_local_state(cr, state_global); + f_global = f; + + atoms2md(top_global, ir, 0, NULL, a0, a1-a0, mdatoms); + + if (vsite) + { + set_vsite_top(vsite, top, mdatoms, cr); + } + + if (ir->ePBC != epbcNONE && !fr->bMolPBC) + { + graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE); + } + + if (shellfc) + { + make_local_shells(cr, mdatoms, shellfc); + } + - init_bonded_thread_force_reduction(fr, &top->idef); ++ setup_bonded_threading(fr, &top->idef); + + if (ir->pull && PAR(cr)) + { + dd_make_local_pull_groups(NULL, ir->pull, mdatoms); + } + } + + if (DOMAINDECOMP(cr)) + { + /* Distribute the charge groups over the nodes from the master node */ + dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, + state_global, top_global, ir, + state, &f, mdatoms, top, fr, + vsite, shellfc, constr, + nrnb, wcycle, FALSE); + + } + + update_mdatoms(mdatoms, state->lambda[efptMASS]); + + if (opt2bSet("-cpi", nfile, fnm)) + { + bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr); + } + else + { + bStateFromCP = FALSE; + } + + if (MASTER(cr)) + { + if (bStateFromCP) + { + /* Update mdebin with energy history if appending to output files */ + if (Flags & MD_APPENDFILES) + { + restore_energyhistory_from_state(mdebin, &state_global->enerhist); + } + else + { + /* We might have read an energy history from checkpoint, + * free the allocated memory and reset the counts. + */ + done_energyhistory(&state_global->enerhist); + init_energyhistory(&state_global->enerhist); + } + } + /* Set the initial energy history in state by updating once */ + update_energyhistory(&state_global->enerhist, mdebin); + } + + if ((state->flags & (1<flags & (1<mols.nr; + snew(grpindex, gnx); + for (i = 0; (i < gnx); i++) + { + grpindex[i] = i; + } + } + + if (repl_ex_nst > 0) + { + /* We need to be sure replica exchange can only occur + * when the energies are current */ + check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy, + "repl_ex_nst", &repl_ex_nst); + /* This check needs to happen before inter-simulation + * signals are initialized, too */ + } + if (repl_ex_nst > 0 && MASTER(cr)) + { + repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir, + repl_ex_nst, repl_ex_nex, repl_ex_seed); + } + + /* PME tuning is only supported with GPUs or PME nodes and not with rerun. 
+ * With perturbed charges with soft-core we should not change the cut-off. + */ + if ((Flags & MD_TUNEPME) && + EEL_PME(fr->eeltype) && + ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) && + !(ir->efep != efepNO && mdatoms->nChargePerturbed > 0 && ir->fepvals->bScCoul) && + !bRerunMD) + { + pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata); + cycles_pmes = 0; + if (cr->duty & DUTY_PME) + { + /* Start tuning right away, as we can't measure the load */ + bPMETuneRunning = TRUE; + } + else + { + /* Separate PME nodes, we can measure the PP/PME load balance */ + bPMETuneTry = TRUE; + } + } + + if (!ir->bContinuation && !bRerunMD) + { + if (mdatoms->cFREEZE && (state->flags & (1<start; i < mdatoms->start+mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m]) + { + state->v[i][m] = 0; + } + } + } + } + + if (constr) + { + /* Constrain the initial coordinates and velocities */ + do_constrain_first(fplog, constr, ir, mdatoms, state, + cr, nrnb, fr, top); + } + if (vsite) + { + /* Construct the virtual sites for the initial configuration */ + construct_vsites(vsite, state->x, ir->delta_t, NULL, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, graph, cr, state->box); + } + } + + debug_gmx(); + + /* set free energy calculation frequency as the minimum + greatest common denominator of nstdhdl, nstexpanded, and repl_ex_nst*/ + nstfep = ir->fepvals->nstdhdl; + if (ir->bExpanded) + { + nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl, nstfep); + } + if (repl_ex_nst > 0) + { + nstfep = gmx_greatest_common_divisor(repl_ex_nst, nstfep); + } + + /* I'm assuming we need global communication the first time! MRS */ + cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT + | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0) + | (bVV ? CGLO_PRESSURE : 0) + | (bVV ? CGLO_CONSTRAINT : 0) + | (bRerunMD ? CGLO_RERUNMD : 0) + | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0)); + + bSumEkinhOld = FALSE; + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, + NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, NULL, FALSE, state->box, + top_global, &pcurr, &bSumEkinhOld, cglo_flags); + if (ir->eI == eiVVAK) + { + /* a second call to get the half step temperature initialized as well */ + /* we do the same call as above, but turn the pressure off -- internally to + compute_globals, this is recognized as a velocity verlet half-step + kinetic energy calculation. This minimized excess variables, but + perhaps loses some logic?*/ + + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, + NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, NULL, FALSE, state->box, + top_global, &pcurr, &bSumEkinhOld, + cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE)); + } + + /* Calculate the initial half step temperature, and save the ekinh_old */ + if (!(Flags & MD_STARTFROMCPT)) + { + for (i = 0; (i < ir->opts.ngtc); i++) + { + copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old); + } + } + if (ir->eI != eiVV) + { + enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step, + and there is no previous step */ + } + + /* if using an iterative algorithm, we need to create a working directory for the state. 
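A few lines above, nstfep is tightened with gmx_greatest_common_divisor(), assumed here to be the ordinary greatest common divisor as the surrounding comment indicates, so that the free-energy evaluation interval lines up with every interval that needs current energies. With hypothetical settings nstdhdl = 50 and repl_ex_nst = 20 (invented for illustration):

    nstfep = gcd(20, 50) = 10

so the free-energy quantities are recomputed every 10 steps, a divisor of both the dH/dl output interval and the replica-exchange interval.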
*/ + if (bIterativeCase) + { + bufstate = init_bufstate(state); + } + + /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter + temperature control */ + trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter); + + if (MASTER(cr)) + { + if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS) + { + fprintf(fplog, + "RMS relative constraint deviation after constraining: %.2e\n", + constr_rmsd(constr, FALSE)); + } + if (EI_STATE_VELOCITY(ir->eI)) + { + fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]); + } + if (bRerunMD) + { + fprintf(stderr, "starting md rerun '%s', reading coordinates from" + " input trajectory '%s'\n\n", + *(top_global->name), opt2fn("-rerun", nfile, fnm)); + if (bVerbose) + { + fprintf(stderr, "Calculated time to finish depends on nsteps from " + "run input file,\nwhich may not correspond to the time " + "needed to process input trajectory.\n\n"); + } + } + else + { + char tbuf[20]; + fprintf(stderr, "starting mdrun '%s'\n", + *(top_global->name)); + if (ir->nsteps >= 0) + { + sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t); + } + else + { + sprintf(tbuf, "%s", "infinite"); + } + if (ir->init_step > 0) + { + fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n", + gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf, + gmx_step_str(ir->init_step, sbuf2), + ir->init_step*ir->delta_t); + } + else + { + fprintf(stderr, "%s steps, %s ps.\n", + gmx_step_str(ir->nsteps, sbuf), tbuf); + } + } + fprintf(fplog, "\n"); + } + + /* Set and write start time */ + runtime_start(runtime); + print_date_and_time(fplog, cr->nodeid, "Started mdrun", runtime); + wallcycle_start(wcycle, ewcRUN); + if (fplog) + { + fprintf(fplog, "\n"); + } + + /* safest point to do file checkpointing is here. More general point would be immediately before integrator call */ +#ifdef GMX_FAHCORE + chkpt_ret = fcCheckPointParallel( cr->nodeid, + NULL, 0); + if (chkpt_ret == 0) + { + gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 ); + } +#endif + + debug_gmx(); + /*********************************************************** + * + * Loop over MD steps + * + ************************************************************/ + + /* if rerunMD then read coordinates and velocities from input trajectory */ + if (bRerunMD) + { + if (getenv("GMX_FORCE_UPDATE")) + { + bForceUpdate = TRUE; + } + + rerun_fr.natoms = 0; + if (MASTER(cr)) + { + bNotLastFrame = read_first_frame(oenv, &status, + opt2fn("-rerun", nfile, fnm), + &rerun_fr, TRX_NEED_X | TRX_READ_V); + if (rerun_fr.natoms != top_global->natoms) + { + gmx_fatal(FARGS, + "Number of atoms in trajectory (%d) does not match the " + "run input file (%d)\n", + rerun_fr.natoms, top_global->natoms); + } + if (ir->ePBC != epbcNONE) + { + if (!rerun_fr.bBox) + { + gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time); + } + if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong)) + { + gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time); + } + } + } + + if (PAR(cr)) + { + rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame); + } + + if (ir->ePBC != epbcNONE) + { + /* Set the shift vectors. + * Necessary here when have a static box different from the tpr box. 
+ */ + calc_shifts(rerun_fr.box, fr->shift_vec); + } + } + + /* loop over MD steps or if rerunMD to end of input trajectory */ + bFirstStep = TRUE; + /* Skip the first Nose-Hoover integration when we get the state from tpx */ + bStateFromTPX = !bStateFromCP; + bInitStep = bFirstStep && (bStateFromTPX || bVV); + bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep; + bLastStep = FALSE; + bSumEkinhOld = FALSE; + bExchanged = FALSE; + + init_global_signals(&gs, cr, ir, repl_ex_nst); + + step = ir->init_step; + step_rel = 0; + + if (ir->nstlist == -1) + { + init_nlistheuristics(&nlh, bGStatEveryStep, step); + } + + if (MULTISIM(cr) && (repl_ex_nst <= 0 )) + { + /* check how many steps are left in other sims */ + multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps); + } + + + /* and stop now if we should */ + bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) || + ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps ))); + while (!bLastStep || (bRerunMD && bNotLastFrame)) + { + + wallcycle_start(wcycle, ewcSTEP); + + if (bRerunMD) + { + if (rerun_fr.bStep) + { + step = rerun_fr.step; + step_rel = step - ir->init_step; + } + if (rerun_fr.bTime) + { + t = rerun_fr.time; + } + else + { + t = step; + } + } + else + { + bLastStep = (step_rel == ir->nsteps); + t = t0 + step*ir->delta_t; + } + + if (ir->efep != efepNO || ir->bSimTemp) + { + /* find and set the current lambdas. If rerunning, we either read in a state, or a lambda value, + requiring different logic. */ + + set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0); + bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl); + bDoFEP = (do_per_step(step, nstfep) && (ir->efep != efepNO)); + bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0)); + } + + if (bSimAnn) + { + update_annealing_target_temp(&(ir->opts), t); + } + + if (bRerunMD) + { + if (!(DOMAINDECOMP(cr) && !MASTER(cr))) + { + for (i = 0; i < state_global->natoms; i++) + { + copy_rvec(rerun_fr.x[i], state_global->x[i]); + } + if (rerun_fr.bV) + { + for (i = 0; i < state_global->natoms; i++) + { + copy_rvec(rerun_fr.v[i], state_global->v[i]); + } + } + else + { + for (i = 0; i < state_global->natoms; i++) + { + clear_rvec(state_global->v[i]); + } + if (bRerunWarnNoV) + { + fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n" + " Ekin, temperature and pressure are incorrect,\n" + " the virial will be incorrect when constraints are present.\n" + "\n"); + bRerunWarnNoV = FALSE; + } + } + } + copy_mat(rerun_fr.box, state_global->box); + copy_mat(state_global->box, state->box); + + if (vsite && (Flags & MD_RERUN_VSITE)) + { + if (DOMAINDECOMP(cr)) + { + gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition"); + } + if (graph) + { + /* Following is necessary because the graph may get out of sync + * with the coordinates if we only have every N'th coordinate set + */ + mk_mshift(fplog, graph, fr->ePBC, state->box, state->x); + shift_self(graph, state->box, state->x); + } + construct_vsites(vsite, state->x, ir->delta_t, state->v, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, graph, cr, state->box); + if (graph) + { + unshift_self(graph, state->box, state->x); + } + } + } + + /* Stop Center of Mass motion */ + bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm)); + + if (bRerunMD) + { + /* for rerun MD always do Neighbour Searching */ + bNS = (bFirstStep || ir->nstlist != 0); + 
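+            /* Illustrative note (editor's gloss, not part of the upstream sources):
+             * for a rerun every trajectory frame is an independent configuration,
+             * so the pair list is rebuilt for every frame; only nstlist == 0
+             * limits the search to the first frame (via bFirstStep).
+             */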
bNStList = bNS; + } + else + { + /* Determine whether or not to do Neighbour Searching and LR */ + bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); + + bNS = (bFirstStep || bExchanged || bNStList || bDoFEP || + (ir->nstlist == -1 && nlh.nabnsb > 0)); + + if (bNS && ir->nstlist == -1) + { + set_nlistheuristics(&nlh, bFirstStep || bExchanged || bDoFEP, step); + } + } + + /* check whether we should stop because another simulation has + stopped. */ + if (MULTISIM(cr)) + { + if ( (multisim_nsteps >= 0) && (step_rel >= multisim_nsteps) && + (multisim_nsteps != ir->nsteps) ) + { + if (bNS) + { + if (MASTER(cr)) + { + fprintf(stderr, + "Stopping simulation %d because another one has finished\n", + cr->ms->sim); + } + bLastStep = TRUE; + gs.sig[eglsCHKPT] = 1; + } + } + } + + /* < 0 means stop at next step, > 0 means stop at next NS step */ + if ( (gs.set[eglsSTOPCOND] < 0) || + ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) ) + { + bLastStep = TRUE; + } + + /* Determine whether or not to update the Born radii if doing GB */ + bBornRadii = bFirstStep; + if (ir->implicit_solvent && (step % ir->nstgbradii == 0)) + { + bBornRadii = TRUE; + } + + do_log = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep; + do_verbose = bVerbose && + (step % stepout == 0 || bFirstStep || bLastStep); + + if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD)) + { + if (bRerunMD) + { + bMasterState = TRUE; + } + else + { + bMasterState = FALSE; + /* Correct the new box if it is too skewed */ + if (DYNAMIC_BOX(*ir)) + { + if (correct_box(fplog, step, state->box, graph)) + { + bMasterState = TRUE; + } + } + if (DOMAINDECOMP(cr) && bMasterState) + { + dd_collect_state(cr->dd, state, state_global); + } + } + + if (DOMAINDECOMP(cr)) + { + /* Repartition the domain decomposition */ + wallcycle_start(wcycle, ewcDOMDEC); + dd_partition_system(fplog, step, cr, + bMasterState, nstglobalcomm, + state_global, top_global, ir, + state, &f, mdatoms, top, fr, + vsite, shellfc, constr, + nrnb, wcycle, + do_verbose && !bPMETuneRunning); + wallcycle_stop(wcycle, ewcDOMDEC); + /* If using an iterative integrator, reallocate space to match the decomposition */ + } + } + + if (MASTER(cr) && do_log) + { + print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */ + } + + if (ir->efep != efepNO) + { + update_mdatoms(mdatoms, state->lambda[efptMASS]); + } + + if ((bRerunMD && rerun_fr.bV) || bExchanged) + { + + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . */ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, + wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot, + constr, NULL, FALSE, state->box, + top_global, &pcurr, &bSumEkinhOld, + CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE); + } + clear_mat(force_vir); + + /* Ionize the atoms if necessary */ + if (bIonize) + { + ionize(fplog, oenv, mdatoms, top_global, t, ir, state->x, state->v, + mdatoms->start, mdatoms->start+mdatoms->homenr, state->box, cr); + } + + /* We write a checkpoint at this MD step when: + * either at an NS step when we signalled through gs, + * or at the last step (but not when we do not want confout), + * but never at the first step or with rerun. 
+ */ + bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) || + (bLastStep && (Flags & MD_CONFOUT))) && + step > ir->init_step && !bRerunMD); + if (bCPT) + { + gs.set[eglsCHKPT] = 0; + } + + /* Determine the energy and pressure: + * at nstcalcenergy steps and at energy output steps (set below). + */ + if (EI_VV(ir->eI) && (!bInitStep)) + { + /* for vv, the first half of the integration actually corresponds + to the previous step. bCalcEner is only required to be evaluated on the 'next' step, + but the virial needs to be calculated on both the current step and the 'next' step. Future + reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */ + + bCalcEner = do_per_step(step-1, ir->nstcalcenergy); + bCalcVir = bCalcEner || + (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple))); + } + else + { + bCalcEner = do_per_step(step, ir->nstcalcenergy); + bCalcVir = bCalcEner || + (ir->epc != epcNO && do_per_step(step, ir->nstpcouple)); + } + + /* Do we need global communication ? */ + bGStat = (bCalcVir || bCalcEner || bStopCM || + do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) || + (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck)); + + do_ene = (do_per_step(step, ir->nstenergy) || bLastStep); + + if (do_ene || do_log) + { + bCalcVir = TRUE; + bCalcEner = TRUE; + bGStat = TRUE; + } + + /* these CGLO_ options remain the same throughout the iteration */ + cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) | + (bGStat ? CGLO_GSTAT : 0) + ); + + force_flags = (GMX_FORCE_STATECHANGED | + ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) | + GMX_FORCE_ALLFORCES | + GMX_FORCE_SEPLRF | + (bCalcVir ? GMX_FORCE_VIRIAL : 0) | + (bCalcEner ? GMX_FORCE_ENERGY : 0) | + (bDoFEP ? GMX_FORCE_DHDL : 0) + ); + + if (fr->bTwinRange) + { + if (do_per_step(step, ir->nstcalclr)) + { + force_flags |= GMX_FORCE_DO_LR; + } + } + + if (shellfc) + { + /* Now is the time to relax the shells */ + count = relax_shell_flexcon(fplog, cr, bVerbose, step, + ir, bNS, force_flags, + top, + constr, enerd, fcd, + state, f, force_vir, mdatoms, + nrnb, wcycle, graph, groups, + shellfc, fr, bBornRadii, t, mu_tot, + &bConverged, vsite, + outf->fp_field); + tcount += count; + + if (bConverged) + { + nconverged++; + } + } + else + { + /* The coordinates (x) are shifted (to get whole molecules) + * in do_force. + * This is parallellized as well, and does communication too. + * Check comments in sim_util.c + */ + do_force(fplog, cr, ir, step, nrnb, wcycle, top, groups, + state->box, state->x, &state->hist, + f, force_vir, mdatoms, enerd, fcd, + state->lambda, graph, + fr, vsite, mu_tot, t, outf->fp_field, ed, bBornRadii, + (bNS ? GMX_FORCE_NS : 0) | force_flags); + } + + if (bTCR) + { + mu_aver = calc_mu_aver(cr, state->x, mdatoms->chargeA, + mu_tot, &top_global->mols, mdatoms, gnx, grpindex); + } + + if (bTCR && bFirstStep) + { + tcr = init_coupling(fplog, nfile, fnm, cr, fr, mdatoms, &(top->idef)); + fprintf(fplog, "Done init_coupling\n"); + fflush(fplog); + } + + if (bVV && !bStartingFromCpt && !bRerunMD) + /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */ + { + if (ir->eI == eiVV && bInitStep) + { + /* if using velocity verlet with full time step Ekin, + * take the first half step only to compute the + * virial for the first step. From there, + * revert back to the initial coordinates + * so that the input is actually the initial step. 
+ */ + copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */ + } + else + { + /* this is for NHC in the Ekin(t+dt/2) version of vv */ + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1); + } + + /* If we are using twin-range interactions where the long-range component + * is only evaluated every nstcalclr>1 steps, we should do a special update + * step to combine the long-range forces on these steps. + * For nstcalclr=1 this is not done, since the forces would have been added + * directly to the short-range forces already. + */ + bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); + + update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, + f, bUpdateDoLR, fr->f_twin, fcd, + ekind, M, upd, bInitStep, etrtVELOCITY1, + cr, nrnb, constr, &top->idef); + + if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep) + { + gmx_iterate_init(&iterate, TRUE); + } + /* for iterations, we save these vectors, as we will be self-consistently iterating + the calculations */ + + /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */ + + /* save the state */ + if (iterate.bIterationActive) + { + copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts)); + } + + bFirstIterate = TRUE; + while (bFirstIterate || iterate.bIterationActive) + { + if (iterate.bIterationActive) + { + copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts)); + if (bFirstIterate && bTrotter) + { + /* The first time through, we need a decent first estimate + of veta(t+dt) to compute the constraints. Do + this by computing the box volume part of the + trotter integration at this time. Nothing else + should be changed by this routine here. If + !(first time), we start with the previous value + of veta. */ + + veta_save = state->veta; + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0); + vetanew = state->veta; + state->veta = veta_save; + } + } + + bOK = TRUE; + if (!bRerunMD || rerun_fr.bV || bForceUpdate) /* Why is rerun_fr.bV here? Unclear. */ + { + update_constraints(fplog, step, NULL, ir, ekind, mdatoms, + state, fr->bMolPBC, graph, f, + &top->idef, shake_vir, + cr, nrnb, wcycle, upd, constr, + TRUE, bCalcVir, vetanew); + + if (!bOK) + { + gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains"); + } + + } + else if (graph) + { + /* Need to unshift here if a do_force has been + called in the previous step */ + unshift_self(graph, state->box, state->x); + } + + /* if VV, compute the pressure and constraints */ + /* For VV2, we strictly only need this if using pressure + * control, but we really would like to have accurate pressures + * printed out. + * Think about ways around this in the future? + * For now, keep this choice in comments. + */ + /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */ + /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/ + bPres = TRUE; + bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK)); + if (bCalcEner && ir->eI == eiVVAK) /*MRS: 7/9/2010 -- this still doesn't fix it?*/ + { + bSumEkinhOld = TRUE; + } + /* for vv, the first half of the integration actually corresponds to the previous step. 
+ So we need information from the last step in the first half of the integration */ + if (bGStat || do_per_step(step-1, nstglobalcomm)) + { + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, NULL, FALSE, state->box, + top_global, &pcurr, &bSumEkinhOld, + cglo_flags + | CGLO_ENERGY + | (bTemp ? CGLO_TEMPERATURE : 0) + | (bPres ? CGLO_PRESSURE : 0) + | (bPres ? CGLO_CONSTRAINT : 0) + | ((iterate.bIterationActive) ? CGLO_ITERATE : 0) + | (bFirstIterate ? CGLO_FIRSTITERATE : 0) + | CGLO_SCALEEKIN + ); + /* explanation of above: + a) We compute Ekin at the full time step + if 1) we are using the AveVel Ekin, and it's not the + initial step, or 2) if we are using AveEkin, but need the full + time step kinetic energy for the pressure (always true now, since we want accurate statistics). + b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in + EkinAveVel because it's needed for the pressure */ + } + /* temperature scaling and pressure scaling to produce the extended variables at t+dt */ + if (!bInitStep) + { + if (bTrotter) + { + m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */ + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2); + } + else + { + if (bExchanged) + { + + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . */ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, + wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot, + constr, NULL, FALSE, state->box, + top_global, &pcurr, &bSumEkinhOld, + CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE); + } + } + } + + if (iterate.bIterationActive && + done_iterating(cr, fplog, step, &iterate, bFirstIterate, + state->veta, &vetanew)) + { + break; + } + bFirstIterate = FALSE; + } + + if (bTrotter && !bInitStep) + { + copy_mat(shake_vir, state->svir_prev); + copy_mat(force_vir, state->fvir_prev); + if (IR_NVT_TROTTER(ir) && ir->eI == eiVV) + { + /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */ + enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE); + enerd->term[F_EKIN] = trace(ekind->ekin); + } + } + /* if it's the initial step, we performed this first step just to get the constraint virial */ + if (bInitStep && ir->eI == eiVV) + { + copy_rvecn(cbuf, state->v, 0, state->natoms); + } + } + + /* MRS -- now done iterating -- compute the conserved quantity */ + if (bVV) + { + saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ); + if (ir->eI == eiVV) + { + last_ekin = enerd->term[F_EKIN]; + } + if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) + { + saved_conserved_quantity -= enerd->term[F_DISPCORR]; + } + /* sum up the foreign energy and dhdl terms for vv. 
currently done every step so that dhdl is correct in the .edr */
+            if (!bRerunMD)
+            {
+                sum_dhdl(enerd, state->lambda, ir->fepvals);
+            }
+        }
+
+        /* ########  END FIRST UPDATE STEP  ############## */
+        /* ########  If doing VV, we now have v(dt) ###### */
+        if (bDoExpanded)
+        {
+            /* perform extended ensemble sampling in lambda - we don't
+               actually move to the new state before outputting
+               statistics, but if performing simulated tempering, we
+               do update the velocities and the tau_t. */
+
+            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, &df_history, step, mcrng, state->v, mdatoms);
+        }
+        /* ################## START TRAJECTORY OUTPUT ################# */
+
+        /* Now we have the energies and forces corresponding to the
+         * coordinates at time t. We must output all of this before
+         * the update.
+         * for RerunMD t is read from input trajectory
+         */
+        mdof_flags = 0;
+        if (do_per_step(step, ir->nstxout))
+        {
+            mdof_flags |= MDOF_X;
+        }
+        if (do_per_step(step, ir->nstvout))
+        {
+            mdof_flags |= MDOF_V;
+        }
+        if (do_per_step(step, ir->nstfout))
+        {
+            mdof_flags |= MDOF_F;
+        }
+        if (do_per_step(step, ir->nstxtcout))
+        {
+            mdof_flags |= MDOF_XTC;
+        }
+        if (bCPT)
+        {
+            mdof_flags |= MDOF_CPT;
+        }
+        ;
+
+#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
+        if (bLastStep)
+        {
+            /* Enforce writing positions and velocities at end of run */
+            mdof_flags |= (MDOF_X | MDOF_V);
+        }
+#endif
+#ifdef GMX_FAHCORE
+        if (MASTER(cr))
+        {
+            fcReportProgress( ir->nsteps, step );
+        }
+
+        /* sync bCPT and fc record-keeping */
+        if (bCPT && MASTER(cr))
+        {
+            fcRequestCheckPoint();
+        }
+#endif
+
+        if (mdof_flags != 0)
+        {
+            wallcycle_start(wcycle, ewcTRAJ);
+            if (bCPT)
+            {
+                if (state->flags & (1<<estLD_RNG))
+                {
+                    get_stochd_state(upd, state);
+                }
+                if (state->flags & (1<<estMC_RNG))
+                {
+                    get_mc_state(mcrng, state);
+                }
+                if (MASTER(cr))
+                {
+                    if (bSumEkinhOld)
+                    {
+                        state_global->ekinstate.bUpToDate = FALSE;
+                    }
+                    else
+                    {
+                        update_ekinstate(&state_global->ekinstate, ekind);
+                        state_global->ekinstate.bUpToDate = TRUE;
+                    }
+                    update_energyhistory(&state_global->enerhist, mdebin);
+                    if (ir->efep != efepNO || ir->bSimTemp)
+                    {
+                        state_global->fep_state = state->fep_state; /* MRS: seems kludgy.  The code should be
+                                                                       structured so this isn't necessary.
+                                                                       Note this reassignment is only necessary
+                                                                       for single threads.*/
+                        copy_df_history(&state_global->dfhist, &df_history);
+                    }
+                }
+            }
+            write_traj(fplog, cr, outf, mdof_flags, top_global,
+                       step, t, state, state_global, f, f_global, &n_xtc, &x_xtc);
+            if (bCPT)
+            {
+                nchkpt++;
+                bCPT = FALSE;
+            }
+            debug_gmx();
+            if (bLastStep && step_rel == ir->nsteps &&
+                (Flags & MD_CONFOUT) && MASTER(cr) &&
+                !bRerunMD)
+            {
+                /* x and v have been collected in write_traj,
+                 * because a checkpoint file will always be written
+                 * at the last step.
+                 */
+                fprintf(stderr, "\nWriting final coordinates.\n");
+                if (fr->bMolPBC)
+                {
+                    /* Make molecules whole only for confout writing */
+                    do_pbc_mtop(fplog, ir->ePBC, state->box, top_global, state_global->x);
+                }
+                write_sto_conf_mtop(ftp2fn(efSTO, nfile, fnm),
+                                    *top_global->name, top_global,
+                                    state_global->x, state_global->v,
+                                    ir->ePBC, state->box);
+                debug_gmx();
+            }
+            wallcycle_stop(wcycle, ewcTRAJ);
+        }
+
+        /* kludge -- virial is lost with restart for NPT control.
Must restart */ + if (bStartingFromCpt && bVV) + { + copy_mat(state->svir_prev, shake_vir); + copy_mat(state->fvir_prev, force_vir); + } + /* ################## END TRAJECTORY OUTPUT ################ */ + + /* Determine the wallclock run time up till now */ + run_time = gmx_gettime() - (double)runtime->real; + + /* Check whether everything is still allright */ + if (((int)gmx_get_stop_condition() > handled_stop_condition) +#ifdef GMX_THREAD_MPI + && MASTER(cr) +#endif + ) + { + /* this is just make gs.sig compatible with the hack + of sending signals around by MPI_Reduce with together with + other floats */ + if (gmx_get_stop_condition() == gmx_stop_cond_next_ns) + { + gs.sig[eglsSTOPCOND] = 1; + } + if (gmx_get_stop_condition() == gmx_stop_cond_next) + { + gs.sig[eglsSTOPCOND] = -1; + } + /* < 0 means stop at next step, > 0 means stop at next NS step */ + if (fplog) + { + fprintf(fplog, + "\n\nReceived the %s signal, stopping at the next %sstep\n\n", + gmx_get_signal_name(), + gs.sig[eglsSTOPCOND] == 1 ? "NS " : ""); + fflush(fplog); + } + fprintf(stderr, + "\n\nReceived the %s signal, stopping at the next %sstep\n\n", + gmx_get_signal_name(), + gs.sig[eglsSTOPCOND] == 1 ? "NS " : ""); + fflush(stderr); + handled_stop_condition = (int)gmx_get_stop_condition(); + } + else if (MASTER(cr) && (bNS || ir->nstlist <= 0) && + (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) && + gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0) + { + /* Signal to terminate the run */ + gs.sig[eglsSTOPCOND] = 1; + if (fplog) + { + fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); + } + fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99); + } + + if (bResetCountersHalfMaxH && MASTER(cr) && + run_time > max_hours*60.0*60.0*0.495) + { + gs.sig[eglsRESETCOUNTERS] = 1; + } + + if (ir->nstlist == -1 && !bRerunMD) + { + /* When bGStatEveryStep=FALSE, global_stat is only called + * when we check the atom displacements, not at NS steps. + * This means that also the bonded interaction count check is not + * performed immediately after NS. Therefore a few MD steps could + * be performed with missing interactions. + * But wrong energies are never written to file, + * since energies are only written after global_stat + * has been called. + */ + if (step >= nlh.step_nscheck) + { + nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs, + nlh.scale_tot, state->x); + } + else + { + /* This is not necessarily true, + * but step_nscheck is determined quite conservatively. + */ + nlh.nabnsb = 0; + } + } + + /* In parallel we only have to check for checkpointing in steps + * where we do global communication, + * otherwise the other nodes don't know. 
+ */ + if (MASTER(cr) && ((bGStat || !PAR(cr)) && + cpt_period >= 0 && + (cpt_period == 0 || + run_time >= nchkpt*cpt_period*60.0)) && + gs.set[eglsCHKPT] == 0) + { + gs.sig[eglsCHKPT] = 1; + } + + /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */ + if (EI_VV(ir->eI)) + { + if (!bInitStep) + { + update_tcouple(step, ir, state, ekind, upd, &MassQ, mdatoms); + } + if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */ + { + gmx_bool bIfRandomize; + bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr); + /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */ + if (constr && bIfRandomize) + { + update_constraints(fplog, step, NULL, ir, ekind, mdatoms, + state, fr->bMolPBC, graph, f, + &top->idef, tmp_vir, + cr, nrnb, wcycle, upd, constr, + TRUE, bCalcVir, vetanew); + } + } + } + + if (bIterativeCase && do_per_step(step, ir->nstpcouple)) + { + gmx_iterate_init(&iterate, TRUE); + /* for iterations, we save these vectors, as we will be redoing the calculations */ + copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts)); + } + + bFirstIterate = TRUE; + while (bFirstIterate || iterate.bIterationActive) + { + /* We now restore these vectors to redo the calculation with improved extended variables */ + if (iterate.bIterationActive) + { + copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts)); + } + + /* We make the decision to break or not -after- the calculation of Ekin and Pressure, + so scroll down for that logic */ + + /* ######### START SECOND UPDATE STEP ################# */ + /* Box is changed in update() when we do pressure coupling, + * but we should still use the old box for energy corrections and when + * writing it to the energy file, so it matches the trajectory files for + * the same timestep above. Make a copy in a separate array. + */ + copy_mat(state->box, lastbox); + + bOK = TRUE; + dvdl_constr = 0; + + if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate)) + { + wallcycle_start(wcycle, ewcUPDATE); + /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */ + if (bTrotter) + { + if (iterate.bIterationActive) + { + if (bFirstIterate) + { + scalevir = 1; + } + else + { + /* we use a new value of scalevir to converge the iterations faster */ + scalevir = tracevir/trace(shake_vir); + } + msmul(shake_vir, scalevir, shake_vir); + m_add(force_vir, shake_vir, total_vir); + clear_mat(shake_vir); + } + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3); + /* We can only do Berendsen coupling after we have summed + * the kinetic energy or virial. Since the happens + * in global_state after update, we should only do it at + * step % nstlist = 1 with bGStatEveryStep=FALSE. + */ + } + else + { + update_tcouple(step, ir, state, ekind, upd, &MassQ, mdatoms); + update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep); + } + + if (bVV) + { + bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); + + /* velocity half-step update */ + update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, + bUpdateDoLR, fr->f_twin, fcd, + ekind, M, upd, FALSE, etrtVELOCITY2, + cr, nrnb, constr, &top->idef); + } + + /* Above, initialize just copies ekinh into ekin, + * it doesn't copy position (for VV), + * and entire integrator for MD. 
+ */ + + if (ir->eI == eiVVAK) + { + copy_rvecn(state->x, cbuf, 0, state->natoms); + } + bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); + + update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, + bUpdateDoLR, fr->f_twin, fcd, + ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef); + wallcycle_stop(wcycle, ewcUPDATE); + + update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state, + fr->bMolPBC, graph, f, + &top->idef, shake_vir, + cr, nrnb, wcycle, upd, constr, + FALSE, bCalcVir, state->veta); + + if (ir->eI == eiVVAK) + { + /* erase F_EKIN and F_TEMP here? */ + /* just compute the kinetic energy at the half step to perform a trotter step */ + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, NULL, FALSE, lastbox, + top_global, &pcurr, &bSumEkinhOld, + cglo_flags | CGLO_TEMPERATURE + ); + wallcycle_start(wcycle, ewcUPDATE); + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4); + /* now we know the scaling, we can compute the positions again again */ + copy_rvecn(cbuf, state->x, 0, state->natoms); + + bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); + + update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f, + bUpdateDoLR, fr->f_twin, fcd, + ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef); + wallcycle_stop(wcycle, ewcUPDATE); + + /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */ + /* are the small terms in the shake_vir here due + * to numerical errors, or are they important + * physically? I'm thinking they are just errors, but not completely sure. + * For now, will call without actually constraining, constr=NULL*/ + update_constraints(fplog, step, NULL, ir, ekind, mdatoms, + state, fr->bMolPBC, graph, f, + &top->idef, tmp_vir, + cr, nrnb, wcycle, upd, NULL, + FALSE, bCalcVir, + state->veta); + } + if (!bOK) + { + gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains"); + } + + if (fr->bSepDVDL && fplog && do_log) + { + gmx_print_sepdvdl(fplog, "Constraint dV/dl", 0.0, dvdl_constr); + } + if (bVV) + { + /* this factor or 2 correction is necessary + because half of the constraint force is removed + in the vv step, so we have to double it. See + the Redmine issue #1255. It is not yet clear + if the factor of 2 is exact, or just a very + good approximation, and this will be + investigated. The next step is to see if this + can be done adding a dhdl contribution from the + rattle step, but this is somewhat more + complicated with the current code. Will be + investigated, hopefully for 4.6.3. However, + this current solution is much better than + having it completely wrong. 
+ */ + enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr; + } + else + { + enerd->term[F_DVDL_CONSTR] += dvdl_constr; + } + } + else if (graph) + { + /* Need to unshift here */ + unshift_self(graph, state->box, state->x); + } + + if (vsite != NULL) + { + wallcycle_start(wcycle, ewcVSITECONSTR); + if (graph != NULL) + { + shift_self(graph, state->box, state->x); + } + construct_vsites(vsite, state->x, ir->delta_t, state->v, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, graph, cr, state->box); + + if (graph != NULL) + { + unshift_self(graph, state->box, state->x); + } + wallcycle_stop(wcycle, ewcVSITECONSTR); + } + + /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */ + /* With Leap-Frog we can skip compute_globals at + * non-communication steps, but we need to calculate + * the kinetic energy one step before communication. + */ + if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm))) + { + if (ir->nstlist == -1 && bFirstIterate) + { + gs.sig[eglsNABNSB] = nlh.nabnsb; + } + compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm, + wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot, + constr, + bFirstIterate ? &gs : NULL, + (step_rel % gs.nstms == 0) && + (multisim_nsteps < 0 || (step_rel < multisim_nsteps)), + lastbox, + top_global, &pcurr, &bSumEkinhOld, + cglo_flags + | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0) + | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0) + | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) + | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) + | (iterate.bIterationActive ? CGLO_ITERATE : 0) + | (bFirstIterate ? CGLO_FIRSTITERATE : 0) + | CGLO_CONSTRAINT + ); + if (ir->nstlist == -1 && bFirstIterate) + { + nlh.nabnsb = gs.set[eglsNABNSB]; + gs.set[eglsNABNSB] = 0; + } + } + /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */ + /* ############# END CALC EKIN AND PRESSURE ################# */ + + /* Note: this is OK, but there are some numerical precision issues with using the convergence of + the virial that should probably be addressed eventually. state->veta has better properies, + but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could + generate the new shake_vir, but test the veta value for convergence. This will take some thought. */ + + if (iterate.bIterationActive && + done_iterating(cr, fplog, step, &iterate, bFirstIterate, + trace(shake_vir), &tracevir)) + { + break; + } + bFirstIterate = FALSE; + } + + if (!bVV || bRerunMD) + { + /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */ + sum_dhdl(enerd, state->lambda, ir->fepvals); + } + update_box(fplog, step, ir, mdatoms, state, f, + ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, upd); + + /* ################# END UPDATE STEP 2 ################# */ + /* #### We now have r(t+dt) and v(t+dt/2) ############# */ + + /* The coordinates (x) were unshifted in update */ + if (!bGStat) + { + /* We will not sum ekinh_old, + * so signal that we still have to do it. + */ + bSumEkinhOld = TRUE; + } + + if (bTCR) + { + /* Only do GCT when the relaxation of shells (minimization) has converged, + * otherwise we might be coupling to bogus energies. + * In parallel we must always do this, because the other sims might + * update the FF. + */ + + /* Since this is called with the new coordinates state->x, I assume + * we want the new box state->box too. 
/ EL 20040121 + */ + do_coupling(fplog, oenv, nfile, fnm, tcr, t, step, enerd->term, fr, + ir, MASTER(cr), + mdatoms, &(top->idef), mu_aver, + top_global->mols.nr, cr, + state->box, total_vir, pres, + mu_tot, state->x, f, bConverged); + debug_gmx(); + } + + /* ######### BEGIN PREPARING EDR OUTPUT ########### */ + + /* use the directly determined last velocity, not actually the averaged half steps */ + if (bTrotter && ir->eI == eiVV) + { + enerd->term[F_EKIN] = last_ekin; + } + enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN]; + + if (bVV) + { + enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity; + } + else + { + enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ); + } + /* Check for excessively large energies */ + if (bIonize) + { +#ifdef GMX_DOUBLE + real etot_max = 1e200; +#else + real etot_max = 1e30; +#endif + if (fabs(enerd->term[F_ETOT]) > etot_max) + { + fprintf(stderr, "Energy too large (%g), giving up\n", + enerd->term[F_ETOT]); + } + } + /* ######### END PREPARING EDR OUTPUT ########### */ + + /* Time for performance */ + if (((step % stepout) == 0) || bLastStep) + { + runtime_upd_proc(runtime); + } + + /* Output stuff */ + if (MASTER(cr)) + { + gmx_bool do_dr, do_or; + + if (fplog && do_log && bDoExpanded) + { + /* only needed if doing expanded ensemble */ + PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL, + &df_history, state->fep_state, ir->nstlog, step); + } + if (!(bStartingFromCpt && (EI_VV(ir->eI)))) + { + if (bCalcEner) + { + upd_mdebin(mdebin, bDoDHDL, TRUE, + t, mdatoms->tmass, enerd, state, + ir->fepvals, ir->expandedvals, lastbox, + shake_vir, force_vir, total_vir, pres, + ekind, mu_tot, constr); + } + else + { + upd_mdebin_step(mdebin); + } + + do_dr = do_per_step(step, ir->nstdisreout); + do_or = do_per_step(step, ir->nstorireout); + + print_ebin(outf->fp_ene, do_ene, do_dr, do_or, do_log ? fplog : NULL, + step, t, + eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts)); + } + if (ir->ePull != epullNO) + { + pull_print_output(ir->pull, step, t); + } + + if (do_per_step(step, ir->nstlog)) + { + if (fflush(fplog) != 0) + { + gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?"); + } + } + } + if (bDoExpanded) + { + /* Have to do this part after outputting the logfile and the edr file */ + state->fep_state = lamnew; + for (i = 0; i < efptNR; i++) + { + state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew]; + } + } + /* Remaining runtime */ + if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning) + { + if (shellfc) + { + fprintf(stderr, "\n"); + } + print_time(stderr, runtime, step, ir, cr); + } + + /* Replica exchange */ + bExchanged = FALSE; + if ((repl_ex_nst > 0) && (step > 0) && !bLastStep && + do_per_step(step, repl_ex_nst)) + { + bExchanged = replica_exchange(fplog, cr, repl_ex, + state_global, enerd, + state, step, t); + + if (bExchanged && DOMAINDECOMP(cr)) + { + dd_partition_system(fplog, step, cr, TRUE, 1, + state_global, top_global, ir, + state, &f, mdatoms, top, fr, + vsite, shellfc, constr, + nrnb, wcycle, FALSE); + } + } + + bFirstStep = FALSE; + bInitStep = FALSE; + bStartingFromCpt = FALSE; + + /* ####### SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */ + /* With all integrators, except VV, we need to retain the pressure + * at the current step for coupling at the next step. 
+         */
+        if ((state->flags & (1<<estPRES_PREV)) &&
+            (bGStatEveryStep ||
+             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+        {
+            /* Store the pressure in t_state for pressure coupling
+             * at the next MD step.
+             */
+            copy_mat(pres, state->pres_prev);
+        }
+
+        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
+
+        if ( (membed != NULL) && (!bLastStep) )
+        {
+            rescale_membed(step_rel, membed, state_global->x);
+        }
+
+        if (bRerunMD)
+        {
+            if (MASTER(cr))
+            {
+                /* read next frame from input trajectory */
+                bNotLastFrame = read_next_frame(oenv, status, &rerun_fr);
+            }
+
+            if (PAR(cr))
+            {
+                rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
+            }
+        }
+
+        if (!bRerunMD || !rerun_fr.bStep)
+        {
+            /* increase the MD step number */
+            step++;
+            step_rel++;
+        }
+
+        cycles = wallcycle_stop(wcycle, ewcSTEP);
+        if (DOMAINDECOMP(cr) && wcycle)
+        {
+            dd_cycles_add(cr->dd, cycles, ddCyclStep);
+        }
+
+        if (bPMETuneRunning || bPMETuneTry)
+        {
+            /* PME grid + cut-off optimization with GPUs or PME nodes */
+
+            /* Count the total cycles over the last steps */
+            cycles_pmes += cycles;
+
+            /* We can only switch cut-off at NS steps */
+            if (step % ir->nstlist == 0)
+            {
+                /* PME grid + cut-off optimization with GPUs or PME nodes */
+                if (bPMETuneTry)
+                {
+                    if (DDMASTER(cr->dd))
+                    {
+                        /* PME node load is too high, start tuning */
+                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
+                    }
+                    dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
+
+                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
+                    {
+                        bPMETuneTry = FALSE;
+                    }
+                }
+                if (bPMETuneRunning)
+                {
+                    /* init_step might not be a multiple of nstlist,
+                     * but the first cycle is always skipped anyhow.
+                     */
+                    bPMETuneRunning =
+                        pme_load_balance(pme_loadbal, cr,
+                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
+                                         fplog,
+                                         ir, state, cycles_pmes,
+                                         fr->ic, fr->nbv, &fr->pmedata,
+                                         step);
+
+                    /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
+                    fr->ewaldcoeff = fr->ic->ewaldcoeff;
+                    fr->rlist      = fr->ic->rlist;
+                    fr->rlistlong  = fr->ic->rlistlong;
+                    fr->rcoulomb   = fr->ic->rcoulomb;
+                    fr->rvdw       = fr->ic->rvdw;
+                }
+                cycles_pmes = 0;
+            }
+        }
+
+        if (step_rel == wcycle_get_reset_counters(wcycle) ||
+            gs.set[eglsRESETCOUNTERS] != 0)
+        {
+            /* Reset all the counters related to performance over the run */
+            reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, runtime,
+                               fr->nbv != NULL && fr->nbv->bUseGPU ?
fr->nbv->cu_nbv : NULL); + wcycle_set_reset_counters(wcycle, -1); + if (!(cr->duty & DUTY_PME)) + { + /* Tell our PME node to reset its counters */ + gmx_pme_send_resetcounters(cr, step); + } + /* Correct max_hours for the elapsed time */ + max_hours -= run_time/(60.0*60.0); + bResetCountersHalfMaxH = FALSE; + gs.set[eglsRESETCOUNTERS] = 0; + } + + } + /* End of main MD loop */ + debug_gmx(); + + /* Stop the time */ + runtime_end(runtime); + + if (bRerunMD && MASTER(cr)) + { + close_trj(status); + } + + if (!(cr->duty & DUTY_PME)) + { + /* Tell the PME only node to finish */ + gmx_pme_send_finish(cr); + } + + if (MASTER(cr)) + { + if (ir->nstcalcenergy > 0 && !bRerunMD) + { + print_ebin(outf->fp_ene, FALSE, FALSE, FALSE, fplog, step, t, + eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts)); + } + } + + done_mdoutf(outf); + + debug_gmx(); + + if (ir->nstlist == -1 && nlh.nns > 0 && fplog) + { + fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns))); + fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns); + } + + if (pme_loadbal != NULL) + { + pme_loadbal_done(pme_loadbal, cr, fplog, + fr->nbv != NULL && fr->nbv->bUseGPU); + } + + if (shellfc && fplog) + { + fprintf(fplog, "Fraction of iterations that converged: %.2f %%\n", + (nconverged*100.0)/step_rel); + fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n", + tcount/step_rel); + } + + if (repl_ex_nst > 0 && MASTER(cr)) + { + print_replica_exchange_statistics(fplog, repl_ex); + } + + runtime->nsteps_done = step_rel; + + return 0; +} diff --cc src/programs/mdrun/runner.c index 1936e1b37c,0000000000..53117dcd1f mode 100644,000000..100644 --- a/src/programs/mdrun/runner.c +++ b/src/programs/mdrun/runner.c @@@ -1,1668 -1,0 +1,1668 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. 
+ * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#include +#include + +#include "typedefs.h" +#include "smalloc.h" +#include "sysstuff.h" +#include "statutil.h" +#include "force.h" +#include "mdrun.h" +#include "md_logging.h" +#include "md_support.h" +#include "network.h" +#include "pull.h" +#include "pull_rotation.h" +#include "names.h" +#include "disre.h" +#include "orires.h" +#include "pme.h" +#include "mdatoms.h" +#include "repl_ex.h" +#include "qmmm.h" +#include "domdec.h" +#include "partdec.h" +#include "coulomb.h" +#include "constr.h" +#include "mvdata.h" +#include "checkpoint.h" +#include "mtop_util.h" +#include "sighandler.h" +#include "tpxio.h" +#include "txtdump.h" +#include "gmx_detect_hardware.h" +#include "gmx_omp_nthreads.h" +#include "pull_rotation.h" +#include "calc_verletbuf.h" +#include "../mdlib/nbnxn_search.h" +#include "../mdlib/nbnxn_consts.h" +#include "gmx_fatal_collective.h" +#include "membed.h" +#include "macros.h" +#include "gmx_omp.h" +#include "gmx_thread_affinity.h" + +#include "gromacs/utility/gmxmpi.h" + +#ifdef GMX_FAHCORE +#include "corewrap.h" +#endif + +#include "gpu_utils.h" +#include "nbnxn_cuda_data_mgmt.h" + +typedef struct { + gmx_integrator_t *func; +} gmx_intp_t; + +/* The array should match the eI array in include/types/enums.h */ +const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}}; + +gmx_large_int_t deform_init_init_step_tpx; +matrix deform_init_box_tpx; +#ifdef GMX_THREAD_MPI +tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER; +#endif + + +#ifdef GMX_THREAD_MPI +struct mdrunner_arglist +{ + gmx_hw_opt_t *hw_opt; + FILE *fplog; + t_commrec *cr; + int nfile; + const t_filenm *fnm; + output_env_t oenv; + gmx_bool bVerbose; + gmx_bool bCompact; + int nstglobalcomm; + ivec ddxyz; + int dd_node_order; + real rdd; + real rconstr; + const char *dddlb_opt; + real dlb_scale; + const char *ddcsx; + const char *ddcsy; + const char *ddcsz; + const char *nbpu_opt; + gmx_large_int_t nsteps_cmdline; + int nstepout; + int resetstep; + int nmultisim; + int repl_ex_nst; + int repl_ex_nex; + int repl_ex_seed; + real pforce; + real cpt_period; + real max_hours; + const char *deviceOptions; + unsigned long Flags; + int ret; /* return value */ +}; + + +/* The function used for spawning threads. Extracts the mdrunner() + arguments from its one argument and calls mdrunner(), after making + a commrec. */ +static void mdrunner_start_fn(void *arg) +{ + struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg; + struct mdrunner_arglist mc = *mda; /* copy the arg list to make sure + that it's thread-local. This doesn't + copy pointed-to items, of course, + but those are all const. 
*/ + t_commrec *cr; /* we need a local version of this */ + FILE *fplog = NULL; + t_filenm *fnm; + + fnm = dup_tfn(mc.nfile, mc.fnm); + + cr = init_par_threads(mc.cr); + + if (MASTER(cr)) + { + fplog = mc.fplog; + } + + mda->ret = mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv, + mc.bVerbose, mc.bCompact, mc.nstglobalcomm, + mc.ddxyz, mc.dd_node_order, mc.rdd, + mc.rconstr, mc.dddlb_opt, mc.dlb_scale, + mc.ddcsx, mc.ddcsy, mc.ddcsz, + mc.nbpu_opt, + mc.nsteps_cmdline, mc.nstepout, mc.resetstep, + mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce, + mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags); +} + +/* called by mdrunner() to start a specific number of threads (including + the main thread) for thread-parallel runs. This in turn calls mdrunner() + for each thread. + All options besides nthreads are the same as for mdrunner(). */ +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt, + FILE *fplog, t_commrec *cr, int nfile, + const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, + gmx_bool bCompact, int nstglobalcomm, + ivec ddxyz, int dd_node_order, real rdd, real rconstr, + const char *dddlb_opt, real dlb_scale, + const char *ddcsx, const char *ddcsy, const char *ddcsz, + const char *nbpu_opt, + gmx_large_int_t nsteps_cmdline, + int nstepout, int resetstep, + int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, + real pforce, real cpt_period, real max_hours, + const char *deviceOptions, unsigned long Flags) +{ + int ret; + struct mdrunner_arglist *mda; + t_commrec *crn; /* the new commrec */ + t_filenm *fnmn; + + /* first check whether we even need to start tMPI */ + if (hw_opt->nthreads_tmpi < 2) + { + return cr; + } + + /* a few small, one-time, almost unavoidable memory leaks: */ + snew(mda, 1); + fnmn = dup_tfn(nfile, fnm); + + /* fill the data structure to pass as void pointer to thread start fn */ + mda->hw_opt = hw_opt; + mda->fplog = fplog; + mda->cr = cr; + mda->nfile = nfile; + mda->fnm = fnmn; + mda->oenv = oenv; + mda->bVerbose = bVerbose; + mda->bCompact = bCompact; + mda->nstglobalcomm = nstglobalcomm; + mda->ddxyz[XX] = ddxyz[XX]; + mda->ddxyz[YY] = ddxyz[YY]; + mda->ddxyz[ZZ] = ddxyz[ZZ]; + mda->dd_node_order = dd_node_order; + mda->rdd = rdd; + mda->rconstr = rconstr; + mda->dddlb_opt = dddlb_opt; + mda->dlb_scale = dlb_scale; + mda->ddcsx = ddcsx; + mda->ddcsy = ddcsy; + mda->ddcsz = ddcsz; + mda->nbpu_opt = nbpu_opt; + mda->nsteps_cmdline = nsteps_cmdline; + mda->nstepout = nstepout; + mda->resetstep = resetstep; + mda->nmultisim = nmultisim; + mda->repl_ex_nst = repl_ex_nst; + mda->repl_ex_nex = repl_ex_nex; + mda->repl_ex_seed = repl_ex_seed; + mda->pforce = pforce; + mda->cpt_period = cpt_period; + mda->max_hours = max_hours; + mda->deviceOptions = deviceOptions; + mda->Flags = Flags; + + /* now spawn new threads that start mdrunner_start_fn(), while + the main thread returns, we set thread affinity later */ + ret = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE, + mdrunner_start_fn, (void*)(mda) ); + if (ret != TMPI_SUCCESS) + { + return NULL; + } + + /* make a new comm_rec to reflect the new situation */ + crn = init_par_threads(cr); + return crn; +} + + +static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo, + const gmx_hw_opt_t *hw_opt, + int nthreads_tot, + int ngpu) +{ + int nthreads_tmpi; + + /* There are no separate PME nodes here, as we ensured in + * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes + * and a conditional ensures we would not 
have ended up here. + * Note that separate PME nodes might be switched on later. + */ + if (ngpu > 0) + { + nthreads_tmpi = ngpu; + if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi) + { + nthreads_tmpi = nthreads_tot; + } + } + else if (hw_opt->nthreads_omp > 0) + { + /* Here we could oversubscribe, when we do, we issue a warning later */ + nthreads_tmpi = max(1, nthreads_tot/hw_opt->nthreads_omp); + } + else + { + /* TODO choose nthreads_omp based on hardware topology + when we have a hardware topology detection library */ + /* In general, when running up to 4 threads, OpenMP should be faster. + * Note: on AMD Bulldozer we should avoid running OpenMP over two dies. + * On Intel>=Nehalem running OpenMP on a single CPU is always faster, + * even on two CPUs it's usually faster (but with many OpenMP threads + * it could be faster not to use HT, currently we always use HT). + * On Nehalem/Westmere we want to avoid running 16 threads over + * two CPUs with HT, so we need a limit<16; thus we use 12. + * A reasonable limit for Intel Sandy and Ivy bridge, + * not knowing the topology, is 16 threads. + */ + const int nthreads_omp_always_faster = 4; + const int nthreads_omp_always_faster_Nehalem = 12; + const int nthreads_omp_always_faster_SandyBridge = 16; + const int first_model_Nehalem = 0x1A; + const int first_model_SandyBridge = 0x2A; + gmx_bool bIntel_Family6; + + bIntel_Family6 = + (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL && + gmx_cpuid_family(hwinfo->cpuid_info) == 6); + + if (nthreads_tot <= nthreads_omp_always_faster || + (bIntel_Family6 && + ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) || + (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge)))) + { + /* Use pure OpenMP parallelization */ + nthreads_tmpi = 1; + } + else + { + /* Don't use OpenMP parallelization */ + nthreads_tmpi = nthreads_tot; + } + } + + return nthreads_tmpi; +} + + +/* Get the number of threads to use for thread-MPI based on how many + * were requested, which algorithms we're using, + * and how many particles there are. + * At the point we have already called check_and_update_hw_opt. + * Thus all options should be internally consistent and consistent + * with the hardware, except that ntmpi could be larger than #GPU. + */ +static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo, + gmx_hw_opt_t *hw_opt, + t_inputrec *inputrec, gmx_mtop_t *mtop, + const t_commrec *cr, + FILE *fplog) +{ + int nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu; + int min_atoms_per_mpi_thread; + char *env; + char sbuf[STRLEN]; + gmx_bool bCanUseGPU; + + if (hw_opt->nthreads_tmpi > 0) + { + /* Trivial, return right away */ + return hw_opt->nthreads_tmpi; + } + + nthreads_hw = hwinfo->nthreads_hw_avail; + + /* How many total (#tMPI*#OpenMP) threads can we start? 
*/ + if (hw_opt->nthreads_tot > 0) + { + nthreads_tot_max = hw_opt->nthreads_tot; + } + else + { + nthreads_tot_max = nthreads_hw; + } + + bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU); + if (bCanUseGPU) + { + ngpu = hwinfo->gpu_info.ncuda_dev_use; + } + else + { + ngpu = 0; + } + + nthreads_tmpi = + get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu); + + if (inputrec->eI == eiNM || EI_TPI(inputrec->eI)) + { + /* Dims/steps are divided over the nodes iso splitting the atoms */ + min_atoms_per_mpi_thread = 0; + } + else + { + if (bCanUseGPU) + { + min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU; + } + else + { + min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD; + } + } + + /* Check if an algorithm does not support parallel simulation. */ + if (nthreads_tmpi != 1 && + ( inputrec->eI == eiLBFGS || + inputrec->coulombtype == eelEWALD ) ) + { + nthreads_tmpi = 1; + + md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n"); + if (hw_opt->nthreads_tmpi > nthreads_tmpi) + { + gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that"); + } + } + else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread) + { + /* the thread number was chosen automatically, but there are too many + threads (too few atoms per thread) */ + nthreads_new = max(1, mtop->natoms/min_atoms_per_mpi_thread); + + /* Avoid partial use of Hyper-Threading */ + if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED && + nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw) + { + nthreads_new = nthreads_hw/2; + } + + /* Avoid large prime numbers in the thread count */ + if (nthreads_new >= 6) + { + /* Use only 6,8,10 with additional factors of 2 */ + int fac; + + fac = 2; + while (3*fac*2 <= nthreads_new) + { + fac *= 2; + } + + nthreads_new = (nthreads_new/fac)*fac; + } + else + { + /* Avoid 5 */ + if (nthreads_new == 5) + { + nthreads_new = 4; + } + } + + nthreads_tmpi = nthreads_new; + + fprintf(stderr, "\n"); + fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n"); + fprintf(stderr, " only starting %d thread-MPI threads.\n", nthreads_tmpi); + fprintf(stderr, " You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n"); + } + + return nthreads_tmpi; +} +#endif /* GMX_THREAD_MPI */ + + +/* Environment variable for setting nstlist */ +static const char* NSTLIST_ENVVAR = "GMX_NSTLIST"; +/* Try to increase nstlist when using a GPU with nstlist less than this */ +static const int NSTLIST_GPU_ENOUGH = 20; +/* Increase nstlist until the non-bonded cost increases more than this factor */ +static const float NBNXN_GPU_LIST_OK_FAC = 1.25; +/* Don't increase nstlist beyond a non-bonded cost increases of this factor */ +static const float NBNXN_GPU_LIST_MAX_FAC = 1.40; + +/* Try to increase nstlist when running on a GPU */ +static void increase_nstlist(FILE *fp, t_commrec *cr, + t_inputrec *ir, const gmx_mtop_t *mtop, matrix box) +{ + char *env; + int nstlist_orig, nstlist_prev; + verletbuf_list_setup_t ls; + real rlist_inc, rlist_ok, rlist_max, rlist_new, rlist_prev; + int i; + t_state state_tmp; + gmx_bool bBox, bDD, bCont; + const char *nstl_fmt = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n"; + const char *vbd_err = "Can not increase nstlist for GPU run because 
verlet-buffer-drift is not set or used"; + const char *box_err = "Can not increase nstlist for GPU run because the box is too small"; + const char *dd_err = "Can not increase nstlist for GPU run because of domain decomposition limitations"; + char buf[STRLEN]; + + /* Number of + nstlist alternative values to try when switching */ + const int nstl[] = { 20, 25, 40, 50 }; +#define NNSTL sizeof(nstl)/sizeof(nstl[0]) + + env = getenv(NSTLIST_ENVVAR); + if (env == NULL) + { + if (fp != NULL) + { + fprintf(fp, nstl_fmt, ir->nstlist); + } + } + + if (ir->verletbuf_drift == 0) + { + gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp"); + } + + if (ir->verletbuf_drift < 0) + { + if (MASTER(cr)) + { + fprintf(stderr, "%s\n", vbd_err); + } + if (fp != NULL) + { + fprintf(fp, "%s\n", vbd_err); + } + + return; + } + + nstlist_orig = ir->nstlist; + if (env != NULL) + { + sprintf(buf, "Getting nstlist from environment variable GMX_NSTLIST=%s", env); + if (MASTER(cr)) + { + fprintf(stderr, "%s\n", buf); + } + if (fp != NULL) + { + fprintf(fp, "%s\n", buf); + } + sscanf(env, "%d", &ir->nstlist); + } + + verletbuf_get_list_setup(TRUE, &ls); + + /* Allow rlist to make the list double the size of the cut-off sphere */ + rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE, mtop->natoms/det(box)); + rlist_ok = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC, 1.0/3.0) - rlist_inc; + rlist_max = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC, 1.0/3.0) - rlist_inc; + if (debug) + { + fprintf(debug, "GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n", + rlist_inc, rlist_max); + } + + i = 0; + nstlist_prev = nstlist_orig; + rlist_prev = ir->rlist; + do + { + if (env == NULL) + { + ir->nstlist = nstl[i]; + } + + /* Set the pair-list buffer size in ir */ + calc_verlet_buffer_size(mtop, det(box), ir, ir->verletbuf_drift, &ls, + NULL, &rlist_new); + + /* Does rlist fit in the box? */ + bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box)); + bDD = TRUE; + if (bBox && DOMAINDECOMP(cr)) + { + /* Check if rlist fits in the domain decomposition */ + if (inputrec2nboundeddim(ir) < DIM) + { + gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet"); + } + copy_mat(box, state_tmp.box); + bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new); + } + + bCont = FALSE; + + if (env == NULL) + { + if (bBox && bDD && rlist_new <= rlist_max) + { + /* Increase nstlist */ + nstlist_prev = ir->nstlist; + rlist_prev = rlist_new; + bCont = (i+1 < NNSTL && rlist_new < rlist_ok); + } + else + { + /* Stick with the previous nstlist */ + ir->nstlist = nstlist_prev; + rlist_new = rlist_prev; + bBox = TRUE; + bDD = TRUE; + } + } + + i++; + } + while (bCont); + + if (!bBox || !bDD) + { + gmx_warning(!bBox ? box_err : dd_err); + if (fp != NULL) + { + fprintf(fp, "\n%s\n", bBox ? 
box_err : dd_err); + } + ir->nstlist = nstlist_orig; + } + else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist) + { + sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g", + nstlist_orig, ir->nstlist, + ir->rlist, rlist_new); + if (MASTER(cr)) + { + fprintf(stderr, "%s\n\n", buf); + } + if (fp != NULL) + { + fprintf(fp, "%s\n\n", buf); + } + ir->rlist = rlist_new; + ir->rlistlong = rlist_new; + } +} + +static void prepare_verlet_scheme(FILE *fplog, + const gmx_hw_info_t *hwinfo, + t_commrec *cr, + t_inputrec *ir, + const gmx_mtop_t *mtop, + matrix box, + gmx_bool *bUseGPU) +{ + /* Here we only check for GPU usage on the MPI master process, + * as here we don't know how many GPUs we will use yet. + * We check for a GPU on all processes later. + */ + *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL); + + if (ir->verletbuf_drift > 0) + { + /* Update the Verlet buffer size for the current run setup */ + verletbuf_list_setup_t ls; + real rlist_new; + + /* Here we assume CPU acceleration is on. But as currently + * calc_verlet_buffer_size gives the same results for 4x8 and 4x4 + * and 4x2 gives a larger buffer than 4x4, this is ok. + */ + verletbuf_get_list_setup(*bUseGPU, &ls); + + calc_verlet_buffer_size(mtop, det(box), ir, + ir->verletbuf_drift, &ls, + NULL, &rlist_new); + if (rlist_new != ir->rlist) + { + if (fplog != NULL) + { + fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n", + ir->rlist, rlist_new, + ls.cluster_size_i, ls.cluster_size_j); + } + ir->rlist = rlist_new; + ir->rlistlong = rlist_new; + } + } + + /* With GPU or emulation we should check nstlist for performance */ + if ((EI_DYNAMICS(ir->eI) && + *bUseGPU && + ir->nstlist < NSTLIST_GPU_ENOUGH) || + getenv(NSTLIST_ENVVAR) != NULL) + { + /* Choose a better nstlist */ + increase_nstlist(fplog, cr, ir, mtop, box); + } +} + +static void convert_to_verlet_scheme(FILE *fplog, + t_inputrec *ir, + gmx_mtop_t *mtop, real box_vol) +{ + char *conv_mesg = "Converting input file with group cut-off scheme to the Verlet cut-off scheme"; + + md_print_warn(NULL, fplog, "%s\n", conv_mesg); + + ir->cutoff_scheme = ecutsVERLET; + ir->verletbuf_drift = 0.005; + + if (ir->rcoulomb != ir->rvdw) + { + gmx_fatal(FARGS, "The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs"); + } + + if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype)) + { + gmx_fatal(FARGS, "User non-bonded potentials are not (yet) supported with the Verlet scheme"); + } + else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype)) + { + md_print_warn(NULL, fplog, "Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials"); + + if (EVDW_SWITCHED(ir->vdwtype)) + { + ir->vdwtype = evdwCUT; + } + if (EEL_SWITCHED(ir->coulombtype)) + { + if (EEL_FULL(ir->coulombtype)) + { + /* With full electrostatic only PME can be switched */ + ir->coulombtype = eelPME; + } + else + { + md_print_warn(NULL, fplog, "NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n", eel_names[ir->coulombtype]); + ir->coulombtype = eelRF; + ir->epsilon_rf = 0.0; + } + } + + /* We set the target energy drift to a small number. + * Note that this is only for testing. For production the user + * should think about this and set the mdp options. 
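+ * The value of 1e-4 used just below is deliberately much stricter than
+ * the 0.005 set earlier in this conversion routine, so runs converted
+ * from switched or shifted interactions end up with a generous
+ * pair-list buffer.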
+ */ + ir->verletbuf_drift = 1e-4; + } + + if (inputrec2nboundeddim(ir) != 3) + { + gmx_fatal(FARGS, "Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc"); + } + + if (ir->efep != efepNO || ir->implicit_solvent != eisNO) + { + gmx_fatal(FARGS, "Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent"); + } + + if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO)) + { + verletbuf_list_setup_t ls; + + verletbuf_get_list_setup(FALSE, &ls); + calc_verlet_buffer_size(mtop, box_vol, ir, ir->verletbuf_drift, &ls, + NULL, &ir->rlist); + } + else + { + ir->verletbuf_drift = -1; + ir->rlist = 1.05*max(ir->rvdw, ir->rcoulomb); + } + + gmx_mtop_remove_chargegroups(mtop); +} + +static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt, + int cutoff_scheme, + gmx_bool bIsSimMaster) +{ + gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster); + +#ifndef GMX_THREAD_MPI + if (hw_opt->nthreads_tot > 0) + { + gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI"); + } + if (hw_opt->nthreads_tmpi > 0) + { + gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI"); + } +#endif + +#ifndef GMX_OPENMP + if (hw_opt->nthreads_omp > 1) + { + gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but Gromacs was compiled without OpenMP support"); + } + hw_opt->nthreads_omp = 1; +#endif + + if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0) + { + /* We have the same number of OpenMP threads for PP and PME processes, + * thus we can perform several consistency checks. + */ + if (hw_opt->nthreads_tmpi > 0 && + hw_opt->nthreads_omp > 0 && + hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp) + { + gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested", + hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp); + } + + if (hw_opt->nthreads_tmpi > 0 && + hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0) + { + gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)", + hw_opt->nthreads_tot, hw_opt->nthreads_tmpi); + } + + if (hw_opt->nthreads_omp > 0 && + hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0) + { + gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)", + hw_opt->nthreads_tot, hw_opt->nthreads_omp); + } + + if (hw_opt->nthreads_tmpi > 0 && + hw_opt->nthreads_omp <= 0) + { + hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi; + } + } + +#ifndef GMX_OPENMP + if (hw_opt->nthreads_omp > 1) + { + gmx_fatal(FARGS, "OpenMP threads are requested, but Gromacs was compiled without OpenMP support"); + } +#endif + + if (cutoff_scheme == ecutsGROUP) + { + /* We only have OpenMP support for PME only nodes */ + if (hw_opt->nthreads_omp > 1) + { + gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s", + ecutscheme_names[cutoff_scheme], + ecutscheme_names[ecutsVERLET]); + } + hw_opt->nthreads_omp = 1; + } + + if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0) + { + gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme"); + } + + if (hw_opt->nthreads_tot == 1) + { + hw_opt->nthreads_tmpi = 1; + 
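+ /* Only one thread in total, so only one thread-MPI rank is possible;
+ * an explicit OpenMP request larger than 1 is inconsistent and is
+ * rejected just below, otherwise nthreads_omp is forced to 1 as well. */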
+ if (hw_opt->nthreads_omp > 1) + { + gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads", + hw_opt->nthreads_tmpi, hw_opt->nthreads_tot); + } + hw_opt->nthreads_omp = 1; + } + + if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0) + { + hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp; + } + + if (debug) + { + fprintf(debug, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n", + hw_opt->nthreads_tot, + hw_opt->nthreads_tmpi, + hw_opt->nthreads_omp, + hw_opt->nthreads_omp_pme, + hw_opt->gpu_id != NULL ? hw_opt->gpu_id : ""); + + } +} + + +/* Override the value in inputrec with value passed on the command line (if any) */ +static void override_nsteps_cmdline(FILE *fplog, + gmx_large_int_t nsteps_cmdline, + t_inputrec *ir, + const t_commrec *cr) +{ + char sbuf[STEPSTRSIZE]; + + assert(ir); + assert(cr); + + /* override with anything else than the default -2 */ + if (nsteps_cmdline > -2) + { + char stmp[STRLEN]; + + ir->nsteps = nsteps_cmdline; + if (EI_DYNAMICS(ir->eI)) + { + sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps, %.3f ps", + gmx_step_str(nsteps_cmdline, sbuf), + nsteps_cmdline*ir->delta_t); + } + else + { + sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps", + gmx_step_str(nsteps_cmdline, sbuf)); + } + + md_print_warn(cr, fplog, "%s\n", stmp); + } +} + +/* Data structure set by SIMMASTER which needs to be passed to all nodes + * before the other nodes have read the tpx file and called gmx_detect_hardware. + */ +typedef struct { + int cutoff_scheme; /* The cutoff scheme from inputrec_t */ + gmx_bool bUseGPU; /* Use GPU or GPU emulation */ +} master_inf_t; + +int mdrunner(gmx_hw_opt_t *hw_opt, + FILE *fplog, t_commrec *cr, int nfile, + const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, + gmx_bool bCompact, int nstglobalcomm, + ivec ddxyz, int dd_node_order, real rdd, real rconstr, + const char *dddlb_opt, real dlb_scale, + const char *ddcsx, const char *ddcsy, const char *ddcsz, + const char *nbpu_opt, + gmx_large_int_t nsteps_cmdline, int nstepout, int resetstep, + int nmultisim, int repl_ex_nst, int repl_ex_nex, + int repl_ex_seed, real pforce, real cpt_period, real max_hours, + const char *deviceOptions, unsigned long Flags) +{ + gmx_bool bForceUseGPU, bTryUseGPU; + double nodetime = 0, realtime; + t_inputrec *inputrec; + t_state *state = NULL; + matrix box; + gmx_ddbox_t ddbox = {0}; + int npme_major, npme_minor; + real tmpr1, tmpr2; + t_nrnb *nrnb; + gmx_mtop_t *mtop = NULL; + t_mdatoms *mdatoms = NULL; + t_forcerec *fr = NULL; + t_fcdata *fcd = NULL; + real ewaldcoeff = 0; + gmx_pme_t *pmedata = NULL; + gmx_vsite_t *vsite = NULL; + gmx_constr_t constr; + int i, m, nChargePerturbed = -1, status, nalloc; + char *gro; + gmx_wallcycle_t wcycle; + gmx_bool bReadRNG, bReadEkin; + int list; + gmx_runtime_t runtime; + int rc; + gmx_large_int_t reset_counters; + gmx_edsam_t ed = NULL; + t_commrec *cr_old = cr; + int nthreads_pme = 1; + int nthreads_pp = 1; + gmx_membed_t membed = NULL; + gmx_hw_info_t *hwinfo = NULL; + master_inf_t minf = {-1, FALSE}; + + /* CAUTION: threads may be started later on in this function, so + cr doesn't reflect the final parallel state right now */ + snew(inputrec, 1); + snew(mtop, 1); + + if (Flags & MD_APPENDFILES) + { + fplog = NULL; + } + + bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0); + bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU; + + /* Detect hardware, gather information. 
This is an operation that is + * global for this process (MPI rank). */ + hwinfo = gmx_detect_hardware(fplog, cr, + bForceUseGPU, bTryUseGPU, hw_opt->gpu_id); + + + snew(state, 1); + if (SIMMASTER(cr)) + { + /* Read (nearly) all data required for the simulation */ + read_tpx_state(ftp2fn(efTPX, nfile, fnm), inputrec, state, NULL, mtop); + + if (inputrec->cutoff_scheme != ecutsVERLET && + ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL)) + { + convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box)); + } + + + minf.cutoff_scheme = inputrec->cutoff_scheme; + minf.bUseGPU = FALSE; + + if (inputrec->cutoff_scheme == ecutsVERLET) + { + prepare_verlet_scheme(fplog, hwinfo, cr, + inputrec, mtop, state->box, + &minf.bUseGPU); + } + else if (hwinfo->bCanUseGPU) + { + md_print_warn(cr, fplog, + "NOTE: GPU(s) found, but the current simulation can not use GPUs\n" + " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n" + " (for quick performance testing you can use the -testverlet option)\n"); + + if (bForceUseGPU) + { + gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet"); + } + } + } +#ifndef GMX_THREAD_MPI + if (PAR(cr)) + { + gmx_bcast_sim(sizeof(minf), &minf, cr); + } +#endif + if (minf.bUseGPU && cr->npmenodes == -1) + { + /* Don't automatically use PME-only nodes with GPUs */ + cr->npmenodes = 0; + } + + /* Check for externally set OpenMP affinity and turn off internal + * pinning if any is found. We need to do this check early to tell + * thread-MPI whether it should do pinning when spawning threads. + * TODO: the above no longer holds, we should move these checks down + */ + gmx_omp_check_thread_affinity(fplog, cr, hw_opt); + +#ifdef GMX_THREAD_MPI + /* With thread-MPI inputrec is only set here on the master thread */ + if (SIMMASTER(cr)) +#endif + { + check_and_update_hw_opt(hw_opt, minf.cutoff_scheme, SIMMASTER(cr)); + +#ifdef GMX_THREAD_MPI + /* Early check for externally set process affinity. Can't do over all + * MPI processes because hwinfo is not available everywhere, but with + * thread-MPI it's needed as pinning might get turned off which needs + * to be known before starting thread-MPI. */ + gmx_check_thread_affinity_set(fplog, + NULL, + hw_opt, hwinfo->nthreads_hw_avail, FALSE); +#endif + +#ifdef GMX_THREAD_MPI + if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0) + { + gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes"); + } +#endif + + if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp && + cr->npmenodes <= 0) + { + gmx_fatal(FARGS, "You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes"); + } + } + +#ifdef GMX_THREAD_MPI + if (SIMMASTER(cr)) + { + /* NOW the threads will be started: */ + hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo, + hw_opt, + inputrec, mtop, + cr, fplog); + if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0) + { + hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi; + } + + if (hw_opt->nthreads_tmpi > 1) + { + /* now start the threads. */ + cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm, + oenv, bVerbose, bCompact, nstglobalcomm, + ddxyz, dd_node_order, rdd, rconstr, + dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz, + nbpu_opt, + nsteps_cmdline, nstepout, resetstep, nmultisim, + repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce, + cpt_period, max_hours, deviceOptions, + Flags); + /* the main thread continues here with a new cr. 
We don't deallocate + the old cr because other threads may still be reading it. */ + if (cr == NULL) + { + gmx_comm("Failed to spawn threads"); + } + } + } +#endif + /* END OF CAUTION: cr is now reliable */ + + /* g_membed initialisation * + * Because we change the mtop, init_membed is called before the init_parallel * + * (in case we ever want to make it run in parallel) */ + if (opt2bSet("-membed", nfile, fnm)) + { + if (MASTER(cr)) + { + fprintf(stderr, "Initializing membed"); + } + membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period); + } + + if (PAR(cr)) + { + /* now broadcast everything to the non-master nodes/threads: */ + init_parallel(cr, inputrec, mtop); + + /* This check needs to happen after get_nthreads_mpi() */ + if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC)) + { + gmx_fatal_collective(FARGS, cr, NULL, + "The Verlet cut-off scheme is not supported with particle decomposition.\n" + "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads."); + } + } + if (fplog != NULL) + { + pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE); + } + + /* now make sure the state is initialized and propagated */ + set_state_entries(state, inputrec, cr->nnodes); + + /* A parallel command line option consistency check that we can + only do after any threads have started. */ + if (!PAR(cr) && + (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0)) + { + gmx_fatal(FARGS, + "The -dd or -npme option request a parallel simulation, " +#ifndef GMX_MPI + "but %s was compiled without threads or MPI enabled" +#else +#ifdef GMX_THREAD_MPI + "but the number of threads (option -nt) is 1" +#else + "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec" +#endif +#endif + , ShortProgram() + ); + } + + if ((Flags & MD_RERUN) && + (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI)) + { + gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun"); + } + + if (can_use_allvsall(inputrec, TRUE, cr, fplog) && PAR(cr)) + { + /* Simple neighbour searching and (also?) all-vs-all loops + * do not work with domain decomposition. */ + Flags |= MD_PARTDEC; + } + + if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC)) + { + if (cr->npmenodes > 0) + { + if (!EEL_PME(inputrec->coulombtype)) + { + gmx_fatal_collective(FARGS, cr, NULL, + "PME nodes are requested, but the system does not use PME electrostatics"); + } + if (Flags & MD_PARTDEC) + { + gmx_fatal_collective(FARGS, cr, NULL, + "PME nodes are requested, but particle decomposition does not support separate PME nodes"); + } + } + + cr->npmenodes = 0; + } + +#ifdef GMX_FAHCORE + fcRegisterSteps(inputrec->nsteps, inputrec->init_step); +#endif + + /* NMR restraints must be initialized before load_checkpoint, + * since with time averaging the history is added to t_state. + * For proper consistency check we therefore need to extend + * t_state here. + * So the PME-only nodes (if present) will also initialize + * the distance restraints. 
+ */ + snew(fcd, 1); + + /* This needs to be called before read_checkpoint to extend the state */ + init_disres(fplog, mtop, inputrec, cr, Flags & MD_PARTDEC, fcd, state, repl_ex_nst > 0); + + if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0) + { + if (PAR(cr) && !(Flags & MD_PARTDEC)) + { + gmx_fatal(FARGS, "Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)"); + } + /* Orientation restraints */ + if (MASTER(cr)) + { + init_orires(fplog, mtop, state->x, inputrec, cr->ms, &(fcd->orires), + state); + } + } + + if (DEFORM(*inputrec)) + { + /* Store the deform reference box before reading the checkpoint */ + if (SIMMASTER(cr)) + { + copy_mat(state->box, box); + } + if (PAR(cr)) + { + gmx_bcast(sizeof(box), box, cr); + } + /* Because we do not have the update struct available yet + * in which the reference values should be stored, + * we store them temporarily in static variables. + * This should be thread safe, since they are only written once + * and with identical values. + */ +#ifdef GMX_THREAD_MPI + tMPI_Thread_mutex_lock(&deform_init_box_mutex); +#endif + deform_init_init_step_tpx = inputrec->init_step; + copy_mat(box, deform_init_box_tpx); +#ifdef GMX_THREAD_MPI + tMPI_Thread_mutex_unlock(&deform_init_box_mutex); +#endif + } + + if (opt2bSet("-cpi", nfile, fnm)) + { + /* Check if checkpoint file exists before doing continuation. + * This way we can use identical input options for the first and subsequent runs... + */ + if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) ) + { + load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog, + cr, Flags & MD_PARTDEC, ddxyz, + inputrec, state, &bReadRNG, &bReadEkin, + (Flags & MD_APPENDFILES), + (Flags & MD_APPENDFILESSET)); + + if (bReadRNG) + { + Flags |= MD_READ_RNG; + } + if (bReadEkin) + { + Flags |= MD_READ_EKIN; + } + } + } + + if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES)) +#ifdef GMX_THREAD_MPI + /* With thread MPI only the master node/thread exists in mdrun.c, + * therefore non-master nodes need to open the "seppot" log file here. 
+ */ + || (!MASTER(cr) && (Flags & MD_SEPPOT)) +#endif + ) + { + gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, !(Flags & MD_SEPPOT), + Flags, &fplog); + } + + /* override nsteps with value from cmdline */ + override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr); + + if (SIMMASTER(cr)) + { + copy_mat(state->box, box); + } + + if (PAR(cr)) + { + gmx_bcast(sizeof(box), box, cr); + } + + /* Essential dynamics */ + if (opt2bSet("-ei", nfile, fnm)) + { + /* Open input and output files, allocate space for ED data structure */ + ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr); + } + + if (PAR(cr) && !((Flags & MD_PARTDEC) || + EI_TPI(inputrec->eI) || + inputrec->eI == eiNM)) + { + cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr, + dddlb_opt, dlb_scale, + ddcsx, ddcsy, ddcsz, + mtop, inputrec, + box, state->x, + &ddbox, &npme_major, &npme_minor); + + make_dd_communicators(fplog, cr, dd_node_order); + + /* Set overallocation to avoid frequent reallocation of arrays */ + set_over_alloc_dd(TRUE); + } + else + { + /* PME, if used, is done on all nodes with 1D decomposition */ + cr->npmenodes = 0; + cr->duty = (DUTY_PP | DUTY_PME); + npme_major = 1; + npme_minor = 1; + /* NM and TPI perform single node energy calculations in parallel */ + if (!(inputrec->eI == eiNM || EI_TPI(inputrec->eI))) + { + npme_major = cr->nnodes; + } + + if (inputrec->ePBC == epbcSCREW) + { + gmx_fatal(FARGS, + "pbc=%s is only implemented with domain decomposition", + epbc_names[inputrec->ePBC]); + } + } + + if (PAR(cr)) + { + /* After possible communicator splitting in make_dd_communicators. + * we can set up the intra/inter node communication. + */ + gmx_setup_nodecomm(fplog, cr); + } + + /* Initialize per-physical-node MPI process/thread ID and counters. */ + gmx_init_intranode_counters(cr); + +#ifdef GMX_MPI + md_print_info(cr, fplog, "Using %d MPI %s\n", + cr->nnodes, +#ifdef GMX_THREAD_MPI + cr->nnodes == 1 ? "thread" : "threads" +#else + cr->nnodes == 1 ? "process" : "processes" +#endif + ); + fflush(stderr); +#endif + + gmx_omp_nthreads_init(fplog, cr, + hwinfo->nthreads_hw_avail, + hw_opt->nthreads_omp, + hw_opt->nthreads_omp_pme, + (cr->duty & DUTY_PP) == 0, + inputrec->cutoff_scheme == ecutsVERLET); + + /* check consistency and decide on the number of gpus to use. */ + gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, + minf.bUseGPU); + + /* getting number of PP/PME threads + PME: env variable should be read only on one node to make sure it is + identical everywhere; + */ + /* TODO nthreads_pp is only used for pinning threads. + * This is a temporary solution until we have a hw topology library. + */ + nthreads_pp = gmx_omp_nthreads_get(emntNonbonded); + nthreads_pme = gmx_omp_nthreads_get(emntPME); + + wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme); + + if (PAR(cr)) + { + /* Master synchronizes its value of reset_counters with all nodes + * including PME only nodes */ + reset_counters = wcycle_get_reset_counters(wcycle); + gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr); + wcycle_set_reset_counters(wcycle, reset_counters); + } + + snew(nrnb, 1); + if (cr->duty & DUTY_PP) + { + /* For domain decomposition we allocate dynamically + * in dd_partition_system. 
+ */ + if (DOMAINDECOMP(cr)) + { + bcast_state_setup(cr, state); + } + else + { + if (PAR(cr)) + { + bcast_state(cr, state, TRUE); + } + } + + /* Initiate forcerecord */ + fr = mk_forcerec(); + fr->hwinfo = hwinfo; + init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box, + opt2fn("-table", nfile, fnm), + opt2fn("-tabletf", nfile, fnm), + opt2fn("-tablep", nfile, fnm), + opt2fn("-tableb", nfile, fnm), + nbpu_opt, + FALSE, pforce); + + /* version for PCA_NOT_READ_NODE (see md.c) */ + /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE, + "nofile","nofile","nofile","nofile",FALSE,pforce); + */ + fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT); + + /* Initialize QM-MM */ + if (fr->bQMMM) + { + init_QMMMrec(cr, mtop, inputrec, fr); + } + + /* Initialize the mdatoms structure. + * mdatoms is not filled with atom data, + * as this can not be done now with domain decomposition. + */ + mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO); + + if (mdatoms->nPerturbed > 0 && inputrec->cutoff_scheme == ecutsVERLET) + { + gmx_fatal(FARGS, "The Verlet cut-off scheme does not (yet) support free-energy calculations with perturbed atoms, only perturbed interactions. This will be implemented soon. Use the group scheme for now."); + } + + /* Initialize the virtual site communication */ + vsite = init_vsite(mtop, cr, FALSE); + + calc_shifts(box, fr->shift_vec); + + /* With periodic molecules the charge groups should be whole at start up + * and the virtual sites should not be far from their proper positions. + */ + if (!inputrec->bContinuation && MASTER(cr) && + !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols)) + { + /* Make molecules whole at start of run */ + if (fr->ePBC != epbcNONE) + { + do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x); + } + if (vsite) + { + /* Correct initial vsite positions are required + * for the initial distribution in the domain decomposition + * and for the initial shell prediction. + */ + construct_vsites_mtop(vsite, mtop, state->x); + } + } + + if (EEL_PME(fr->eeltype)) + { + ewaldcoeff = fr->ewaldcoeff; + pmedata = &fr->pmedata; + } + else + { + pmedata = NULL; + } + } + else + { + /* This is a PME only node */ + + /* We don't need the state */ + done_state(state); + + ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol); + snew(pmedata, 1); + } + + if (hw_opt->thread_affinity != threadaffOFF) + { + /* Before setting affinity, check whether the affinity has changed + * - which indicates that probably the OpenMP library has changed it + * since we first checked). + */ + gmx_check_thread_affinity_set(fplog, cr, + hw_opt, hwinfo->nthreads_hw_avail, TRUE); + + /* Set the CPU affinity */ + gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo); + } + + /* Initiate PME if necessary, + * either on all nodes or on dedicated PME nodes only. */ + if (EEL_PME(inputrec->coulombtype)) + { + if (mdatoms) + { + nChargePerturbed = mdatoms->nChargePerturbed; + } + if (cr->npmenodes > 0) + { + /* The PME only nodes need to know nChargePerturbed */ + gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr); + } + + if (cr->duty & DUTY_PME) + { + status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec, + mtop ? 
mtop->natoms : 0, nChargePerturbed, + (Flags & MD_REPRODUCIBLE), nthreads_pme); + if (status != 0) + { + gmx_fatal(FARGS, "Error %d initializing PME", status); + } + } + } + + + if (integrator[inputrec->eI].func == do_md) + { + /* Turn on signal handling on all nodes */ + /* + * (A user signal from the PME nodes (if any) + * is communicated to the PP nodes. + */ + signal_handler_install(); + } + + if (cr->duty & DUTY_PP) + { + if (inputrec->ePull != epullNO) + { + /* Initialize pull code */ + init_pull(fplog, inputrec, nfile, fnm, mtop, cr, oenv, inputrec->fepvals->init_lambda, + EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags); + } + + if (inputrec->bRot) + { + /* Initialize enforced rotation code */ + init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv, + bVerbose, Flags); + } + + constr = init_constraints(fplog, mtop, inputrec, ed, state, cr); + + if (DOMAINDECOMP(cr)) + { + dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec, + Flags & MD_DDBONDCHECK, fr->cginfo_mb); + + set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox); + + setup_dd_grid(fplog, cr->dd); + } + + /* Now do whatever the user wants us to do (how flexible...) */ + integrator[inputrec->eI].func(fplog, cr, nfile, fnm, + oenv, bVerbose, bCompact, + nstglobalcomm, + vsite, constr, + nstepout, inputrec, mtop, + fcd, state, + mdatoms, nrnb, wcycle, ed, fr, + repl_ex_nst, repl_ex_nex, repl_ex_seed, + membed, + cpt_period, max_hours, + deviceOptions, + Flags, + &runtime); + + if (inputrec->ePull != epullNO) + { + finish_pull(inputrec->pull); + } + + if (inputrec->bRot) + { + finish_rot(inputrec->rot); + } + + } + else + { + /* do PME only */ - gmx_pmeonly(*pmedata, cr, nrnb, wcycle, ewaldcoeff, inputrec); ++ gmx_pmeonly(*pmedata, cr, nrnb, wcycle, &runtime, ewaldcoeff, inputrec); + } + + if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI)) + { + /* Some timing stats */ + if (SIMMASTER(cr)) + { + if (runtime.proc == 0) + { + runtime.proc = runtime.real; + } + } + else + { + runtime.real = 0; + } + } + + wallcycle_stop(wcycle, ewcRUN); + + /* Finish up, write some stuff + * if rerunMD, don't write last frame again + */ + finish_run(fplog, cr, + inputrec, nrnb, wcycle, &runtime, + fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ? + nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL, + EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr)); + + if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU) + { + char gpu_err_str[STRLEN]; + + /* free GPU memory and uninitialize GPU (by destroying the context) */ + nbnxn_cuda_free(fplog, fr->nbv->cu_nbv); + + if (!free_gpu(gpu_err_str)) + { + gmx_warning("On node %d failed to free GPU #%d: %s", + cr->nodeid, get_current_gpu_device_id(), gpu_err_str); + } + } + + if (opt2bSet("-membed", nfile, fnm)) + { + sfree(membed); + } + + gmx_hardware_info_free(hwinfo); + + /* Does what it says */ + print_date_and_time(fplog, cr->nodeid, "Finished mdrun", &runtime); + + /* Close logfile already here if we were appending to it */ + if (MASTER(cr) && (Flags & MD_APPENDFILES)) + { + gmx_log_close(fplog); + } + + rc = (int)gmx_get_stop_condition(); + +#ifdef GMX_THREAD_MPI + /* we need to join all threads. The sub-threads join when they + exit this function, but the master thread needs to be told to + wait for that. */ + if (PAR(cr) && MASTER(cr)) + { + tMPI_Finalize(); + } +#endif + + return rc; +}
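
For reference, the rounding rule that get_nthreads_mpi() above applies to an automatically chosen thread-MPI thread count (keeping large prime factors out of it, since a prime count such as 7, 11 or 13 leaves the domain-decomposition grid little freedom to factorize) can be exercised in isolation. The sketch below is illustrative only and is not part of this patch; avoid_large_prime_thread_count() is a made-up helper name, but its body reproduces the logic in the code above: from 6 upwards only 6, 8 or 10 times a power of two are kept, and 5 is reduced to 4.

#include <stdio.h>

/* Illustrative sketch, not part of this patch: reproduces the thread-count
 * rounding used when nthreads_tmpi is chosen automatically in
 * get_nthreads_mpi() above. */
static int avoid_large_prime_thread_count(int nthreads)
{
    if (nthreads >= 6)
    {
        /* Allow only 6, 8 or 10 times a power of two */
        int fac = 2;

        while (3*fac*2 <= nthreads)
        {
            fac *= 2;
        }

        return (nthreads/fac)*fac;
    }

    /* Below 6 only 5 is problematic */
    return (nthreads == 5) ? 4 : nthreads;
}

int main(void)
{
    int n;

    for (n = 1; n <= 16; n++)
    {
        /* e.g. 5 -> 4, 7 -> 6, 11 -> 10, 13 -> 12 */
        printf("%2d -> %2d\n", n, avoid_large_prime_thread_count(n));
    }

    return 0;
}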