From: Roland Schulz Date: Thu, 27 Jun 2013 07:57:35 +0000 (-0400) Subject: Merge branch 'release-4-6' into master X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=cc6daae88401ad68b564de821490da31d49331c2;p=alexxy%2Fgromacs.git Merge branch 'release-4-6' into master Conflicts: CMakeLists.txt cmake/ThreadMPI.cmake src/gromacs/gmxana/calcpot.c src/gromacs/gmxana/calcpot.h src/gromacs/legacyheaders/pull_rotation.h src/tools/CMakeLists.txt Resolution was straightforward; always in favour of version already in master branch. Removed calcpot.[ch]. Change-Id: I7ad7a6d9e34f30e04f71c52d707065c6e14b68f3 --- cc6daae88401ad68b564de821490da31d49331c2 diff --cc src/gromacs/gmxana/gmx_bar.c index 45ae72b224,0000000000..2fa8ecbde3 mode 100644,000000..100644 --- a/src/gromacs/gmxana/gmx_bar.c +++ b/src/gromacs/gmxana/gmx_bar.c @@@ -1,3953 -1,0 +1,3950 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Green Red Orange Magenta Azure Cyan Skyblue + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include + +#include "sysstuff.h" +#include "typedefs.h" +#include "smalloc.h" +#include "futil.h" +#include "statutil.h" +#include "copyrite.h" +#include "macros.h" +#include "enxio.h" +#include "physics.h" +#include "gmx_fatal.h" +#include "xvgr.h" +#include "gmx_ana.h" +#include "maths.h" +#include "string2.h" +#include "names.h" +#include "mdebin.h" + + +/* Structure for the names of lambda vector components */ +typedef struct lambda_components_t +{ + char **names; /* Array of strings with names for the lambda vector + components */ + int N; /* The number of components */ + int Nalloc; /* The number of allocated components */ +} lambda_components_t; + +/* Structure for a lambda vector or a dhdl derivative direction */ +typedef struct lambda_vec_t +{ + double *val; /* The lambda vector component values. Only valid if + dhdl == -1 */ + int dhdl; /* The coordinate index for the derivative described by this + structure, or -1 */ + const lambda_components_t *lc; /* the associated lambda_components + structure */ + int index; /* The state number (init-lambda-state) of this lambda + vector, if known. 
If not, it is set to -1 */ +} lambda_vec_t; + +/* the dhdl.xvg data from a simulation */ +typedef struct xvg_t +{ + const char *filename; + int ftp; /* file type */ + int nset; /* number of lambdas, including dhdl */ + int *np; /* number of data points (du or hists) per lambda */ + int np_alloc; /* number of points (du or hists) allocated */ + double temp; /* temperature */ + lambda_vec_t *lambda; /* the lambdas (of first index for y). */ + double *t; /* the times (of second index for y) */ + double **y; /* the dU values. y[0] holds the derivative, while + further ones contain the energy differences between + the native lambda and the 'foreign' lambdas. */ + lambda_vec_t native_lambda; /* the native lambda */ + + struct xvg_t *next, *prev; /*location in the global linked list of xvg_ts*/ +} xvg_t; + + +typedef struct hist_t +{ + unsigned int *bin[2]; /* the (forward + reverse) histogram values */ + double dx[2]; /* the histogram spacing. The reverse + dx is the negative of the forward dx.*/ + gmx_large_int_t x0[2]; /* the (forward + reverse) histogram start + point(s) as int */ + + int nbin[2]; /* the (forward+reverse) number of bins */ + gmx_large_int_t sum; /* the total number of counts. Must be + the same for forward + reverse. */ + int nhist; /* number of hist datas (forward or reverse) */ + + double start_time, delta_time; /* start time, end time of histogram */ +} hist_t; + + +/* an aggregate of samples for partial free energy calculation */ +typedef struct samples_t +{ + lambda_vec_t *native_lambda; /* pointer to native lambda vector */ + lambda_vec_t *foreign_lambda; /* pointer to foreign lambda vector */ + double temp; /* the temperature */ + gmx_bool derivative; /* whether this sample is a derivative */ + + /* The samples come either as either delta U lists: */ + int ndu; /* the number of delta U samples */ + double *du; /* the delta u's */ + double *t; /* the times associated with those samples, or: */ + double start_time, delta_time; /*start time and delta time for linear time*/ + + /* or as histograms: */ + hist_t *hist; /* a histogram */ + + /* allocation data: (not NULL for data 'owned' by this struct) */ + double *du_alloc, *t_alloc; /* allocated delta u arrays */ + size_t ndu_alloc, nt_alloc; /* pre-allocated sizes */ + hist_t *hist_alloc; /* allocated hist */ + + gmx_large_int_t ntot; /* total number of samples */ + const char *filename; /* the file name this sample comes from */ +} samples_t; + +/* a sample range (start to end for du-style data, or boolean + for both du-style data and histograms */ +typedef struct sample_range_t +{ + int start, end; /* start and end index for du style data */ + gmx_bool use; /* whether to use this sample */ + + samples_t *s; /* the samples this range belongs to */ +} sample_range_t; + + +/* a collection of samples for a partial free energy calculation + (i.e. 
the collection of samples from one native lambda to one + foreign lambda) */ +typedef struct sample_coll_t +{ + lambda_vec_t *native_lambda; /* these should be the same for all samples + in the histogram */ + lambda_vec_t *foreign_lambda; /* collection */ + double temp; /* the temperature */ + + int nsamples; /* the number of samples */ + samples_t **s; /* the samples themselves */ + sample_range_t *r; /* the sample ranges */ + int nsamples_alloc; /* number of allocated samples */ + + gmx_large_int_t ntot; /* total number of samples in the ranges of + this collection */ + + struct sample_coll_t *next, *prev; /* next and previous in the list */ +} sample_coll_t; + +/* all the samples associated with a lambda point */ +typedef struct lambda_data_t +{ + lambda_vec_t *lambda; /* the native lambda (at start time if dynamic) */ + double temp; /* temperature */ + + sample_coll_t *sc; /* the samples */ + + sample_coll_t sc_head; /*the pre-allocated list head for the linked list.*/ + + struct lambda_data_t *next, *prev; /* the next and prev in the list */ +} lambda_data_t; + +/* Top-level data structure of simulation data */ +typedef struct sim_data_t +{ + lambda_data_t *lb; /* a lambda data linked list */ + lambda_data_t lb_head; /* The head element of the linked list */ + + lambda_components_t lc; /* the allowed components of the lambda + vectors */ +} sim_data_t; + +/* Top-level data structure with calculated values. */ +typedef struct { + sample_coll_t *a, *b; /* the simulation data */ + + double dg; /* the free energy difference */ + double dg_err; /* the free energy difference */ + + double dg_disc_err; /* discretization error */ + double dg_histrange_err; /* histogram range error */ + + double sa; /* relative entropy of b in state a */ + double sa_err; /* error in sa */ + double sb; /* relative entropy of a in state b */ + double sb_err; /* error in sb */ + + double dg_stddev; /* expected dg stddev per sample */ + double dg_stddev_err; /* error in dg_stddev */ +} barres_t; + + +/* Initialize a lambda_components structure */ +static void lambda_components_init(lambda_components_t *lc) +{ + lc->N = 0; + lc->Nalloc = 2; + snew(lc->names, lc->Nalloc); +} + +/* Add a component to a lambda_components structure */ +static void lambda_components_add(lambda_components_t *lc, + const char *name, size_t name_length) +{ + while (lc->N + 1 > lc->Nalloc) + { + lc->Nalloc = (lc->Nalloc == 0) ? 2 : 2*lc->Nalloc; + srealloc( lc->names, lc->Nalloc ); + } + snew(lc->names[lc->N], name_length+1); + strncpy(lc->names[lc->N], name, name_length); + lc->N++; +} + +/* check whether a component with index 'index' matches the given name, or + is also NULL. Returns TRUE if this is the case. 
+ the string name does not need to end */ +static gmx_bool lambda_components_check(const lambda_components_t *lc, + int index, + const char *name, + size_t name_length) +{ + size_t len; + if (index >= lc->N) + { + return FALSE; + } + if (name == NULL && lc->names[index] == NULL) + { + return TRUE; + } + if ((name == NULL) != (lc->names[index] == NULL)) + { + return FALSE; + } + len = strlen(lc->names[index]); + if (len != name_length) + { + return FALSE; + } + if (strncmp(lc->names[index], name, name_length) == 0) + { + return TRUE; + } + return FALSE; +} + +/* Find the index of a given lambda component name, or -1 if not found */ +static int lambda_components_find(const lambda_components_t *lc, + const char *name, + size_t name_length) +{ + int i; + + for (i = 0; i < lc->N; i++) + { + if (strncmp(lc->names[i], name, name_length) == 0) + { + return i; + } + } + return -1; +} + + + +/* initialize a lambda vector */ +static void lambda_vec_init(lambda_vec_t *lv, const lambda_components_t *lc) +{ + snew(lv->val, lc->N); + lv->index = -1; + lv->dhdl = -1; + lv->lc = lc; +} + +static void lambda_vec_destroy(lambda_vec_t *lv) +{ + sfree(lv->val); +} + +static void lambda_vec_copy(lambda_vec_t *lv, const lambda_vec_t *orig) +{ + int i; + + lambda_vec_init(lv, orig->lc); + lv->dhdl = orig->dhdl; + lv->index = orig->index; + for (i = 0; i < lv->lc->N; i++) + { + lv->val[i] = orig->val[i]; + } +} + +/* write a lambda vec to a preallocated string */ +static void lambda_vec_print(const lambda_vec_t *lv, char *str, gmx_bool named) +{ + int i; + size_t np; + + str[0] = 0; /* reset the string */ + if (lv->dhdl < 0) + { + if (named) + { + str += sprintf(str, "delta H to "); + } + if (lv->lc->N > 1) + { + str += sprintf(str, "("); + } + for (i = 0; i < lv->lc->N; i++) + { + str += sprintf(str, "%g", lv->val[i]); + if (i < lv->lc->N-1) + { + str += sprintf(str, ", "); + } + } + if (lv->lc->N > 1) + { + str += sprintf(str, ")"); + } + } + else + { + /* this lambda vector describes a derivative */ + str += sprintf(str, "dH/dl"); + if (strlen(lv->lc->names[lv->dhdl]) > 0) + { + str += sprintf(str, " (%s)", lv->lc->names[lv->dhdl]); + } + } +} + +/* write a shortened version of the lambda vec to a preallocated string */ +static void lambda_vec_print_short(const lambda_vec_t *lv, char *str) +{ + int i; + size_t np; + + if (lv->index >= 0) + { + sprintf(str, "%6d", lv->index); + } + else + { + if (lv->dhdl < 0) + { + sprintf(str, "%6.3f", lv->val[0]); + } + else + { + sprintf(str, "dH/dl[%d]", lv->dhdl); + } + } +} + +/* write an intermediate version of two lambda vecs to a preallocated string */ +static void lambda_vec_print_intermediate(const lambda_vec_t *a, + const lambda_vec_t *b, char *str) +{ + int i; + size_t np; + + str[0] = 0; + if ( (a->index >= 0) && (b->index >= 0) ) + { + sprintf(str, "%6.3f", ((double)a->index+(double)b->index)/2.); + } + else + { + if ( (a->dhdl < 0) && (b->dhdl < 0) ) + { + sprintf(str, "%6.3f", (a->val[0]+b->val[0])/2.); + } + } +} + + + +/* calculate the difference in lambda vectors: c = a-b. 
+ c must be initialized already, and a and b must describe non-derivative + lambda points */ +static void lambda_vec_diff(const lambda_vec_t *a, const lambda_vec_t *b, + lambda_vec_t *c) +{ + int i; + + if ( (a->dhdl > 0) || (b->dhdl > 0) ) + { + gmx_fatal(FARGS, + "Trying to calculate the difference between derivatives instead of lambda points"); + } + if ((a->lc != b->lc) || (a->lc != c->lc) ) + { + gmx_fatal(FARGS, + "Trying to calculate the difference lambdas with differing basis set"); + } + for (i = 0; i < a->lc->N; i++) + { + c->val[i] = a->val[i] - b->val[i]; + } +} + +/* calculate and return the absolute difference in lambda vectors: c = |a-b|. + a and b must describe non-derivative lambda points */ +static double lambda_vec_abs_diff(const lambda_vec_t *a, const lambda_vec_t *b) +{ + int i; + double ret = 0.; + + if ( (a->dhdl > 0) || (b->dhdl > 0) ) + { + gmx_fatal(FARGS, + "Trying to calculate the difference between derivatives instead of lambda points"); + } + if (a->lc != b->lc) + { + gmx_fatal(FARGS, + "Trying to calculate the difference lambdas with differing basis set"); + } + for (i = 0; i < a->lc->N; i++) + { + double df = a->val[i] - b->val[i]; + ret += df*df; + } + return sqrt(ret); +} + + +/* check whether two lambda vectors are the same */ +static gmx_bool lambda_vec_same(const lambda_vec_t *a, const lambda_vec_t *b) +{ + int i; + + if (a->lc != b->lc) + { + return FALSE; + } + if (a->dhdl < 0) + { + for (i = 0; i < a->lc->N; i++) + { + if (!gmx_within_tol(a->val[i], b->val[i], 10*GMX_REAL_EPS)) + { + return FALSE; + } + } + return TRUE; + } + else + { + /* they're derivatives, so we check whether the indices match */ + return (a->dhdl == b->dhdl); + } +} + +/* Compare the sort order of two foreign lambda vectors + + returns 1 if a is 'bigger' than b, + returns 0 if they're the same, + returns -1 if a is 'smaller' than b.*/ +static gmx_bool lambda_vec_cmp_foreign(const lambda_vec_t *a, + const lambda_vec_t *b) +{ + int i; + double norm_a = 0, norm_b = 0; + gmx_bool different = FALSE; + + if (a->lc != b->lc) + { + gmx_fatal(FARGS, "Can't compare lambdas with differing basis sets"); + } + /* if either one has an index we sort based on that */ + if ((a->index >= 0) || (b->index >= 0)) + { + if (a->index == b->index) + { + return 0; + } + return (a->index > b->index) ? 1 : -1; + } + if (a->dhdl >= 0 || b->dhdl >= 0) + { + /* lambda vectors that are derivatives always sort higher than those + without derivatives */ + if ((a->dhdl >= 0) != (b->dhdl >= 0) ) + { + return (a->dhdl >= 0) ? 1 : -1; + } + return a->dhdl > b->dhdl; + } + + /* neither has an index, so we can only sort on the lambda components, + which is only valid if there is one component */ + for (i = 0; i < a->lc->N; i++) + { + if (!gmx_within_tol(a->val[i], b->val[i], 10*GMX_REAL_EPS)) + { + different = TRUE; + } + norm_a += a->val[i]*a->val[i]; + norm_b += b->val[i]*b->val[i]; + } + if (!different) + { + return 0; + } + return norm_a > norm_b; +} + +/* Compare the sort order of two native lambda vectors + + returns 1 if a is 'bigger' than b, + returns 0 if they're the same, + returns -1 if a is 'smaller' than b.*/ +static gmx_bool lambda_vec_cmp_native(const lambda_vec_t *a, + const lambda_vec_t *b) +{ + int i; + + if (a->lc != b->lc) + { + gmx_fatal(FARGS, "Can't compare lambdas with differing basis sets"); + } + /* if either one has an index we sort based on that */ + if ((a->index >= 0) || (b->index >= 0)) + { + if (a->index == b->index) + { + return 0; + } + return (a->index > b->index) ? 
1 : -1; + } + /* neither has an index, so we can only sort on the lambda components, + which is only valid if there is one component */ + if (a->lc->N > 1) + { + gmx_fatal(FARGS, + "Can't compare lambdas with no index and > 1 component"); + } + if (a->dhdl >= 0 || b->dhdl >= 0) + { + gmx_fatal(FARGS, + "Can't compare native lambdas that are derivatives"); + } + if (gmx_within_tol(a->val[0], b->val[0], 10*GMX_REAL_EPS)) + { + return 0; + } + return a->val[0] > b->val[0] ? 1 : -1; +} + + + + +static void hist_init(hist_t *h, int nhist, int *nbin) +{ + int i; + if (nhist > 2) + { + gmx_fatal(FARGS, "histogram with more than two sets of data!"); + } + for (i = 0; i < nhist; i++) + { + snew(h->bin[i], nbin[i]); + h->x0[i] = 0; + h->nbin[i] = nbin[i]; + h->start_time = h->delta_time = 0; + h->dx[i] = 0; + } + h->sum = 0; + h->nhist = nhist; +} + +static void hist_destroy(hist_t *h) +{ + sfree(h->bin); +} + + +static void xvg_init(xvg_t *ba) +{ + ba->filename = NULL; + ba->nset = 0; + ba->np_alloc = 0; + ba->np = NULL; + ba->y = NULL; +} + +static void samples_init(samples_t *s, lambda_vec_t *native_lambda, + lambda_vec_t *foreign_lambda, double temp, + gmx_bool derivative, const char *filename) +{ + s->native_lambda = native_lambda; + s->foreign_lambda = foreign_lambda; + s->temp = temp; + s->derivative = derivative; + + s->ndu = 0; + s->du = NULL; + s->t = NULL; + s->start_time = s->delta_time = 0; + s->hist = NULL; + s->du_alloc = NULL; + s->t_alloc = NULL; + s->hist_alloc = NULL; + s->ndu_alloc = 0; + s->nt_alloc = 0; + + s->ntot = 0; + s->filename = filename; +} + +static void sample_range_init(sample_range_t *r, samples_t *s) +{ + r->start = 0; + r->end = s->ndu; + r->use = TRUE; + r->s = NULL; +} + +static void sample_coll_init(sample_coll_t *sc, lambda_vec_t *native_lambda, + lambda_vec_t *foreign_lambda, double temp) +{ + sc->native_lambda = native_lambda; + sc->foreign_lambda = foreign_lambda; + sc->temp = temp; + + sc->nsamples = 0; + sc->s = NULL; + sc->r = NULL; + sc->nsamples_alloc = 0; + + sc->ntot = 0; + sc->next = sc->prev = NULL; +} + +static void sample_coll_destroy(sample_coll_t *sc) +{ + /* don't free the samples themselves */ + sfree(sc->r); + sfree(sc->s); +} + + +static void lambda_data_init(lambda_data_t *l, lambda_vec_t *native_lambda, + double temp) +{ + l->lambda = native_lambda; + l->temp = temp; + + l->next = NULL; + l->prev = NULL; + + l->sc = &(l->sc_head); + + sample_coll_init(l->sc, native_lambda, NULL, 0.); + l->sc->next = l->sc; + l->sc->prev = l->sc; +} + +static void barres_init(barres_t *br) +{ + br->dg = 0; + br->dg_err = 0; + br->sa = 0; + br->sa_err = 0; + br->sb = 0; + br->sb_err = 0; + br->dg_stddev = 0; + br->dg_stddev_err = 0; + + br->a = NULL; + br->b = NULL; +} + + +/* calculate the total number of samples in a sample collection */ +static void sample_coll_calc_ntot(sample_coll_t *sc) +{ + int i; + + sc->ntot = 0; + for (i = 0; i < sc->nsamples; i++) + { + if (sc->r[i].use) + { + if (sc->s[i]->hist) + { + sc->ntot += sc->s[i]->ntot; + } + else + { + sc->ntot += sc->r[i].end - sc->r[i].start; + } + } + } +} + + +/* find the barsamples_t associated with a lambda that corresponds to + a specific foreign lambda */ +static sample_coll_t *lambda_data_find_sample_coll(lambda_data_t *l, + lambda_vec_t *foreign_lambda) +{ + sample_coll_t *sc = l->sc->next; + + while (sc != l->sc) + { + if (lambda_vec_same(sc->foreign_lambda, foreign_lambda)) + { + return sc; + } + sc = sc->next; + } + + return NULL; +} + +/* insert li into an ordered list of lambda_colls */ 
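+/* Note: the lambda_data_t and sample_coll_t lists in this file are
+ * circular, doubly-linked lists with a pre-allocated sentinel head
+ * (lb_head, sc_head) whose next and prev point back to itself when the
+ * list is empty. The insertions below therefore need no NULL checks:
+ * a new element is spliced in before the first node that compares
+ * higher, or before the head if no such node exists. */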
+static void lambda_data_insert_sample_coll(lambda_data_t *l, sample_coll_t *sc) +{ + sample_coll_t *scn = l->sc->next; + while ( (scn != l->sc) ) + { + if (lambda_vec_cmp_foreign(scn->foreign_lambda, sc->foreign_lambda) > 0) + { + break; + } + scn = scn->next; + } + /* now insert it before the found scn */ + sc->next = scn; + sc->prev = scn->prev; + scn->prev->next = sc; + scn->prev = sc; +} + +/* insert li into an ordered list of lambdas */ +static void lambda_data_insert_lambda(lambda_data_t *head, lambda_data_t *li) +{ + lambda_data_t *lc = head->next; + while (lc != head) + { + if (lambda_vec_cmp_native(lc->lambda, li->lambda) > 0) + { + break; + } + lc = lc->next; + } + /* now insert ourselves before the found lc */ + li->next = lc; + li->prev = lc->prev; + lc->prev->next = li; + lc->prev = li; +} + +/* insert a sample and a sample_range into a sample_coll. The + samples are stored as a pointer, the range is copied. */ +static void sample_coll_insert_sample(sample_coll_t *sc, samples_t *s, + sample_range_t *r) +{ + /* first check if it belongs here */ + if (sc->temp != s->temp) + { + gmx_fatal(FARGS, "Temperatures in files %s and %s are not the same!", + s->filename, sc->next->s[0]->filename); + } + if (!lambda_vec_same(sc->native_lambda, s->native_lambda)) + { + gmx_fatal(FARGS, "Native lambda in files %s and %s are not the same (and they should be)!", + s->filename, sc->next->s[0]->filename); + } + if (!lambda_vec_same(sc->foreign_lambda, s->foreign_lambda)) + { + gmx_fatal(FARGS, "Foreign lambda in files %s and %s are not the same (and they should be)!", + s->filename, sc->next->s[0]->filename); + } + + /* check if there's room */ + if ( (sc->nsamples + 1) > sc->nsamples_alloc) + { + sc->nsamples_alloc = max(2*sc->nsamples_alloc, 2); + srenew(sc->s, sc->nsamples_alloc); + srenew(sc->r, sc->nsamples_alloc); + } + sc->s[sc->nsamples] = s; + sc->r[sc->nsamples] = *r; + sc->nsamples++; + + sample_coll_calc_ntot(sc); +} + +/* insert a sample into a lambda_list, creating the right sample_coll if + neccesary */ +static void lambda_data_list_insert_sample(lambda_data_t *head, samples_t *s) +{ + gmx_bool found = FALSE; + sample_coll_t *sc; + sample_range_t r; + + lambda_data_t *l = head->next; + + /* first search for the right lambda_data_t */ + while (l != head) + { + if (lambda_vec_same(l->lambda, s->native_lambda) ) + { + found = TRUE; + break; + } + l = l->next; + } + + if (!found) + { + snew(l, 1); /* allocate a new one */ + lambda_data_init(l, s->native_lambda, s->temp); /* initialize it */ + lambda_data_insert_lambda(head, l); /* add it to the list */ + } + + /* now look for a sample collection */ + sc = lambda_data_find_sample_coll(l, s->foreign_lambda); + if (!sc) + { + snew(sc, 1); /* allocate a new one */ + sample_coll_init(sc, s->native_lambda, s->foreign_lambda, s->temp); + lambda_data_insert_sample_coll(l, sc); + } + + /* now insert the samples into the sample coll */ + sample_range_init(&r, s); + sample_coll_insert_sample(sc, s, &r); +} + + +/* make a histogram out of a sample collection */ +static void sample_coll_make_hist(sample_coll_t *sc, int **bin, + int *nbin_alloc, int *nbin, + double *dx, double *xmin, int nbin_default) +{ + int i, j, k; + gmx_bool dx_set = FALSE; + gmx_bool xmin_set = FALSE; + + gmx_bool xmax_set = FALSE; + gmx_bool xmax_set_hard = FALSE; /* whether the xmax is bounded by the + limits of a histogram */ + double xmax = -1; + + /* first determine dx and xmin; try the histograms */ + for (i = 0; i < sc->nsamples; i++) + { + if (sc->s[i]->hist) + { + 
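+                /* Widen the combined range using each stored histogram:
+                 * keep the largest bin width as dx and the lowest start
+                 * point as xmin. A non-empty last bin means values were
+                 * accumulated into it from beyond the histogram range, so
+                 * that histogram's upper edge becomes a hard bound that
+                 * xmax must not extend beyond. */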
hist_t *hist = sc->s[i]->hist; + for (k = 0; k < hist->nhist; k++) + { + double hdx = hist->dx[k]; + double xmax_now = (hist->x0[k]+hist->nbin[k])*hdx; + + /* we use the biggest dx*/ + if ( (!dx_set) || hist->dx[0] > *dx) + { + dx_set = TRUE; + *dx = hist->dx[0]; + } + if ( (!xmin_set) || (hist->x0[k]*hdx) < *xmin) + { + xmin_set = TRUE; + *xmin = (hist->x0[k]*hdx); + } + + if ( (!xmax_set) || (xmax_now > xmax && !xmax_set_hard) ) + { + xmax_set = TRUE; + xmax = xmax_now; + if (hist->bin[k][hist->nbin[k]-1] != 0) + { + xmax_set_hard = TRUE; + } + } + if (hist->bin[k][hist->nbin[k]-1] != 0 && (xmax_now < xmax) ) + { + xmax_set_hard = TRUE; + xmax = xmax_now; + } + } + } + } + /* and the delta us */ + for (i = 0; i < sc->nsamples; i++) + { + if (sc->s[i]->ndu > 0) + { + /* determine min and max */ + int starti = sc->r[i].start; + int endi = sc->r[i].end; + double du_xmin = sc->s[i]->du[starti]; + double du_xmax = sc->s[i]->du[starti]; + for (j = starti+1; j < endi; j++) + { + if (sc->s[i]->du[j] < du_xmin) + { + du_xmin = sc->s[i]->du[j]; + } + if (sc->s[i]->du[j] > du_xmax) + { + du_xmax = sc->s[i]->du[j]; + } + } + + /* and now change the limits */ + if ( (!xmin_set) || (du_xmin < *xmin) ) + { + xmin_set = TRUE; + *xmin = du_xmin; + } + if ( (!xmax_set) || ((du_xmax > xmax) && !xmax_set_hard) ) + { + xmax_set = TRUE; + xmax = du_xmax; + } + } + } + + if (!xmax_set || !xmin_set) + { + *nbin = 0; + return; + } + + + if (!dx_set) + { + *nbin = nbin_default; + *dx = (xmax-(*xmin))/((*nbin)-2); /* -2 because we want the last bin to + be 0, and we count from 0 */ + } + else + { + *nbin = (xmax-(*xmin))/(*dx); + } + + if (*nbin > *nbin_alloc) + { + *nbin_alloc = *nbin; + srenew(*bin, *nbin_alloc); + } + + /* reset the histogram */ + for (i = 0; i < (*nbin); i++) + { + (*bin)[i] = 0; + } + + /* now add the actual data */ + for (i = 0; i < sc->nsamples; i++) + { + if (sc->s[i]->hist) + { + hist_t *hist = sc->s[i]->hist; + for (k = 0; k < hist->nhist; k++) + { + double hdx = hist->dx[k]; + double xmin_hist = hist->x0[k]*hdx; + for (j = 0; j < hist->nbin[k]; j++) + { + /* calculate the bin corresponding to the middle of the + original bin */ + double x = hdx*(j+0.5) + xmin_hist; + int binnr = (int)((x-(*xmin))/(*dx)); + + if (binnr >= *nbin || binnr < 0) + { + binnr = (*nbin)-1; + } + + (*bin)[binnr] += hist->bin[k][j]; + } + } + } + else + { + int starti = sc->r[i].start; + int endi = sc->r[i].end; + for (j = starti; j < endi; j++) + { + int binnr = (int)((sc->s[i]->du[j]-(*xmin))/(*dx)); + if (binnr >= *nbin || binnr < 0) + { + binnr = (*nbin)-1; + } + + (*bin)[binnr]++; + } + } + } +} + +/* write a collection of histograms to a file */ +void sim_data_histogram(sim_data_t *sd, const char *filename, + int nbin_default, const output_env_t oenv) +{ + char label_x[STRLEN]; + const char *dhdl = "dH/d\\lambda", *deltag = "\\DeltaH", *lambda = "\\lambda"; + const char *title = "N(\\DeltaH)"; + const char *label_y = "Samples"; + FILE *fp; + lambda_data_t *bl; + int nsets = 0; + char **setnames = NULL; + gmx_bool first_set = FALSE; + /* histogram data: */ + int *hist = NULL; + int nbin = 0; + int nbin_alloc = 0; + double dx = 0; + double min = 0; + int i; + lambda_data_t *bl_head = sd->lb; + + printf("\nWriting histogram to %s\n", filename); + sprintf(label_x, "\\DeltaH (%s)", unit_energy); + + fp = xvgropen_type(filename, title, label_x, label_y, exvggtXNY, oenv); + + /* first get all the set names */ + bl = bl_head->next; + /* iterate over all lambdas */ + while (bl != bl_head) + { + sample_coll_t *sc = 
bl->sc->next; + + /* iterate over all samples */ + while (sc != bl->sc) + { + char buf[STRLEN], buf2[STRLEN]; + + nsets++; + srenew(setnames, nsets); + snew(setnames[nsets-1], STRLEN); + if (sc->foreign_lambda->dhdl < 0) + { + lambda_vec_print(sc->native_lambda, buf, FALSE); + lambda_vec_print(sc->foreign_lambda, buf2, FALSE); + sprintf(setnames[nsets-1], "N(%s(%s=%s) | %s=%s)", + deltag, lambda, buf2, lambda, buf); + } + else + { + lambda_vec_print(sc->native_lambda, buf, FALSE); + sprintf(setnames[nsets-1], "N(%s | %s=%s)", + dhdl, lambda, buf); + } + sc = sc->next; + } + + bl = bl->next; + } + xvgr_legend(fp, nsets, (const char**)setnames, oenv); + + + /* now make the histograms */ + bl = bl_head->next; + /* iterate over all lambdas */ + while (bl != bl_head) + { + sample_coll_t *sc = bl->sc->next; + + /* iterate over all samples */ + while (sc != bl->sc) + { + if (!first_set) + { + xvgr_new_dataset(fp, 0, 0, NULL, oenv); + } + + sample_coll_make_hist(sc, &hist, &nbin_alloc, &nbin, &dx, &min, + nbin_default); + + for (i = 0; i < nbin; i++) + { + double xmin = i*dx + min; + double xmax = (i+1)*dx + min; + + fprintf(fp, "%g %d\n%g %d\n", xmin, hist[i], xmax, hist[i]); + } + + first_set = FALSE; + sc = sc->next; + } + + bl = bl->next; + } + + if (hist) + { + sfree(hist); + } + + xvgrclose(fp); +} + +/* create a collection (array) of barres_t object given a ordered linked list + of barlamda_t sample collections */ +static barres_t *barres_list_create(sim_data_t *sd, int *nres, + gmx_bool use_dhdl) +{ + lambda_data_t *bl; + int nlambda = 0; + barres_t *res; + int i; + gmx_bool dhdl = FALSE; + gmx_bool first = TRUE; + lambda_data_t *bl_head = sd->lb; + + /* first count the lambdas */ + bl = bl_head->next; + while (bl != bl_head) + { + nlambda++; + bl = bl->next; + } + snew(res, nlambda-1); + + /* next put the right samples in the res */ + *nres = 0; + bl = bl_head->next->next; /* we start with the second one. */ + while (bl != bl_head) + { + sample_coll_t *sc, *scprev; + barres_t *br = &(res[*nres]); + /* there is always a previous one. 
we search for that as a foreign + lambda: */ + scprev = lambda_data_find_sample_coll(bl->prev, bl->lambda); + sc = lambda_data_find_sample_coll(bl, bl->prev->lambda); + + barres_init(br); + + if (use_dhdl) + { + /* we use dhdl */ + + scprev = lambda_data_find_sample_coll(bl->prev, bl->prev->lambda); + sc = lambda_data_find_sample_coll(bl, bl->lambda); + + if (first) + { + printf("\nWARNING: Using the derivative data (dH/dlambda) to extrapolate delta H values.\nThis will only work if the Hamiltonian is linear in lambda.\n"); + dhdl = TRUE; + } + if (!dhdl) + { + gmx_fatal(FARGS, "Some dhdl files contain only one value (dH/dl), while others \ncontain multiple values (dH/dl and/or Delta H), will not proceed \nbecause of possible inconsistencies.\n"); + } + } + else if (!scprev && !sc) + { - gmx_fatal(FARGS, "There is no path from lambda=%g -> %g that is covered by foreign lambdas:\ncannot proceed with BAR.\nUse thermodynamic integration of dH/dl by calculating the averages of dH/dl\nwith g_analyze and integrating them.\nAlternatively, use the -extp option if (and only if) the Hamiltonian\ndepends linearly on lambda, which is NOT normally the case.\n", bl->prev->lambda, bl->lambda); ++ gmx_fatal(FARGS, "There is no path from lambda=%f -> %f that is covered by foreign lambdas:\ncannot proceed with BAR.\nUse thermodynamic integration of dH/dl by calculating the averages of dH/dl\nwith g_analyze and integrating them.\nAlternatively, use the -extp option if (and only if) the Hamiltonian\ndepends linearly on lambda, which is NOT normally the case.\n", bl->prev->lambda, bl->lambda); + } + + /* normal delta H */ + if (!scprev) + { - gmx_fatal(FARGS, "Could not find a set for foreign lambda = %g\nin the files for lambda = %g", bl->lambda, bl->prev->lambda); ++ gmx_fatal(FARGS, "Could not find a set for foreign lambda = %f\nin the files for lambda = %f", bl->lambda, bl->prev->lambda); + } + if (!sc) + { - gmx_fatal(FARGS, "Could not find a set for foreign lambda = %g\nin the files for lambda = %g", bl->prev->lambda, bl->lambda); ++ gmx_fatal(FARGS, "Could not find a set for foreign lambda = %f\nin the files for lambda = %f", bl->prev->lambda, bl->lambda); + } + br->a = scprev; + br->b = sc; + + first = FALSE; + (*nres)++; + bl = bl->next; + } + return res; +} + +/* estimate the maximum discretization error */ +static double barres_list_max_disc_err(barres_t *res, int nres) +{ + int i, j; + double disc_err = 0.; + double delta_lambda; + + for (i = 0; i < nres; i++) + { + barres_t *br = &(res[i]); + + delta_lambda = lambda_vec_abs_diff(br->b->native_lambda, + br->a->native_lambda); + + for (j = 0; j < br->a->nsamples; j++) + { + if (br->a->s[j]->hist) + { + double Wfac = 1.; + if (br->a->s[j]->derivative) + { + Wfac = delta_lambda; + } + + disc_err = max(disc_err, Wfac*br->a->s[j]->hist->dx[0]); + } + } + for (j = 0; j < br->b->nsamples; j++) + { + if (br->b->s[j]->hist) + { + double Wfac = 1.; + if (br->b->s[j]->derivative) + { + Wfac = delta_lambda; + } + disc_err = max(disc_err, Wfac*br->b->s[j]->hist->dx[0]); + } + } + } + return disc_err; +} + + +/* impose start and end times on a sample collection, updating sample_ranges */ +static void sample_coll_impose_times(sample_coll_t *sc, double begin_t, + double end_t) +{ + int i; + for (i = 0; i < sc->nsamples; i++) + { + samples_t *s = sc->s[i]; + sample_range_t *r = &(sc->r[i]); + if (s->hist) + { + double end_time = s->hist->delta_time*s->hist->sum + + s->hist->start_time; + if (s->hist->start_time < begin_t || end_time > end_t) + { + r->use = FALSE; + 
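+                /* a histogram covers a fixed time span and cannot be
+                 * trimmed to the interval, so it is dropped entirely */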
} + } + else + { + if (!s->t) + { + double end_time; + if (s->start_time < begin_t) + { + r->start = (int)((begin_t - s->start_time)/s->delta_time); + } + end_time = s->delta_time*s->ndu + s->start_time; + if (end_time > end_t) + { + r->end = (int)((end_t - s->start_time)/s->delta_time); + } + } + else + { + int j; + for (j = 0; j < s->ndu; j++) + { + if (s->t[j] < begin_t) + { + r->start = j; + } + + if (s->t[j] >= end_t) + { + r->end = j; + break; + } + } + } + if (r->start > r->end) + { + r->use = FALSE; + } + } + } + sample_coll_calc_ntot(sc); +} + +static void sim_data_impose_times(sim_data_t *sd, double begin, double end) +{ + double first_t, last_t; + double begin_t, end_t; + lambda_data_t *lc; + lambda_data_t *head = sd->lb; + int j; + + if (begin <= 0 && end < 0) + { + return; + } + + /* first determine the global start and end times */ + first_t = -1; + last_t = -1; + lc = head->next; + while (lc != head) + { + sample_coll_t *sc = lc->sc->next; + while (sc != lc->sc) + { + for (j = 0; j < sc->nsamples; j++) + { + double start_t, end_t; + + start_t = sc->s[j]->start_time; + end_t = sc->s[j]->start_time; + if (sc->s[j]->hist) + { + end_t += sc->s[j]->delta_time*sc->s[j]->hist->sum; + } + else + { + if (sc->s[j]->t) + { + end_t = sc->s[j]->t[sc->s[j]->ndu-1]; + } + else + { + end_t += sc->s[j]->delta_time*sc->s[j]->ndu; + } + } + + if (start_t < first_t || first_t < 0) + { + first_t = start_t; + } + if (end_t > last_t) + { + last_t = end_t; + } + } + sc = sc->next; + } + lc = lc->next; + } + + /* calculate the actual times */ + if (begin > 0) + { + begin_t = begin; + } + else + { + begin_t = first_t; + } + + if (end > 0) + { + end_t = end; + } + else + { + end_t = last_t; + } + printf("\n Samples in time interval: %.3f - %.3f\n", first_t, last_t); + + if (begin_t > end_t) + { + return; + } + printf("Removing samples outside of: %.3f - %.3f\n", begin_t, end_t); + + /* then impose them */ + lc = head->next; + while (lc != head) + { + sample_coll_t *sc = lc->sc->next; + while (sc != lc->sc) + { + sample_coll_impose_times(sc, begin_t, end_t); + sc = sc->next; + } + lc = lc->next; + } +} + + +/* create subsample i out of ni from an existing sample_coll */ +static gmx_bool sample_coll_create_subsample(sample_coll_t *sc, + sample_coll_t *sc_orig, + int i, int ni) +{ + int j; + int hist_start, hist_end; + + gmx_large_int_t ntot_start; + gmx_large_int_t ntot_end; + gmx_large_int_t ntot_so_far; + + *sc = *sc_orig; /* just copy all fields */ + + /* allocate proprietary memory */ + snew(sc->s, sc_orig->nsamples); + snew(sc->r, sc_orig->nsamples); + + /* copy the samples */ + for (j = 0; j < sc_orig->nsamples; j++) + { + sc->s[j] = sc_orig->s[j]; + sc->r[j] = sc_orig->r[j]; /* copy the ranges too */ + } + + /* now fix start and end fields */ + /* the casts avoid possible overflows */ + ntot_start = (gmx_large_int_t)(sc_orig->ntot*(double)i/(double)ni); + ntot_end = (gmx_large_int_t)(sc_orig->ntot*(double)(i+1)/(double)ni); + ntot_so_far = 0; + for (j = 0; j < sc->nsamples; j++) + { + gmx_large_int_t ntot_add; + gmx_large_int_t new_start, new_end; + + if (sc->r[j].use) + { + if (sc->s[j]->hist) + { + ntot_add = sc->s[j]->hist->sum; + } + else + { + ntot_add = sc->r[j].end - sc->r[j].start; + } + } + else + { + ntot_add = 0; + } + + if (!sc->s[j]->hist) + { + if (ntot_so_far < ntot_start) + { + /* adjust starting point */ + new_start = sc->r[j].start + (ntot_start - ntot_so_far); + } + else + { + new_start = sc->r[j].start; + } + /* adjust end point */ + new_end = sc->r[j].start + (ntot_end - 
ntot_so_far); + if (new_end > sc->r[j].end) + { + new_end = sc->r[j].end; + } + + /* check if we're in range at all */ + if ( (new_end < new_start) || (new_start > sc->r[j].end) ) + { + new_start = 0; + new_end = 0; + } + /* and write the new range */ + sc->r[j].start = (int)new_start; + sc->r[j].end = (int)new_end; + } + else + { + if (sc->r[j].use) + { + double overlap; + double ntot_start_norm, ntot_end_norm; + /* calculate the amount of overlap of the + desired range (ntot_start -- ntot_end) onto + the histogram range (ntot_so_far -- ntot_so_far+ntot_add)*/ + + /* first calculate normalized bounds + (where 0 is the start of the hist range, and 1 the end) */ + ntot_start_norm = (ntot_start-ntot_so_far)/(double)ntot_add; + ntot_end_norm = (ntot_end-ntot_so_far)/(double)ntot_add; + + /* now fix the boundaries */ + ntot_start_norm = min(1, max(0., ntot_start_norm)); + ntot_end_norm = max(0, min(1., ntot_end_norm)); + + /* and calculate the overlap */ + overlap = ntot_end_norm - ntot_start_norm; + + if (overlap > 0.95) /* we allow for 5% slack */ + { + sc->r[j].use = TRUE; + } + else if (overlap < 0.05) + { + sc->r[j].use = FALSE; + } + else + { + return FALSE; + } + } + } + ntot_so_far += ntot_add; + } + sample_coll_calc_ntot(sc); + + return TRUE; +} + +/* calculate minimum and maximum work values in sample collection */ +static void sample_coll_min_max(sample_coll_t *sc, double Wfac, + double *Wmin, double *Wmax) +{ + int i, j; + + *Wmin = FLT_MAX; + *Wmax = -FLT_MAX; + + for (i = 0; i < sc->nsamples; i++) + { + samples_t *s = sc->s[i]; + sample_range_t *r = &(sc->r[i]); + if (r->use) + { + if (!s->hist) + { + for (j = r->start; j < r->end; j++) + { + *Wmin = min(*Wmin, s->du[j]*Wfac); + *Wmax = max(*Wmax, s->du[j]*Wfac); + } + } + else + { + int hd = 0; /* determine the histogram direction: */ + double dx; + if ( (s->hist->nhist > 1) && (Wfac < 0) ) + { + hd = 1; + } + dx = s->hist->dx[hd]; + + for (j = s->hist->nbin[hd]-1; j >= 0; j--) + { + *Wmin = min(*Wmin, Wfac*(s->hist->x0[hd])*dx); + *Wmax = max(*Wmax, Wfac*(s->hist->x0[hd])*dx); + /* look for the highest value bin with values */ + if (s->hist->bin[hd][j] > 0) + { + *Wmin = min(*Wmin, Wfac*(j+s->hist->x0[hd]+1)*dx); + *Wmax = max(*Wmax, Wfac*(j+s->hist->x0[hd]+1)*dx); + break; + } + } + } + } + } +} + +/* Initialize a sim_data structure */ +static void sim_data_init(sim_data_t *sd) +{ + /* make linked list */ + sd->lb = &(sd->lb_head); + sd->lb->next = sd->lb; + sd->lb->prev = sd->lb; + + lambda_components_init(&(sd->lc)); +} + + +static double calc_bar_sum(int n, const double *W, double Wfac, double sbMmDG) +{ + int i; + double sum; + + sum = 0; + + for (i = 0; i < n; i++) + { + sum += 1./(1. 
+ exp(Wfac*W[i] + sbMmDG)); + } + + return sum; +} + +/* calculate the BAR average given a histogram + + if type== 0, calculate the best estimate for the average, + if type==-1, calculate the minimum possible value given the histogram + if type== 1, calculate the maximum possible value given the histogram */ +static double calc_bar_sum_hist(const hist_t *hist, double Wfac, double sbMmDG, + int type) +{ + double sum = 0.; + int i; + int max; + /* normalization factor multiplied with bin width and + number of samples (we normalize through M): */ + double normdx = 1.; + int hd = 0; /* determine the histogram direction: */ + double dx; + + if ( (hist->nhist > 1) && (Wfac < 0) ) + { + hd = 1; + } + dx = hist->dx[hd]; + max = hist->nbin[hd]-1; + if (type == 1) + { + max = hist->nbin[hd]; /* we also add whatever was out of range */ + } + + for (i = 0; i < max; i++) + { + double x = Wfac*((i+hist->x0[hd])+0.5)*dx; /* bin middle */ + double pxdx = hist->bin[0][i]*normdx; /* p(x)dx */ + + sum += pxdx/(1. + exp(x + sbMmDG)); + } + + return sum; +} + +static double calc_bar_lowlevel(sample_coll_t *ca, sample_coll_t *cb, + double temp, double tol, int type) +{ + double kT, beta, M; + double DG; + int i, j; + double Wfac1, Wfac2, Wmin, Wmax; + double DG0, DG1, DG2, dDG1; + double sum1, sum2; + double n1, n2; /* numbers of samples as doubles */ + + kT = BOLTZ*temp; + beta = 1/kT; + + /* count the numbers of samples */ + n1 = ca->ntot; + n2 = cb->ntot; + + M = log(n1/n2); + + /*if (!lambda_vec_same(ca->native_lambda, ca->foreign_lambda))*/ + if (ca->foreign_lambda->dhdl < 0) + { + /* this is the case when the delta U were calculated directly + (i.e. we're not scaling dhdl) */ + Wfac1 = beta; + Wfac2 = beta; + } + else + { + /* we're using dhdl, so delta_lambda needs to be a + multiplication factor. */ + /*double delta_lambda=cb->native_lambda-ca->native_lambda;*/ + double delta_lambda = lambda_vec_abs_diff(cb->native_lambda, + ca->native_lambda); + if (cb->native_lambda->lc->N > 1) + { + gmx_fatal(FARGS, + "Can't (yet) do multi-component dhdl interpolation"); + } + + Wfac1 = beta*delta_lambda; + Wfac2 = -beta*delta_lambda; + } + + if (beta < 1) + { + /* We print the output both in kT and kJ/mol. + * Here we determine DG in kT, so when beta < 1 + * the precision has to be increased. + */ + tol *= beta; + } + + /* Calculate minimum and maximum work to give an initial estimate of + * delta G as their average. + */ + { + double Wmin1, Wmin2, Wmax1, Wmax2; + sample_coll_min_max(ca, Wfac1, &Wmin1, &Wmax1); + sample_coll_min_max(cb, Wfac2, &Wmin2, &Wmax2); + + Wmin = min(Wmin1, Wmin2); + Wmax = max(Wmax1, Wmax2); + } + + DG0 = Wmin; + DG2 = Wmax; + + if (debug) + { + fprintf(debug, "DG %9.5f %9.5f\n", DG0, DG2); + } + /* We approximate by bisection: given our initial estimates + we keep checking whether the halfway point is greater or + smaller than what we get out of the BAR averages. + + For the comparison we can use twice the tolerance. 
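+       The difference of the two BAR sums below is monotonic in DG, so
+       the bisection converges to the unique solution.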
*/ + while (DG2 - DG0 > 2*tol) + { + DG1 = 0.5*(DG0 + DG2); + - /*printf("Wfac1=%g, Wfac2=%g, beta=%g, DG1=%g\n",Wfac1,Wfac2,beta, - DG1);*/ - + /* calculate the BAR averages */ + dDG1 = 0.; + + for (i = 0; i < ca->nsamples; i++) + { + samples_t *s = ca->s[i]; + sample_range_t *r = &(ca->r[i]); + if (r->use) + { + if (s->hist) + { + dDG1 += calc_bar_sum_hist(s->hist, Wfac1, (M-DG1), type); + } + else + { + dDG1 += calc_bar_sum(r->end - r->start, s->du + r->start, + Wfac1, (M-DG1)); + } + } + } + for (i = 0; i < cb->nsamples; i++) + { + samples_t *s = cb->s[i]; + sample_range_t *r = &(cb->r[i]); + if (r->use) + { + if (s->hist) + { + dDG1 -= calc_bar_sum_hist(s->hist, Wfac2, -(M-DG1), type); + } + else + { + dDG1 -= calc_bar_sum(r->end - r->start, s->du + r->start, + Wfac2, -(M-DG1)); + } + } + } + + if (dDG1 < 0) + { + DG0 = DG1; + } + else + { + DG2 = DG1; + } + if (debug) + { + fprintf(debug, "DG %9.5f %9.5f\n", DG0, DG2); + } + } + + return 0.5*(DG0 + DG2); +} + +static void calc_rel_entropy(sample_coll_t *ca, sample_coll_t *cb, + double temp, double dg, double *sa, double *sb) +{ + int i, j; + double W_ab = 0.; + double W_ba = 0.; + double kT, beta; + double Wfac1, Wfac2; + double n1, n2; + + kT = BOLTZ*temp; + beta = 1/kT; + + /* count the numbers of samples */ + n1 = ca->ntot; + n2 = cb->ntot; + + /* to ensure the work values are the same as during the delta_G */ + /*if (!lambda_vec_same(ca->native_lambda, ca->foreign_lambda))*/ + if (ca->foreign_lambda->dhdl < 0) + { + /* this is the case when the delta U were calculated directly + (i.e. we're not scaling dhdl) */ + Wfac1 = beta; + Wfac2 = beta; + } + else + { + /* we're using dhdl, so delta_lambda needs to be a + multiplication factor. */ + double delta_lambda = lambda_vec_abs_diff(cb->native_lambda, + ca->native_lambda); + Wfac1 = beta*delta_lambda; + Wfac2 = -beta*delta_lambda; + } + + /* first calculate the average work in both directions */ + for (i = 0; i < ca->nsamples; i++) + { + samples_t *s = ca->s[i]; + sample_range_t *r = &(ca->r[i]); + if (r->use) + { + if (!s->hist) + { + for (j = r->start; j < r->end; j++) + { + W_ab += Wfac1*s->du[j]; + } + } + else + { + /* normalization factor multiplied with bin width and + number of samples (we normalize through M): */ + double normdx = 1.; + double dx; + int hd = 0; /* histogram direction */ + if ( (s->hist->nhist > 1) && (Wfac1 < 0) ) + { + hd = 1; + } + dx = s->hist->dx[hd]; + + for (j = 0; j < s->hist->nbin[0]; j++) + { + double x = Wfac1*((j+s->hist->x0[0])+0.5)*dx; /*bin ctr*/ + double pxdx = s->hist->bin[0][j]*normdx; /* p(x)dx */ + W_ab += pxdx*x; + } + } + } + } + W_ab /= n1; + + for (i = 0; i < cb->nsamples; i++) + { + samples_t *s = cb->s[i]; + sample_range_t *r = &(cb->r[i]); + if (r->use) + { + if (!s->hist) + { + for (j = r->start; j < r->end; j++) + { + W_ba += Wfac1*s->du[j]; + } + } + else + { + /* normalization factor multiplied with bin width and + number of samples (we normalize through M): */ + double normdx = 1.; + double dx; + int hd = 0; /* histogram direction */ + if ( (s->hist->nhist > 1) && (Wfac2 < 0) ) + { + hd = 1; + } + dx = s->hist->dx[hd]; + + for (j = 0; j < s->hist->nbin[0]; j++) + { + double x = Wfac1*((j+s->hist->x0[0])+0.5)*dx; /*bin ctr*/ + double pxdx = s->hist->bin[0][j]*normdx; /* p(x)dx */ + W_ba += pxdx*x; + } + } + } + } + W_ba /= n2; + + /* then calculate the relative entropies */ + *sa = (W_ab - dg); + *sb = (W_ba + dg); +} + +static void calc_dg_stddev(sample_coll_t *ca, sample_coll_t *cb, + double temp, double dg, double *stddev) 
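+/* computes the expected standard deviation per sample of the free
+ * energy difference (cf. dg_stddev in barres_t): sigmafact accumulates
+ * the average over all samples, in both directions, of
+ * 1/(2 + 2*cosh(M +/- W - dg)), and the result then follows from
+ * Eq. 10 of the Shirts et al. reference cited at the end of this
+ * function. */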
+{ + int i, j; + double M; + double sigmafact = 0.; + double kT, beta; + double Wfac1, Wfac2; + double n1, n2; + + kT = BOLTZ*temp; + beta = 1/kT; + + /* count the numbers of samples */ + n1 = ca->ntot; + n2 = cb->ntot; + + /* to ensure the work values are the same as during the delta_G */ + /*if (!lambda_vec_same(ca->native_lambda, ca->foreign_lambda))*/ + if (ca->foreign_lambda->dhdl < 0) + { + /* this is the case when the delta U were calculated directly + (i.e. we're not scaling dhdl) */ + Wfac1 = beta; + Wfac2 = beta; + } + else + { + /* we're using dhdl, so delta_lambda needs to be a + multiplication factor. */ + double delta_lambda = lambda_vec_abs_diff(cb->native_lambda, + ca->native_lambda); + Wfac1 = beta*delta_lambda; + Wfac2 = -beta*delta_lambda; + } + + M = log(n1/n2); + + + /* calculate average in both directions */ + for (i = 0; i < ca->nsamples; i++) + { + samples_t *s = ca->s[i]; + sample_range_t *r = &(ca->r[i]); + if (r->use) + { + if (!s->hist) + { + for (j = r->start; j < r->end; j++) + { + sigmafact += 1./(2. + 2.*cosh((M + Wfac1*s->du[j] - dg))); + } + } + else + { + /* normalization factor multiplied with bin width and + number of samples (we normalize through M): */ + double normdx = 1.; + double dx; + int hd = 0; /* histogram direction */ + if ( (s->hist->nhist > 1) && (Wfac1 < 0) ) + { + hd = 1; + } + dx = s->hist->dx[hd]; + + for (j = 0; j < s->hist->nbin[0]; j++) + { + double x = Wfac1*((j+s->hist->x0[0])+0.5)*dx; /*bin ctr*/ + double pxdx = s->hist->bin[0][j]*normdx; /* p(x)dx */ + + sigmafact += pxdx/(2. + 2.*cosh((M + x - dg))); + } + } + } + } + for (i = 0; i < cb->nsamples; i++) + { + samples_t *s = cb->s[i]; + sample_range_t *r = &(cb->r[i]); + if (r->use) + { + if (!s->hist) + { + for (j = r->start; j < r->end; j++) + { + sigmafact += 1./(2. + 2.*cosh((M - Wfac2*s->du[j] - dg))); + } + } + else + { + /* normalization factor multiplied with bin width and + number of samples (we normalize through M): */ + double normdx = 1.; + double dx; + int hd = 0; /* histogram direction */ + if ( (s->hist->nhist > 1) && (Wfac2 < 0) ) + { + hd = 1; + } + dx = s->hist->dx[hd]; + + for (j = 0; j < s->hist->nbin[0]; j++) + { + double x = Wfac2*((j+s->hist->x0[0])+0.5)*dx; /*bin ctr*/ + double pxdx = s->hist->bin[0][j]*normdx; /* p(x)dx */ + + sigmafact += pxdx/(2. + 2.*cosh((M - x - dg))); + } + } + } + } + + sigmafact /= (n1 + n2); + + + /* Eq. 10 from + Shirts, Bair, Hooker & Pande, Phys. Rev. 
Lett 91, 140601 (2003): */ + *stddev = sqrt(((1./sigmafact) - ( (n1+n2)/n1 + (n1+n2)/n2 ))); +} + + + +static void calc_bar(barres_t *br, double tol, + int npee_min, int npee_max, gmx_bool *bEE, + double *partsum) +{ + int npee, p; + double dg_sig2, sa_sig2, sb_sig2, stddev_sig2; /* intermediate variance values + for calculated quantities */ + int nsample1, nsample2; + double temp = br->a->temp; + int i, j; + double dg_min, dg_max; + gmx_bool have_hist = FALSE; + + br->dg = calc_bar_lowlevel(br->a, br->b, temp, tol, 0); + + br->dg_disc_err = 0.; + br->dg_histrange_err = 0.; + + /* check if there are histograms */ + for (i = 0; i < br->a->nsamples; i++) + { + if (br->a->r[i].use && br->a->s[i]->hist) + { + have_hist = TRUE; + break; + } + } + if (!have_hist) + { + for (i = 0; i < br->b->nsamples; i++) + { + if (br->b->r[i].use && br->b->s[i]->hist) + { + have_hist = TRUE; + break; + } + } + } + + /* calculate histogram-specific errors */ + if (have_hist) + { + dg_min = calc_bar_lowlevel(br->a, br->b, temp, tol, -1); + dg_max = calc_bar_lowlevel(br->a, br->b, temp, tol, 1); + + if (fabs(dg_max - dg_min) > GMX_REAL_EPS*10) + { + /* the histogram range error is the biggest of the differences + between the best estimate and the extremes */ + br->dg_histrange_err = fabs(dg_max - dg_min); + } + br->dg_disc_err = 0.; + for (i = 0; i < br->a->nsamples; i++) + { + if (br->a->s[i]->hist) + { + br->dg_disc_err = max(br->dg_disc_err, br->a->s[i]->hist->dx[0]); + } + } + for (i = 0; i < br->b->nsamples; i++) + { + if (br->b->s[i]->hist) + { + br->dg_disc_err = max(br->dg_disc_err, br->b->s[i]->hist->dx[0]); + } + } + } + calc_rel_entropy(br->a, br->b, temp, br->dg, &(br->sa), &(br->sb)); + + calc_dg_stddev(br->a, br->b, temp, br->dg, &(br->dg_stddev) ); + + dg_sig2 = 0; + sa_sig2 = 0; + sb_sig2 = 0; + stddev_sig2 = 0; + + *bEE = TRUE; + { + sample_coll_t ca, cb; + + /* initialize the samples */ + sample_coll_init(&ca, br->a->native_lambda, br->a->foreign_lambda, + br->a->temp); + sample_coll_init(&cb, br->b->native_lambda, br->b->foreign_lambda, + br->b->temp); + + for (npee = npee_min; npee <= npee_max; npee++) + { + double dgs = 0; + double dgs2 = 0; + double dsa = 0; + double dsb = 0; + double dsa2 = 0; + double dsb2 = 0; + double dstddev = 0; + double dstddev2 = 0; + + + for (p = 0; p < npee; p++) + { + double dgp; + double stddevc; + double sac, sbc; + gmx_bool cac, cbc; + + cac = sample_coll_create_subsample(&ca, br->a, p, npee); + cbc = sample_coll_create_subsample(&cb, br->b, p, npee); + + if (!cac || !cbc) + { + printf("WARNING: histogram number incompatible with block number for averaging: can't do error estimate\n"); + *bEE = FALSE; + if (cac) + { + sample_coll_destroy(&ca); + } + if (cbc) + { + sample_coll_destroy(&cb); + } + return; + } + + dgp = calc_bar_lowlevel(&ca, &cb, temp, tol, 0); + dgs += dgp; + dgs2 += dgp*dgp; + + partsum[npee*(npee_max+1)+p] += dgp; + + calc_rel_entropy(&ca, &cb, temp, dgp, &sac, &sbc); + dsa += sac; + dsa2 += sac*sac; + dsb += sbc; + dsb2 += sbc*sbc; + calc_dg_stddev(&ca, &cb, temp, dgp, &stddevc ); + + dstddev += stddevc; + dstddev2 += stddevc*stddevc; + + sample_coll_destroy(&ca); + sample_coll_destroy(&cb); + } + dgs /= npee; + dgs2 /= npee; + dg_sig2 += (dgs2-dgs*dgs)/(npee-1); + + dsa /= npee; + dsa2 /= npee; + dsb /= npee; + dsb2 /= npee; + sa_sig2 += (dsa2-dsa*dsa)/(npee-1); + sb_sig2 += (dsb2-dsb*dsb)/(npee-1); + + dstddev /= npee; + dstddev2 /= npee; + stddev_sig2 += (dstddev2-dstddev*dstddev)/(npee-1); + } + br->dg_err = sqrt(dg_sig2/(npee_max - 
npee_min + 1)); + br->sa_err = sqrt(sa_sig2/(npee_max - npee_min + 1)); + br->sb_err = sqrt(sb_sig2/(npee_max - npee_min + 1)); + br->dg_stddev_err = sqrt(stddev_sig2/(npee_max - npee_min + 1)); + } +} + + +static double bar_err(int nbmin, int nbmax, const double *partsum) +{ + int nb, b; + double svar, s, s2, dg; + + svar = 0; + for (nb = nbmin; nb <= nbmax; nb++) + { + s = 0; + s2 = 0; + for (b = 0; b < nb; b++) + { + dg = partsum[nb*(nbmax+1)+b]; + s += dg; + s2 += dg*dg; + } + s /= nb; + s2 /= nb; + svar += (s2 - s*s)/(nb - 1); + } + + return sqrt(svar/(nbmax + 1 - nbmin)); +} + + +/* Seek the end of an identifier (consecutive non-spaces), followed by + an optional number of spaces or '='-signs. Returns a pointer to the + first non-space value found after that. Returns NULL if the string + ends before that. + */ +static const char *find_value(const char *str) +{ + gmx_bool name_end_found = FALSE; + + /* if the string is a NULL pointer, return a NULL pointer. */ + if (str == NULL) + { + return NULL; + } + while (*str != '\0') + { + /* first find the end of the name */ + if (!name_end_found) + { + if (isspace(*str) || (*str == '=') ) + { + name_end_found = TRUE; + } + } + else + { + if (!( isspace(*str) || (*str == '=') )) + { + return str; + } + } + str++; + } + return NULL; +} + + + +/* read a vector-notation description of a lambda vector */ +static gmx_bool read_lambda_compvec(const char *str, + lambda_vec_t *lv, + const lambda_components_t *lc_in, + lambda_components_t *lc_out, + const char **end, + const char *fn) +{ + gmx_bool initialize_lc = FALSE; /* whether to initialize the lambda + components, or to check them */ + gmx_bool start_reached = FALSE; /* whether the start of component names + has been reached */ + gmx_bool vector = FALSE; /* whether there are multiple components */ + int n = 0; /* current component number */ + const char *val_start = NULL; /* start of the component name, or NULL + if not in a value */ + char *strtod_end; + gmx_bool OK = TRUE; + + if (end) + { + *end = str; + } + + + if (lc_out && lc_out->N == 0) + { + initialize_lc = TRUE; + } + + if (lc_in == NULL) + { + lc_in = lc_out; + } + + while (1) + { + if (!start_reached) + { + if (isalnum(*str)) + { + vector = FALSE; + start_reached = TRUE; + val_start = str; + } + else if (*str == '(') + { + vector = TRUE; + start_reached = TRUE; + } + else if (!isspace(*str)) + { + gmx_fatal(FARGS, "Error in lambda components in %s", fn); + } + } + else + { + if (val_start) + { + if (isspace(*str) || *str == ')' || *str == ',' || *str == '\0') + { + /* end of value */ + if (lv == NULL) + { + if (initialize_lc) + { + lambda_components_add(lc_out, val_start, + (str-val_start)); + } + else + { + if (!lambda_components_check(lc_out, n, val_start, + (str-val_start))) + { + return FALSE; + } + } + } + else + { + /* add a vector component to lv */ + lv->val[n] = strtod(val_start, &strtod_end); + if (val_start == strtod_end) + { + gmx_fatal(FARGS, + "Error reading lambda vector in %s", fn); + } + } + /* reset for the next identifier */ + val_start = NULL; + n++; + if (!vector) + { + return OK; + } + } + } + else if (isalnum(*str)) + { + val_start = str; + } + if (*str == ')') + { + str++; + if (end) + { + *end = str; + } + if (!vector) + { + gmx_fatal(FARGS, "Error in lambda components in %s", fn); + } + else + { + if (n == lc_in->N) + { + return OK; + } + else if (lv == NULL) + { + return FALSE; + } + else + { + gmx_fatal(FARGS, "Incomplete lambda vector data in %s", + fn); + return FALSE; + } + + } + } + } + if (*str == 
'\0') + { + break; + } + str++; + if (end) + { + *end = str; + } + } + if (vector) + { + gmx_fatal(FARGS, "Incomplete lambda components data in %s", fn); + return FALSE; + } + return OK; +} + +/* read and check the component names from a string */ +static gmx_bool read_lambda_components(const char *str, + lambda_components_t *lc, + const char **end, + const char *fn) +{ + return read_lambda_compvec(str, NULL, NULL, lc, end, fn); +} + +/* read an initialized lambda vector from a string */ +static gmx_bool read_lambda_vector(const char *str, + lambda_vec_t *lv, + const char **end, + const char *fn) +{ + return read_lambda_compvec(str, lv, lv->lc, NULL, end, fn); +} + + + +/* deduce lambda value from legend. + fn = the file name + legend = the legend string + ba = the xvg data + lam = the initialized lambda vector + returns whether to use the data in this set. + */ +static gmx_bool legend2lambda(const char *fn, + const char *legend, + xvg_t *ba, + lambda_vec_t *lam) +{ + double lambda = 0; + const char *ptr = NULL, *ptr2 = NULL; + gmx_bool ok = FALSE; + gmx_bool bdhdl = FALSE; + const char *tostr = " to "; + + if (legend == NULL) + { + gmx_fatal(FARGS, "There is no legend in file '%s', can not deduce lambda", fn); + } + + /* look for the last 'to': */ + ptr2 = legend; + do + { + ptr2 = strstr(ptr2, tostr); + if (ptr2 != NULL) + { + ptr = ptr2; + ptr2++; + } + } + while (ptr2 != NULL && *ptr2 != '\0'); + + if (ptr) + { + ptr += strlen(tostr)-1; /* and advance past that 'to' */ + } + else + { + /* look for the = sign */ + ptr = strrchr(legend, '='); + if (!ptr) + { + /* otherwise look for the last space */ + ptr = strrchr(legend, ' '); + } + } + + if (strstr(legend, "dH")) + { + ok = TRUE; + bdhdl = TRUE; + } + else if (strchr(legend, 'D') != NULL && strchr(legend, 'H') != NULL) + { + ok = TRUE; + bdhdl = FALSE; + } + else /*if (strstr(legend, "pV"))*/ + { + return FALSE; + } + if (!ptr) + { + ok = FALSE; + } + + if (!ok) + { + gmx_fatal(FARGS, "There is no proper lambda legend in file '%s', can not deduce lambda", fn); + } + if (!bdhdl) + { + ptr = find_value(ptr); + if (!ptr || !read_lambda_vector(ptr, lam, NULL, fn)) + { + gmx_fatal(FARGS, "lambda vector '%s' %s faulty", legend, fn); + } + } + else + { + int dhdl_index; + const char *end; + char buf[STRLEN]; + + ptr = strrchr(legend, '='); + end = ptr; + if (ptr) + { + /* there must be a component name */ + ptr--; + if (ptr < legend) + { + gmx_fatal(FARGS, "dhdl legend '%s' %s faulty", legend, fn); + } + /* now backtrack to the start of the identifier */ + while (isspace(*ptr)) + { + end = ptr; + ptr--; + if (ptr < legend) + { + gmx_fatal(FARGS, "dhdl legend '%s' %s faulty", legend, fn); + } + } + while (!isspace(*ptr)) + { + ptr--; + if (ptr < legend) + { + gmx_fatal(FARGS, "dhdl legend '%s' %s faulty", legend, fn); + } + } + ptr++; + strncpy(buf, ptr, (end-ptr)); + buf[(end-ptr)] = '\0'; + dhdl_index = lambda_components_find(lam->lc, ptr, (end-ptr)); + if (dhdl_index < 0) + { + char buf[STRLEN]; + strncpy(buf, ptr, (end-ptr)); + buf[(end-ptr)] = '\0'; + gmx_fatal(FARGS, + "Did not find lambda component for '%s' in %s", + buf, fn); + } + } + else + { + if (lam->lc->N > 1) + { + gmx_fatal(FARGS, + "dhdl without component name with >1 lambda component in %s", + fn); + } + dhdl_index = 0; + } + lam->dhdl = dhdl_index; + } + return TRUE; +} + +static gmx_bool subtitle2lambda(const char *subtitle, xvg_t *ba, const char *fn, + lambda_components_t *lc) +{ + gmx_bool bFound; + const char *ptr; + char *end; + double native_lambda; + + bFound = 
FALSE; + + /* first check for a state string */ + ptr = strstr(subtitle, "state"); + if (ptr) + { + int index = -1; + const char *val_end; + + /* the new 4.6 style lambda vectors */ + ptr = find_value(ptr); + if (ptr) + { + index = strtol(ptr, &end, 10); + if (ptr == end) + { + gmx_fatal(FARGS, "Incomplete state data in %s", fn); + return FALSE; + } + ptr = end; + } + else + { + gmx_fatal(FARGS, "Incomplete state data in %s", fn); + return FALSE; + } + /* now find the lambda vector component names */ + while (*ptr != '(' && !isalnum(*ptr)) + { + ptr++; + if (*ptr == '\0') + { + gmx_fatal(FARGS, + "Incomplete lambda vector component data in %s", fn); + return FALSE; + } + } + val_end = ptr; + if (!read_lambda_components(ptr, lc, &val_end, fn)) + { + gmx_fatal(FARGS, + "lambda vector components in %s don't match those previously read", + fn); + } + ptr = find_value(val_end); + if (!ptr) + { + gmx_fatal(FARGS, "Incomplete state data in %s", fn); + return FALSE; + } + lambda_vec_init(&(ba->native_lambda), lc); + if (!read_lambda_vector(ptr, &(ba->native_lambda), NULL, fn)) + { + gmx_fatal(FARGS, "lambda vector in %s faulty", fn); + } + ba->native_lambda.index = index; + bFound = TRUE; + } + else + { + /* compatibility mode: check for lambda in other ways. */ + /* plain text lambda string */ + ptr = strstr(subtitle, "lambda"); + if (ptr == NULL) + { + /* xmgrace formatted lambda string */ + ptr = strstr(subtitle, "\\xl\\f{}"); + } + if (ptr == NULL) + { + /* xmgr formatted lambda string */ + ptr = strstr(subtitle, "\\8l\\4"); + } + if (ptr != NULL) + { + ptr = strstr(ptr, "="); + } + if (ptr != NULL) + { + bFound = (sscanf(ptr+1, "%lf", &(native_lambda)) == 1); + /* add the lambda component name as an empty string */ + if (lc->N > 0) + { + if (!lambda_components_check(lc, 0, "", 0)) + { + gmx_fatal(FARGS, + "lambda vector components in %s don't match those previously read", + fn); + } + } + else + { + lambda_components_add(lc, "", 0); + } + lambda_vec_init(&(ba->native_lambda), lc); + ba->native_lambda.val[0] = native_lambda; + } + } + + return bFound; +} + +static void filename2lambda(const char *fn, xvg_t *ba) +{ + double lambda; + const char *ptr, *digitptr; + char *endptr; + int dirsep; + ptr = fn; + /* go to the end of the path string and search backward to find the last + directory in the path which has to contain the value of lambda + */ + while (ptr[1] != '\0') + { + ptr++; + } + /* searching backward to find the second directory separator */ + dirsep = 0; + digitptr = NULL; + while (ptr >= fn) + { + if (ptr[0] != DIR_SEPARATOR && ptr[1] == DIR_SEPARATOR) + { + if (dirsep == 1) + { + break; + } + dirsep++; + } + /* save the last position of a digit between the last two + separators = in the last dirname */ + if (dirsep > 0 && isdigit(*ptr)) + { + digitptr = ptr; + } + ptr--; + } + if (!digitptr) + { + gmx_fatal(FARGS, "While trying to read the lambda value from the file path:" + " last directory in the path '%s' does not contain a number", fn); + } + if (digitptr[-1] == '-') + { + digitptr--; + } + lambda = strtod(digitptr, &endptr); + if (endptr == digitptr) + { + gmx_fatal(FARGS, "Malformed number in file path '%s'", fn); + } +} + +static void read_bar_xvg_lowlevel(const char *fn, real *temp, xvg_t *ba, + lambda_components_t *lc) +{ + int i; + char *subtitle, **legend, *ptr; + int np; + gmx_bool native_lambda_read = FALSE; + char buf[STRLEN]; + lambda_vec_t lv; + + xvg_init(ba); + + ba->filename = fn; + + np = read_xvg_legend(fn, &ba->y, &ba->nset, &subtitle, &legend); + if (!ba->y) + 
{ + gmx_fatal(FARGS, "File %s contains no usable data.", fn); + } + /* Reorder the data */ + ba->t = ba->y[0]; + for (i = 1; i < ba->nset; i++) + { + ba->y[i-1] = ba->y[i]; + } + ba->nset--; + + snew(ba->np, ba->nset); + for (i = 0; i < ba->nset; i++) + { + ba->np[i] = np; + } + + ba->temp = -1; + if (subtitle != NULL) + { + /* try to extract temperature */ + ptr = strstr(subtitle, "T ="); + if (ptr != NULL) + { + ptr += 3; + if (sscanf(ptr, "%lf", &ba->temp) == 1) + { + if (ba->temp <= 0) + { - gmx_fatal(FARGS, "Found temperature of %g in file '%s'", ++ gmx_fatal(FARGS, "Found temperature of %f in file '%s'", + ba->temp, fn); + } + } + } + } + if (ba->temp < 0) + { + if (*temp <= 0) + { + gmx_fatal(FARGS, "Did not find a temperature in the subtitle in file '%s', use the -temp option of [TT]g_bar[tt]", fn); + } + ba->temp = *temp; + } + + /* Try to deduce lambda from the subtitle */ + if (subtitle) + { + if (subtitle2lambda(subtitle, ba, fn, lc)) + { + native_lambda_read = TRUE; + } + } + snew(ba->lambda, ba->nset); + if (legend == NULL) + { + /* Check if we have a single set, no legend, nset=1 means t and dH/dl */ + if (ba->nset == 1) + { + if (!native_lambda_read) + { + /* Deduce lambda from the file name */ + filename2lambda(fn, ba); + native_lambda_read = TRUE; + } + ba->lambda[0] = ba->native_lambda; + } + else + { + gmx_fatal(FARGS, "File %s contains multiple sets but no legends, can not determine the lambda values", fn); + } + } + else + { + for (i = 0; i < ba->nset; ) + { + gmx_bool use = FALSE; + /* Read lambda from the legend */ + lambda_vec_init( &(ba->lambda[i]), lc ); + lambda_vec_copy( &(ba->lambda[i]), &(ba->native_lambda)); + use = legend2lambda(fn, legend[i], ba, &(ba->lambda[i])); + if (use) + { + lambda_vec_print(&(ba->lambda[i]), buf, FALSE); + i++; + } + else + { + int j; + printf("%s: Ignoring set '%s'.\n", fn, legend[i]); + for (j = i+1; j < ba->nset; j++) + { + ba->y[j-1] = ba->y[j]; + legend[j-1] = legend[j]; + } + ba->nset--; + } + } + } + + if (!native_lambda_read) + { + gmx_fatal(FARGS, "File %s contains multiple sets but no indication of the native lambda", fn); + } + + if (legend != NULL) + { + for (i = 0; i < ba->nset-1; i++) + { + sfree(legend[i]); + } + sfree(legend); + } +} + +static void read_bar_xvg(char *fn, real *temp, sim_data_t *sd) +{ + xvg_t *barsim; + samples_t *s; + int i; + double *lambda; + + snew(barsim, 1); + + read_bar_xvg_lowlevel(fn, temp, barsim, &(sd->lc)); + + if (barsim->nset < 1) + { + gmx_fatal(FARGS, "File '%s' contains fewer than two columns", fn); + } + + if (!gmx_within_tol(*temp, barsim->temp, GMX_FLOAT_EPS) && (*temp > 0) ) + { + gmx_fatal(FARGS, "Temperature in file %s different from earlier files or setting\n", fn); + } + *temp = barsim->temp; + + /* now create a series of samples_t */ + snew(s, barsim->nset); + for (i = 0; i < barsim->nset; i++) + { + samples_init(s+i, &(barsim->native_lambda), &(barsim->lambda[i]), + barsim->temp, lambda_vec_same(&(barsim->native_lambda), + &(barsim->lambda[i])), + fn); + s[i].du = barsim->y[i]; + s[i].ndu = barsim->np[i]; + s[i].t = barsim->t; + + lambda_data_list_insert_sample(sd->lb, s+i); + } + { + char buf[STRLEN]; + + lambda_vec_print(s[0].native_lambda, buf, FALSE); + printf("%s: %.1f - %.1f; lambda = %s\n dH/dl & foreign lambdas:\n", + fn, s[0].t[0], s[0].t[s[0].ndu-1], buf); + for (i = 0; i < barsim->nset; i++) + { + lambda_vec_print(s[i].foreign_lambda, buf, TRUE); + printf(" %s (%d pts)\n", buf, s[i].ndu); + } + } + printf("\n\n"); +} + +static void read_edr_rawdh_block(samples_t 
**smp, int *ndu, t_enxblock *blk, + double start_time, double delta_time, + lambda_vec_t *native_lambda, double temp, + double *last_t, const char *filename) +{ + int i, j; + gmx_bool allocated; + double old_foreign_lambda; + lambda_vec_t *foreign_lambda; + int type; + samples_t *s; /* convenience pointer */ + int startj; + + /* check the block types etc. */ + if ( (blk->nsub < 3) || + (blk->sub[0].type != xdr_datatype_int) || + (blk->sub[1].type != xdr_datatype_double) || + ( + (blk->sub[2].type != xdr_datatype_float) && + (blk->sub[2].type != xdr_datatype_double) + ) || + (blk->sub[0].nr < 1) || + (blk->sub[1].nr < 1) ) + { + gmx_fatal(FARGS, - "Unexpected/corrupted block data in file %s around time %g.", ++ "Unexpected/corrupted block data in file %s around time %f.", + filename, start_time); + } + + snew(foreign_lambda, 1); + lambda_vec_init(foreign_lambda, native_lambda->lc); + lambda_vec_copy(foreign_lambda, native_lambda); + type = blk->sub[0].ival[0]; + if (type == dhbtDH) + { + for (i = 0; i < native_lambda->lc->N; i++) + { + foreign_lambda->val[i] = blk->sub[1].dval[i]; + } + } + else + { + if (blk->sub[0].nr > 1) + { + foreign_lambda->dhdl = blk->sub[0].ival[1]; + } + else + { + foreign_lambda->dhdl = 0; + } + } + + if (!*smp) + { + /* initialize the samples structure if it's empty. */ + snew(*smp, 1); + samples_init(*smp, native_lambda, foreign_lambda, temp, + type == dhbtDHDL, filename); + (*smp)->start_time = start_time; + (*smp)->delta_time = delta_time; + } + + /* set convenience pointer */ + s = *smp; + + /* now double check */ + if (!lambda_vec_same(s->foreign_lambda, foreign_lambda) ) + { + char buf[STRLEN], buf2[STRLEN]; + lambda_vec_print(foreign_lambda, buf, FALSE); + lambda_vec_print(s->foreign_lambda, buf2, FALSE); + fprintf(stderr, "Got foreign lambda=%s, expected: %s\n", buf, buf2); - gmx_fatal(FARGS, "Corrupted data in file %s around t=%g.", ++ gmx_fatal(FARGS, "Corrupted data in file %s around t=%f.", + filename, start_time); + } + + /* make room for the data */ + if (s->ndu_alloc < (size_t)(s->ndu + blk->sub[2].nr) ) + { + s->ndu_alloc += (s->ndu_alloc < (size_t)blk->sub[2].nr) ? + blk->sub[2].nr*2 : s->ndu_alloc; + srenew(s->du_alloc, s->ndu_alloc); + s->du = s->du_alloc; + } + startj = s->ndu; + s->ndu += blk->sub[2].nr; + s->ntot += blk->sub[2].nr; + *ndu = blk->sub[2].nr; + + /* and copy the data*/ + for (j = 0; j < blk->sub[2].nr; j++) + { + if (blk->sub[2].type == xdr_datatype_float) + { + s->du[startj+j] = blk->sub[2].fval[j]; + } + else + { + s->du[startj+j] = blk->sub[2].dval[j]; + } + } + if (start_time + blk->sub[2].nr*delta_time > *last_t) + { + *last_t = start_time + blk->sub[2].nr*delta_time; + } +} + +static samples_t *read_edr_hist_block(int *nsamples, t_enxblock *blk, + double start_time, double delta_time, + lambda_vec_t *native_lambda, double temp, + double *last_t, const char *filename) +{ + int i, j; + samples_t *s; + int nhist; + double old_foreign_lambda; + lambda_vec_t *foreign_lambda; + int type; + int nbins[2]; + + /* check the block types etc. 
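+       As read below: sub[0] holds doubles (the foreign lambda value or
+       vector, and the bin spacing dx), sub[1] holds large ints (the
+       block type, the histogram start points x0 and, for dH/dl
+       histograms, the derivative coordinate index), and sub[2..] hold
+       the bin counts of up to two (forward and reverse) histograms.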
*/ + if ( (blk->nsub < 2) || + (blk->sub[0].type != xdr_datatype_double) || + (blk->sub[1].type != xdr_datatype_large_int) || + (blk->sub[0].nr < 2) || + (blk->sub[1].nr < 2) ) + { + gmx_fatal(FARGS, - "Unexpected/corrupted block data in file %s around time %g", ++ "Unexpected/corrupted block data in file %s around time %f", + filename, start_time); + } + + nhist = blk->nsub-2; + if (nhist == 0) + { + return NULL; + } + if (nhist > 2) + { + gmx_fatal(FARGS, - "Unexpected/corrupted block data in file %s around time %g", ++ "Unexpected/corrupted block data in file %s around time %f", + filename, start_time); + } + + snew(s, 1); + *nsamples = 1; + + snew(foreign_lambda, 1); + lambda_vec_init(foreign_lambda, native_lambda->lc); + lambda_vec_copy(foreign_lambda, native_lambda); + type = (int)(blk->sub[1].lval[1]); + if (type == dhbtDH) + { + double old_foreign_lambda; + + old_foreign_lambda = blk->sub[0].dval[0]; + if (old_foreign_lambda >= 0) + { + foreign_lambda->val[0] = old_foreign_lambda; + if (foreign_lambda->lc->N > 1) + { + gmx_fatal(FARGS, + "Single-component lambda in multi-component file %s", + filename); + } + } + else + { + for (i = 0; i < native_lambda->lc->N; i++) + { + foreign_lambda->val[i] = blk->sub[0].dval[i+2]; + } + } + } + else + { + if (foreign_lambda->lc->N > 1) + { + if (blk->sub[1].nr < 3 + nhist) + { + gmx_fatal(FARGS, + "Missing derivative coord in multi-component file %s", + filename); + } + foreign_lambda->dhdl = blk->sub[1].lval[2 + nhist]; + } + else + { + foreign_lambda->dhdl = 0; + } + } + + samples_init(s, native_lambda, foreign_lambda, temp, type == dhbtDHDL, + filename); + snew(s->hist, 1); + + for (i = 0; i < nhist; i++) + { + nbins[i] = blk->sub[i+2].nr; + } + + hist_init(s->hist, nhist, nbins); + + for (i = 0; i < nhist; i++) + { + s->hist->x0[i] = blk->sub[1].lval[2+i]; + s->hist->dx[i] = blk->sub[0].dval[1]; + if (i == 1) + { + s->hist->dx[i] = -s->hist->dx[i]; + } + } + + s->hist->start_time = start_time; + s->hist->delta_time = delta_time; + s->start_time = start_time; + s->delta_time = delta_time; + + for (i = 0; i < nhist; i++) + { + int nbin; + gmx_large_int_t sum = 0; + + for (j = 0; j < s->hist->nbin[i]; j++) + { + int binv = (int)(blk->sub[i+2].ival[j]); + + s->hist->bin[i][j] = binv; + sum += binv; + + } + if (i == 0) + { + s->ntot = sum; + s->hist->sum = sum; + } + else + { + if (s->ntot != sum) + { + gmx_fatal(FARGS, "Histogram counts don't match in %s", + filename); + } + } + } + + if (start_time + s->hist->sum*delta_time > *last_t) + { + *last_t = start_time + s->hist->sum*delta_time; + } + return s; +} + + +static void read_barsim_edr(char *fn, real *temp, sim_data_t *sd) +{ + int i, j; + ener_file_t fp; + t_enxframe *fr; + int nre; + gmx_enxnm_t *enm = NULL; + double first_t = -1; + double last_t = -1; + samples_t **samples_rawdh = NULL; /* contains samples for raw delta_h */ + int *nhists = NULL; /* array to keep count & print at end */ + int *npts = NULL; /* array to keep count & print at end */ + lambda_vec_t **lambdas = NULL; /* array to keep count & print at end */ + lambda_vec_t *native_lambda; + double end_time; /* the end time of the last batch of samples */ + int nsamples = 0; + lambda_vec_t start_lambda; + + fp = open_enx(fn, "r"); + do_enxnms(fp, &nre, &enm); + snew(fr, 1); + + snew(native_lambda, 1); + start_lambda.lc = NULL; + + while (do_enx(fp, fr)) + { + /* count the data blocks */ + int nblocks_raw = 0; + int nblocks_hist = 0; + int nlam = 0; + int k; + /* DHCOLL block information: */ + double start_time = 0, delta_time 
= 0, old_start_lambda = 0, delta_lambda = 0;
+        double rtemp = 0;
+
+        /* count the blocks and handle collection information: */
+        for (i = 0; i < fr->nblock; i++)
+        {
+            if (fr->block[i].id == enxDHHIST)
+            {
+                nblocks_hist++;
+            }
+            if (fr->block[i].id == enxDH)
+            {
+                nblocks_raw++;
+            }
+            if (fr->block[i].id == enxDHCOLL)
+            {
+                nlam++;
+                if ( (fr->block[i].nsub < 1) ||
+                     (fr->block[i].sub[0].type != xdr_datatype_double) ||
+                     (fr->block[i].sub[0].nr < 5))
+                {
+                    gmx_fatal(FARGS, "Unexpected block data in file %s", fn);
+                }
+
+                /* read the data from the DHCOLL block */
+                rtemp            = fr->block[i].sub[0].dval[0];
+                start_time       = fr->block[i].sub[0].dval[1];
+                delta_time       = fr->block[i].sub[0].dval[2];
+                old_start_lambda = fr->block[i].sub[0].dval[3];
+                delta_lambda     = fr->block[i].sub[0].dval[4];
+
+                if (delta_lambda > 0)
+                {
+                    gmx_fatal(FARGS, "Lambda values not constant in %s: can't apply BAR method", fn);
+                }
+                if ( ( *temp != rtemp) && (*temp > 0) )
+                {
+                    gmx_fatal(FARGS, "Temperature in file %s different from earlier files or setting\n", fn);
+                }
+                *temp = rtemp;
+
+                if (old_start_lambda >= 0)
+                {
+                    if (sd->lc.N > 0)
+                    {
+                        if (!lambda_components_check(&(sd->lc), 0, "", 0))
+                        {
+                            gmx_fatal(FARGS,
+                                      "lambda vector components in %s don't match those previously read",
+                                      fn);
+                        }
+                    }
+                    else
+                    {
+                        lambda_components_add(&(sd->lc), "", 0);
+                    }
+                    if (!start_lambda.lc)
+                    {
+                        lambda_vec_init(&start_lambda, &(sd->lc));
+                    }
+                    start_lambda.val[0] = old_start_lambda;
+                }
+                else
+                {
+                    /* read lambda vector */
+                    int      n_lambda_vec;
+                    gmx_bool check = (sd->lc.N > 0);
+                    if (fr->block[i].nsub < 2)
+                    {
+                        gmx_fatal(FARGS,
-                                  "No lambda vector, but start_lambda=%g\n",
++                                 "No lambda vector, but start_lambda=%f\n",
+                                  old_start_lambda);
+                    }
+                    n_lambda_vec = fr->block[i].sub[1].ival[1];
+                    for (j = 0; j < n_lambda_vec; j++)
+                    {
+                        const char *name =
+                            efpt_singular_names[fr->block[i].sub[1].ival[1+j]];
+                        if (check)
+                        {
+                            /* check the components */
+                            lambda_components_check(&(sd->lc), j, name,
+                                                    strlen(name));
+                        }
+                        else
+                        {
+                            lambda_components_add(&(sd->lc), name,
+                                                  strlen(name));
+                        }
+                    }
+                    lambda_vec_init(&start_lambda, &(sd->lc));
+                    start_lambda.index = fr->block[i].sub[1].ival[0];
+                    for (j = 0; j < n_lambda_vec; j++)
+                    {
+                        start_lambda.val[j] = fr->block[i].sub[0].dval[5+j];
+                    }
+                }
+                if (first_t < 0)
+                {
+                    first_t = start_time;
+                }
+            }
+        }
+
+        if (nlam != 1)
+        {
+            gmx_fatal(FARGS, "Did not find delta H information in file %s", fn);
+        }
+        if (nblocks_raw > 0 && nblocks_hist > 0)
+        {
+            gmx_fatal(FARGS, "Can't handle both raw delta U data and histograms in the same file %s", fn);
+        }
+
+        if (nsamples > 0)
+        {
+            /* check the native lambda */
+            if (!lambda_vec_same(&start_lambda, native_lambda) )
+            {
-                gmx_fatal(FARGS, "Native lambda not constant in file %s: started at %g, and becomes %g at time %g",
-                          fn, native_lambda, start_lambda, start_time);
++                char buf[STRLEN], buf2[STRLEN];
++
++                /* the lambdas are vectors, not doubles, so they must be
++                   formatted before printing */
++                lambda_vec_print(native_lambda, buf, FALSE);
++                lambda_vec_print(&start_lambda, buf2, FALSE);
++                gmx_fatal(FARGS, "Native lambda not constant in file %s: started at %s, and becomes %s at time %f",
++                          fn, buf, buf2, start_time);
+            }
+            /* check the number of samples against the previous number */
+            if ( ((nblocks_raw+nblocks_hist) != nsamples) || (nlam != 1 ) )
+            {
+                gmx_fatal(FARGS, "Unexpected block count in %s: was %d, now %d\n",
+                          fn, nsamples+1, nblocks_raw+nblocks_hist+nlam);
+            }
+            /* check whether the last iteration's end time matches the
+               current start time */
+            if ( (fabs(last_t - start_time) > 2*delta_time) && last_t >= 0)
+            {
+                /* it didn't. 
We need to store our samples and reallocate */ + for (i = 0; i < nsamples; i++) + { + if (samples_rawdh[i]) + { + /* insert it into the existing list */ + lambda_data_list_insert_sample(sd->lb, + samples_rawdh[i]); + /* and make sure we'll allocate a new one this time + around */ + samples_rawdh[i] = NULL; + } + } + } + } + else + { + /* this is the first round; allocate the associated data + structures */ + /*native_lambda=start_lambda;*/ + lambda_vec_init(native_lambda, &(sd->lc)); + lambda_vec_copy(native_lambda, &start_lambda); + nsamples = nblocks_raw+nblocks_hist; + snew(nhists, nsamples); + snew(npts, nsamples); + snew(lambdas, nsamples); + snew(samples_rawdh, nsamples); + for (i = 0; i < nsamples; i++) + { + nhists[i] = 0; + npts[i] = 0; + lambdas[i] = NULL; + samples_rawdh[i] = NULL; /* init to NULL so we know which + ones contain values */ + } + } + + /* and read them */ + k = 0; /* counter for the lambdas, etc. arrays */ + for (i = 0; i < fr->nblock; i++) + { + if (fr->block[i].id == enxDH) + { + int type = (fr->block[i].sub[0].ival[0]); + if (type == dhbtDH || type == dhbtDHDL) + { + int ndu; + read_edr_rawdh_block(&(samples_rawdh[k]), + &ndu, + &(fr->block[i]), + start_time, delta_time, + native_lambda, rtemp, + &last_t, fn); + npts[k] += ndu; + if (samples_rawdh[k]) + { + lambdas[k] = samples_rawdh[k]->foreign_lambda; + } + k++; + } + } + else if (fr->block[i].id == enxDHHIST) + { + int type = (int)(fr->block[i].sub[1].lval[1]); + if (type == dhbtDH || type == dhbtDHDL) + { + int j; + int nb = 0; + samples_t *s; /* this is where the data will go */ + s = read_edr_hist_block(&nb, &(fr->block[i]), + start_time, delta_time, + native_lambda, rtemp, + &last_t, fn); + nhists[k] += nb; + if (nb > 0) + { + lambdas[k] = s->foreign_lambda; + } + k++; + /* and insert the new sample immediately */ + for (j = 0; j < nb; j++) + { + lambda_data_list_insert_sample(sd->lb, s+j); + } + } + } + } + } + /* Now store all our extant sample collections */ + for (i = 0; i < nsamples; i++) + { + if (samples_rawdh[i]) + { + /* insert it into the existing list */ + lambda_data_list_insert_sample(sd->lb, samples_rawdh[i]); + } + } + + + { + char buf[STRLEN]; + printf("\n"); + lambda_vec_print(native_lambda, buf, FALSE); + printf("%s: %.1f - %.1f; lambda = %s\n foreign lambdas:\n", + fn, first_t, last_t, buf); + for (i = 0; i < nsamples; i++) + { + if (lambdas[i]) + { + lambda_vec_print(lambdas[i], buf, TRUE); + if (nhists[i] > 0) + { + printf(" %s (%d hists)\n", buf, nhists[i]); + } + else + { + printf(" %s (%d pts)\n", buf, npts[i]); + } + } + } + } + printf("\n\n"); + sfree(npts); + sfree(nhists); + sfree(lambdas); +} + + +int gmx_bar(int argc, char *argv[]) +{ + static const char *desc[] = { + "[TT]g_bar[tt] calculates free energy difference estimates through ", + "Bennett's acceptance ratio method (BAR). It also automatically", + "adds series of individual free energies obtained with BAR into", + "a combined free energy estimate.[PAR]", + + "Every individual BAR free energy difference relies on two ", + "simulations at different states: say state A and state B, as", + "controlled by a parameter, [GRK]lambda[grk] (see the [TT].mdp[tt] parameter", + "[TT]init_lambda[tt]). The BAR method calculates a ratio of weighted", + "average of the Hamiltonian difference of state B given state A and", + "vice versa.", + "The energy differences to the other state must be calculated", + "explicitly during the simulation. 
This can be done with", + "the [TT].mdp[tt] option [TT]foreign_lambda[tt].[PAR]", + + "Input option [TT]-f[tt] expects multiple [TT]dhdl.xvg[tt] files. ", + "Two types of input files are supported:[BR]", + "[TT]*[tt] Files with more than one [IT]y[it]-value. ", + "The files should have columns ", + "with dH/d[GRK]lambda[grk] and [GRK]Delta[grk][GRK]lambda[grk]. ", + "The [GRK]lambda[grk] values are inferred ", + "from the legends: [GRK]lambda[grk] of the simulation from the legend of ", + "dH/d[GRK]lambda[grk] and the foreign [GRK]lambda[grk] values from the ", + "legends of Delta H", + "[BR]", + "[TT]*[tt] Files with only one [IT]y[it]-value. Using the", + "[TT]-extp[tt] option for these files, it is assumed", + "that the [IT]y[it]-value is dH/d[GRK]lambda[grk] and that the ", + "Hamiltonian depends linearly on [GRK]lambda[grk]. ", + "The [GRK]lambda[grk] value of the simulation is inferred from the ", + "subtitle (if present), otherwise from a number in the subdirectory ", + "in the file name.[PAR]", + + "The [GRK]lambda[grk] of the simulation is parsed from ", + "[TT]dhdl.xvg[tt] file's legend containing the string 'dH', the ", + "foreign [GRK]lambda[grk] values from the legend containing the ", + "capitalized letters 'D' and 'H'. The temperature is parsed from ", + "the legend line containing 'T ='.[PAR]", + + "The input option [TT]-g[tt] expects multiple [TT].edr[tt] files. ", + "These can contain either lists of energy differences (see the ", + "[TT].mdp[tt] option [TT]separate_dhdl_file[tt]), or a series of ", + "histograms (see the [TT].mdp[tt] options [TT]dh_hist_size[tt] and ", + "[TT]dh_hist_spacing[tt]).", "The temperature and [GRK]lambda[grk] ", + "values are automatically deduced from the [TT]ener.edr[tt] file.[PAR]", + + "In addition to the [TT].mdp[tt] option [TT]foreign_lambda[tt], ", + "the energy difference can also be extrapolated from the ", + "dH/d[GRK]lambda[grk] values. This is done with the[TT]-extp[tt]", + "option, which assumes that the system's Hamiltonian depends linearly", + "on [GRK]lambda[grk], which is not normally the case.[PAR]", + + "The free energy estimates are determined using BAR with bisection, ", + "with the precision of the output set with [TT]-prec[tt]. ", + "An error estimate taking into account time correlations ", + "is made by splitting the data into blocks and determining ", + "the free energy differences over those blocks and assuming ", + "the blocks are independent. ", + "The final error estimate is determined from the average variance ", + "over 5 blocks. A range of block numbers for error estimation can ", + "be provided with the options [TT]-nbmin[tt] and [TT]-nbmax[tt].[PAR]", + + "[TT]g_bar[tt] tries to aggregate samples with the same 'native' and ", + "'foreign' [GRK]lambda[grk] values, but always assumes independent ", + "samples. [BB]Note[bb] that when aggregating energy ", + "differences/derivatives with different sampling intervals, this is ", + "almost certainly not correct. Usually subsequent energies are ", + "correlated and different time intervals mean different degrees ", + "of correlation between samples.[PAR]", + + "The results are split in two parts: the last part contains the final ", + "results in kJ/mol, together with the error estimate for each part ", + "and the total. The first part contains detailed free energy ", + "difference estimates and phase space overlap measures in units of ", + "kT (together with their computed error estimate). 
The printed ",
+        "values are:[BR]",
+        "[TT]*[tt] lam_A: the [GRK]lambda[grk] values for point A.[BR]",
+        "[TT]*[tt] lam_B: the [GRK]lambda[grk] values for point B.[BR]",
+        "[TT]*[tt] DG: the free energy estimate.[BR]",
+        "[TT]*[tt] s_A: an estimate of the relative entropy of B in A.[BR]",
-        "[TT]*[tt] s_A: an estimate of the relative entropy of A in B.[BR]",
++       "[TT]*[tt] s_B: an estimate of the relative entropy of A in B.[BR]",
+        "[TT]*[tt] stdev: an estimate of the expected per-sample standard deviation.[PAR]",
+
+        "The relative entropy of both states in each other's ensemble can be ",
+        "interpreted as a measure of phase space overlap: ",
+        "the relative entropy s_A of the work samples of lambda_B in the ",
+        "ensemble of lambda_A (and vice versa for s_B) is a ",
+        "measure of the 'distance' between Boltzmann distributions of ",
+        "the two states, that goes to zero for identical distributions. See ",
+        "Wu & Kofke, J. Chem. Phys. 123 084109 (2005) for more information.",
+        "[PAR]",
+        "The estimate of the expected per-sample standard deviation is given ",
+        "in Bennett's original BAR paper: Bennett, J. Comp. Phys. 22, p 245 (1976).",
+        "Eq. 10 therein gives an estimate of the quality of sampling (not directly",
+        "of the actual statistical error, because it assumes independent samples).[PAR]",
+
+        "To get a visual estimate of the phase space overlap, use the ",
+        "[TT]-oh[tt] option to write series of histograms, together with the ",
+        "[TT]-nbin[tt] option.[PAR]"
+    };
+    static real begin = 0, end = -1, temp = -1;
+    int nd = 2, nbmin = 5, nbmax = 5;
+    int nbin = 100;
+    gmx_bool use_dhdl = FALSE;
+    gmx_bool calc_s, calc_v;
+    t_pargs pa[] = {
+        { "-b", FALSE, etREAL, {&begin}, "Begin time for BAR" },
+        { "-e", FALSE, etREAL, {&end}, "End time for BAR" },
+        { "-temp", FALSE, etREAL, {&temp}, "Temperature (K)" },
+        { "-prec", FALSE, etINT, {&nd}, "The number of digits after the decimal point" },
+        { "-nbmin", FALSE, etINT, {&nbmin}, "Minimum number of blocks for error estimation" },
+        { "-nbmax", FALSE, etINT, {&nbmax}, "Maximum number of blocks for error estimation" },
+        { "-nbin", FALSE, etINT, {&nbin}, "Number of bins for histogram output"},
+        { "-extp", FALSE, etBOOL, {&use_dhdl}, "Whether to linearly extrapolate dH/dl values to use as energies"}
+    };
+
+    t_filenm fnm[] = {
+        { efXVG, "-f", "dhdl", ffOPTRDMULT },
+        { efEDR, "-g", "ener", ffOPTRDMULT },
+        { efXVG, "-o", "bar", ffOPTWR },
+        { efXVG, "-oi", "barint", ffOPTWR },
+        { efXVG, "-oh", "histogram", ffOPTWR }
+    };
+#define NFILE asize(fnm)
+
+    int f, i, j;
+    int nf = 0; /* file counter */
+    int nbs;
+    int nfile_tot; /* total number of input files */
+    int nxvgfile = 0;
+    int nedrfile = 0;
+    char **fxvgnms;
+    char **fedrnms;
+    sim_data_t sim_data; /* the simulation data */
+    barres_t *results; /* the results */
+    int nresults; /* number of results in results array */
+
+    double *partsum;
+    double prec, dg_tot, dg, sig, dg_tot_max, dg_tot_min;
+    FILE *fpb, *fpi;
+    char dgformat[20], xvg2format[STRLEN], xvg3format[STRLEN];
+    char buf[STRLEN], buf2[STRLEN];
+    char ktformat[STRLEN], sktformat[STRLEN];
+    char kteformat[STRLEN], skteformat[STRLEN];
+    output_env_t oenv;
+    double kT, beta;
+    gmx_bool result_OK = TRUE, bEE = TRUE;
+
+    gmx_bool disc_err = FALSE;
+    double sum_disc_err = 0.; /* discretization error */
+    gmx_bool histrange_err = FALSE;
+    double sum_histrange_err = 0.; /* histogram range error */
+    double stat_err = 0.; /* statistical error */
+
+    parse_common_args(&argc, argv,
+                      PCA_CAN_VIEW,
+                      NFILE, fnm, asize(pa), pa, asize(desc), desc, 0, NULL, &oenv);
+
+    if (opt2bSet("-f", NFILE, fnm))
+    {
+        nxvgfile = opt2fns(&fxvgnms, "-f", NFILE, fnm);
+    }
+    if (opt2bSet("-g", NFILE, fnm))
+    {
+        nedrfile = opt2fns(&fedrnms, "-g", NFILE, fnm);
+    }
+
+    sim_data_init(&sim_data);
+#if 0
+    /* make linked list */
+    lb = &lambda_head;
+    lambda_data_init(lb, 0, 0);
+    lb->next = lb;
+    lb->prev = lb;
+#endif
+
+
+    nfile_tot = nxvgfile + nedrfile;
+
+    if (nfile_tot == 0)
+    {
+        gmx_fatal(FARGS, "No input files!");
+    }
+
+    if (nd < 0)
+    {
+        gmx_fatal(FARGS, "Can not have negative number of digits");
+    }
+    prec = pow(10, -nd);
+
+    snew(partsum, (nbmax+1)*(nbmax+1));
+    nf = 0;
+
+    /* read in all files. First xvg files */
+    for (f = 0; f < nxvgfile; f++)
+    {
+        read_bar_xvg(fxvgnms[f], &temp, &sim_data);
+        nf++;
+    }
+    /* then .edr files */
+    for (f = 0; f < nedrfile; f++)
+    {
+        read_barsim_edr(fedrnms[f], &temp, &sim_data);
+        nf++;
+    }
+
+    /* fix the times to allow for equilibration */
+    sim_data_impose_times(&sim_data, begin, end);
+
+    if (opt2bSet("-oh", NFILE, fnm))
+    {
+        sim_data_histogram(&sim_data, opt2fn("-oh", NFILE, fnm), nbin, oenv);
+    }
+
+    /* assemble the output structures from the lambdas */
+    results = barres_list_create(&sim_data, &nresults, use_dhdl);
+
+    sum_disc_err = barres_list_max_disc_err(results, nresults);
+
+    if (nresults == 0)
+    {
+        printf("\nNo results to calculate.\n");
+        return 0;
+    }
+
+    if (sum_disc_err > prec)
+    {
+        prec = sum_disc_err;
+        nd = ceil(-log10(prec));
+        printf("WARNING: setting the precision to %g because that is the minimum\n reasonable number, given the expected discretization error.\n", prec);
+    }
+
+
+    /*sprintf(lamformat,"%%6.3f");*/
+    sprintf( dgformat, "%%%d.%df", 3+nd, nd);
+    /* the format strings of the results in kT */
+    sprintf( ktformat, "%%%d.%df", 5+nd, nd);
+    sprintf( sktformat, "%%%ds", 6+nd);
+    /* the format strings of the errors in kT */
+    sprintf( kteformat, "%%%d.%df", 3+nd, nd);
+    sprintf( skteformat, "%%%ds", 4+nd);
+    sprintf(xvg2format, "%s %s\n", "%s", dgformat);
+    sprintf(xvg3format, "%s %s %s\n", "%s", dgformat, dgformat);
+
+
+
+    fpb = NULL;
+    if (opt2bSet("-o", NFILE, fnm))
+    {
+        sprintf(buf, "%s (%s)", "\\DeltaG", "kT");
+        fpb = xvgropen_type(opt2fn("-o", NFILE, fnm), "Free energy differences",
+                            "\\lambda", buf, exvggtXYDY, oenv);
+    }
+
+    fpi = NULL;
+    if (opt2bSet("-oi", NFILE, fnm))
+    {
+        sprintf(buf, "%s (%s)", "\\DeltaG", "kT");
+        fpi = xvgropen(opt2fn("-oi", NFILE, fnm), "Free energy integral",
+                       "\\lambda", buf, oenv);
+    }
+
+
+
+    if (nbmin > nbmax)
+    {
+        nbmin = nbmax;
+    }
+
+    /* first calculate results */
+    bEE = TRUE;
+    disc_err = FALSE;
+    for (f = 0; f < nresults; f++)
+    {
+        /* Determine the free energy difference with a factor of 10
+         * more accuracy than requested for printing.
+         */
+        calc_bar(&(results[f]), 0.1*prec, nbmin, nbmax,
+                 &bEE, partsum);
+
+        if (results[f].dg_disc_err > prec/10.)
+        {
+            disc_err = TRUE;
+        }
+        if (results[f].dg_histrange_err > prec/10.)
+        {
+            histrange_err = TRUE;
+        }
+    }
+
+    /* print results in kT */
+    kT = BOLTZ*temp;
+    beta = 1/kT;
+
+    printf("\nTemperature: %g K\n", temp);
+
+    printf("\nDetailed results in kT (see help for explanation):\n\n");
+    printf("%6s ", " lam_A");
+    printf("%6s ", " lam_B");
+    printf(sktformat, "DG ");
+    if (bEE)
+    {
+        printf(skteformat, "+/- ");
+    }
+    if (disc_err)
+    {
+        printf(skteformat, "disc ");
+    }
+    if (histrange_err)
+    {
+        printf(skteformat, "range ");
+    }
+    printf(sktformat, "s_A ");
+    if (bEE)
+    {
+        printf(skteformat, "+/- " );
+    }
+    printf(sktformat, "s_B ");
+    if (bEE)
+    {
+        printf(skteformat, "+/- " );
+    }
+    printf(sktformat, "stdev ");
+    if (bEE)
+    {
+        printf(skteformat, "+/- ");
+    }
+    printf("\n");
+    for (f = 0; f < nresults; f++)
+    {
+        lambda_vec_print_short(results[f].a->native_lambda, buf);
+        printf("%s ", buf);
+        lambda_vec_print_short(results[f].b->native_lambda, buf);
+        printf("%s ", buf);
+        printf(ktformat, results[f].dg);
+        printf(" ");
+        if (bEE)
+        {
+            printf(kteformat, results[f].dg_err);
+            printf(" ");
+        }
+        if (disc_err)
+        {
+            printf(kteformat, results[f].dg_disc_err);
+            printf(" ");
+        }
+        if (histrange_err)
+        {
+            printf(kteformat, results[f].dg_histrange_err);
+            printf(" ");
+        }
+        printf(ktformat, results[f].sa);
+        printf(" ");
+        if (bEE)
+        {
+            printf(kteformat, results[f].sa_err);
+            printf(" ");
+        }
+        printf(ktformat, results[f].sb);
+        printf(" ");
+        if (bEE)
+        {
+            printf(kteformat, results[f].sb_err);
+            printf(" ");
+        }
+        printf(ktformat, results[f].dg_stddev);
+        printf(" ");
+        if (bEE)
+        {
+            printf(kteformat, results[f].dg_stddev_err);
+        }
+        printf("\n");
+
+        /* Check for negative relative entropy with a 95% certainty. */
+        if (results[f].sa < -2*results[f].sa_err ||
+            results[f].sb < -2*results[f].sb_err)
+        {
+            result_OK = FALSE;
+        }
+    }
+
+    if (!result_OK)
+    {
+        printf("\nWARNING: Some of these results violate the Second Law of "
+               "Thermodynamics: \n"
+               " This can be the result of severe undersampling, or "
+               "(more likely)\n"
+               " there is something wrong with the simulations.\n");
+    }
+
+
+    /* final results in kJ/mol */
+    printf("\n\nFinal results in kJ/mol:\n\n");
+    dg_tot = 0;
+    for (f = 0; f < nresults; f++)
+    {
+
+        if (fpi != NULL)
+        {
+            lambda_vec_print_short(results[f].a->native_lambda, buf);
+            fprintf(fpi, xvg2format, buf, dg_tot);
+        }
+
+
+        if (fpb != NULL)
+        {
+            lambda_vec_print_intermediate(results[f].a->native_lambda,
+                                          results[f].b->native_lambda,
+                                          buf);
+
+            fprintf(fpb, xvg3format, buf, results[f].dg, results[f].dg_err);
+        }
+
+        printf("point ");
+        lambda_vec_print_short(results[f].a->native_lambda, buf);
+        lambda_vec_print_short(results[f].b->native_lambda, buf2);
+        printf("%s - %s", buf, buf2);
+        printf(", DG ");
+
+        printf(dgformat, results[f].dg*kT);
+        if (bEE)
+        {
+            printf(" +/- ");
+            printf(dgformat, results[f].dg_err*kT);
+        }
+        if (histrange_err)
+        {
+            printf(" (max. range err. 
= "); + printf(dgformat, results[f].dg_histrange_err*kT); + printf(")"); + sum_histrange_err += results[f].dg_histrange_err*kT; + } + + printf("\n"); + dg_tot += results[f].dg; + } + printf("\n"); + printf("total "); + lambda_vec_print_short(results[0].a->native_lambda, buf); + lambda_vec_print_short(results[nresults-1].b->native_lambda, buf2); + printf("%s - %s", buf, buf2); + printf(", DG "); + + printf(dgformat, dg_tot*kT); + if (bEE) + { + stat_err = bar_err(nbmin, nbmax, partsum)*kT; + printf(" +/- "); + printf(dgformat, max(max(stat_err, sum_disc_err), sum_histrange_err)); + } + printf("\n"); + if (disc_err) + { + printf("\nmaximum discretization error = "); + printf(dgformat, sum_disc_err); + if (bEE && stat_err < sum_disc_err) + { + printf("WARNING: discretization error (%g) is larger than statistical error.\n Decrease histogram spacing for more accurate results\n", stat_err); + } + } + if (histrange_err) + { + printf("\nmaximum histogram range error = "); + printf(dgformat, sum_histrange_err); + if (bEE && stat_err < sum_histrange_err) + { + printf("WARNING: histogram range error (%g) is larger than statistical error.\n Increase histogram range for more accurate results\n", stat_err); + } + + } + printf("\n"); + + + if (fpi != NULL) + { + lambda_vec_print_short(results[nresults-1].b->native_lambda, buf); + fprintf(fpi, xvg2format, buf, dg_tot); + ffclose(fpi); + } + + do_view(oenv, opt2fn_null("-o", NFILE, fnm), "-xydy"); + do_view(oenv, opt2fn_null("-oi", NFILE, fnm), "-xydy"); + + thanx(stderr); + + return 0; +} diff --cc src/gromacs/gmxana/gmx_genion.c index 4a24ea2038,0000000000..2302ac8f1d mode 100644,000000..100644 --- a/src/gromacs/gmxana/gmx_genion.c +++ b/src/gromacs/gmxana/gmx_genion.c @@@ -1,661 -1,0 +1,559 @@@ +/* + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. 
+ * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Green Red Orange Magenta Azure Cyan Skyblue + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include "copyrite.h" +#include "string2.h" +#include "smalloc.h" +#include "sysstuff.h" +#include "confio.h" +#include "statutil.h" +#include "pbc.h" +#include "force.h" +#include "gmx_fatal.h" +#include "futil.h" +#include "maths.h" +#include "macros.h" - #include "physics.h" +#include "vec.h" +#include "tpxio.h" +#include "mdrun.h" - #include "calcpot.h" +#include "main.h" +#include "random.h" +#include "index.h" +#include "mtop_util.h" +#include "gmx_ana.h" + +static int greatest_common_divisor(int p, int q) +{ + int tmp; + while (q != 0) + { + tmp = q; + q = p % q; + p = tmp; + } + return p; +} + +static void insert_ion(int nsa, int *nwater, + gmx_bool bSet[], int repl[], atom_id index[], - real pot[], rvec x[], t_pbc *pbc, ++ rvec x[], t_pbc *pbc, + int sign, int q, const char *ionname, - t_mdatoms *mdatoms, - real rmin, gmx_bool bRandom, int *seed) ++ t_atoms *atoms, ++ real rmin, int *seed) +{ - int i, ii, ei, owater, wlast, m, nw; - real extr_e, poti, rmin2; - rvec xei, dx; - gmx_bool bSub = FALSE; ++ int i, ei,nw; ++ real rmin2; ++ rvec dx; + gmx_large_int_t maxrand; + + ei = -1; + nw = *nwater; + maxrand = nw; + maxrand *= 1000; - if (bRandom) ++ ++ do + { - do - { - ei = nw*rando(seed); - maxrand--; - } - while (bSet[ei] && (maxrand > 0)); - if (bSet[ei]) - { - gmx_fatal(FARGS, "No more replaceable solvent!"); - } ++ ei = nw*rando(seed); ++ maxrand--; + } - else ++ while (bSet[ei] && (maxrand > 0)); ++ if (bSet[ei]) + { - extr_e = 0; - for (i = 0; (i < nw); i++) - { - if (!bSet[i]) - { - ii = index[nsa*i]; - poti = pot[ii]; - if (q > 0) - { - if ((poti <= extr_e) || !bSub) - { - extr_e = poti; - ei = i; - bSub = TRUE; - } - } - else - { - if ((poti >= extr_e) || !bSub) - { - extr_e = poti; - ei = i; - bSub = TRUE; - } - } - } - } - if (ei == -1) - { - gmx_fatal(FARGS, "No more replaceable solvent!"); - } ++ gmx_fatal(FARGS, "No more replaceable solvent!"); + } ++ + fprintf(stderr, "Replacing solvent molecule %d (atom %d) with %s\n", + ei, index[nsa*ei], ionname); + + /* Replace solvent molecule charges with ion charge */ + bSet[ei] = TRUE; + repl[ei] = sign; - mdatoms->chargeA[index[nsa*ei]] = q; ++ ++ atoms->atom[index[nsa*ei]].q = q; + for (i = 1; i < nsa; i++) + { - mdatoms->chargeA[index[nsa*ei+i]] = 0; ++ atoms->atom[index[nsa*ei+i]].q = 0; + } + + /* Mark all solvent molecules within rmin as unavailable for substitution */ + if (rmin > 0) + { + rmin2 = rmin*rmin; + for (i = 0; (i < nw); i++) + { + if (!bSet[i]) + { + pbc_dx(pbc, x[index[nsa*ei]], x[index[nsa*i]], dx); + if (iprod(dx, dx) < rmin2) + { + bSet[i] = TRUE; + } + } + } + } +} + ++ +static char *aname(const char *mname) +{ + char *str; + int i; + + str = strdup(mname); + i = strlen(str)-1; + while (i > 1 && (isdigit(str[i]) || (str[i] == '+') || (str[i] == '-'))) + { + str[i] = '\0'; + i--; + } + + return str; +} + +void sort_ions(int nsa, int nw, int repl[], atom_id index[], + t_atoms *atoms, rvec x[], + const char *p_name, const char *n_name) +{ + int i, j, k, r, np, nn, starta, startr, npi, nni; + rvec *xt; + char **pptr = NULL, **nptr = NULL, **paptr = NULL, **naptr = NULL; + + snew(xt, atoms->nr); + + /* Put all the solvent in front and count the added ions */ + np = 0; + nn = 0; + j = index[0]; + for (i = 0; i < nw; i++) + { + r = repl[i]; + if (r == 0) + { + for (k = 0; k < nsa; k++) + { + copy_rvec(x[index[nsa*i+k]], 
xt[j++]);
+            }
+        }
+        else if (r > 0)
+        {
+            np++;
+        }
+        else if (r < 0)
+        {
+            nn++;
+        }
+    }
+
+    if (np+nn > 0)
+    {
+        /* Put the positive and negative ions at the end */
+        starta = index[nsa*(nw - np - nn)];
+        startr = atoms->atom[starta].resind;
+
+        if (np)
+        {
+            snew(pptr, 1);
+            pptr[0] = strdup(p_name);
+            snew(paptr, 1);
+            paptr[0] = aname(p_name);
+        }
+        if (nn)
+        {
+            snew(nptr, 1);
+            nptr[0] = strdup(n_name);
+            snew(naptr, 1);
+            naptr[0] = aname(n_name);
+        }
+        npi = 0;
+        nni = 0;
+        for (i = 0; i < nw; i++)
+        {
+            r = repl[i];
+            if (r > 0)
+            {
+                j = starta+npi;
+                k = startr+npi;
+                copy_rvec(x[index[nsa*i]], xt[j]);
+                atoms->atomname[j] = paptr;
+                atoms->atom[j].resind = k;
+                atoms->resinfo[k].name = pptr;
+                npi++;
+            }
+            else if (r < 0)
+            {
+                j = starta+np+nni;
+                k = startr+np+nni;
+                copy_rvec(x[index[nsa*i]], xt[j]);
+                atoms->atomname[j] = naptr;
+                atoms->atom[j].resind = k;
+                atoms->resinfo[k].name = nptr;
+                nni++;
+            }
+        }
+        for (i = index[nsa*nw-1]+1; i < atoms->nr; i++)
+        {
+            j = i-(nsa-1)*(np+nn);
+            atoms->atomname[j] = atoms->atomname[i];
+            atoms->atom[j] = atoms->atom[i];
+            copy_rvec(x[i], xt[j]);
+        }
+        atoms->nr -= (nsa-1)*(np+nn);
+
+        /* Copy the new positions back */
+        for (i = index[0]; i < atoms->nr; i++)
+        {
+            copy_rvec(xt[i], x[i]);
+        }
+        sfree(xt);
+    }
+}
+
+static void update_topol(const char *topinout, int p_num, int n_num,
+                         const char *p_name, const char *n_name, char *grpname)
+{
+#define TEMP_FILENM "temp.top"
+    FILE *fpin, *fpout;
+    char buf[STRLEN], buf2[STRLEN], *temp, **mol_line = NULL;
+    int line, i, nsol, nmol_line, sol_line, nsol_last;
+    gmx_bool bMolecules;
+
+    printf("\nProcessing topology\n");
+    fpin = ffopen(topinout, "r");
+    fpout = ffopen(TEMP_FILENM, "w");
+
+    line = 0;
+    bMolecules = FALSE;
+    nmol_line = 0;
+    sol_line = -1;
+    nsol_last = -1;
+    while (fgets(buf, STRLEN, fpin))
+    {
+        line++;
+        strcpy(buf2, buf);
+        if ((temp = strchr(buf2, '\n')) != NULL)
+        {
+            temp[0] = '\0';
+        }
+        ltrim(buf2);
+        if (buf2[0] == '[')
+        {
+            buf2[0] = ' ';
+            if ((temp = strchr(buf2, '\n')) != NULL)
+            {
+                temp[0] = '\0';
+            }
+            rtrim(buf2);
+            if (buf2[strlen(buf2)-1] == ']')
+            {
+                buf2[strlen(buf2)-1] = '\0';
+                ltrim(buf2);
+                rtrim(buf2);
+                bMolecules = (gmx_strcasecmp(buf2, "molecules") == 0);
+            }
+            fprintf(fpout, "%s", buf);
+        }
+        else if (!bMolecules)
+        {
+            fprintf(fpout, "%s", buf);
+        }
+        else
+        {
+            /* Check if this is a line with solvent molecules */
+            sscanf(buf, "%s", buf2);
+            if (gmx_strcasecmp(buf2, grpname) == 0)
+            {
+                sol_line = nmol_line;
+                sscanf(buf, "%*s %d", &nsol_last);
+            }
+            /* Store this molecules section line */
+            srenew(mol_line, nmol_line+1);
+            mol_line[nmol_line] = strdup(buf);
+            nmol_line++;
+        }
+    }
+    ffclose(fpin);
+
+    if (sol_line == -1)
+    {
+        ffclose(fpout);
+        gmx_fatal(FARGS, "No line with moleculetype '%s' found in the [ molecules ] section of file '%s'", grpname, topinout);
+    }
+    if (nsol_last < p_num+n_num)
+    {
+        ffclose(fpout);
+        gmx_fatal(FARGS, "The last entry for moleculetype '%s' in the [ molecules ] section of file '%s' has fewer solvent molecules (%d) than were replaced (%d)", grpname, topinout, nsol_last, p_num+n_num);
+    }
+
+    /* Print all the molecule entries */
+    for (i = 0; i < nmol_line; i++)
+    {
+        if (i != sol_line)
+        {
+            fprintf(fpout, "%s", mol_line[i]);
+        }
+        else
+        {
+            printf("Replacing %d solvent molecules in topology file (%s) "
+                   "by %d %s and %d %s ions.\n",
+                   p_num+n_num, topinout, p_num, p_name, n_num, n_name);
+            nsol_last -= p_num + n_num;
+            if (nsol_last > 0)
+            {
+                fprintf(fpout, "%-10s %d\n", grpname, nsol_last);
+            }
+            if (p_num > 0)
+            {
+                fprintf(fpout, "%-15s %d\n", p_name, p_num);
+            }
+            if (n_num > 0)
+            {
+                fprintf(fpout, "%-15s %d\n", n_name, n_num);
+            }
+        }
+    }
+    ffclose(fpout);
+    /* use ffopen to generate backup of topinout */
+    fpout = ffopen(topinout, "w");
+    ffclose(fpout);
+    rename(TEMP_FILENM, topinout);
+#undef TEMP_FILENM
+}
+
+int gmx_genion(int argc, char *argv[])
+{
+    const char *desc[] = {
-        "[TT]genion[tt] replaces solvent molecules by monoatomic ions at",
-        "the position of the first atoms with the most favorable electrostatic",
-        "potential or at random. The potential is calculated on all atoms, using",
-        "normal GROMACS particle-based methods (in contrast to other methods",
-        "based on solving the Poisson-Boltzmann equation).",
-        "The potential is recalculated after every ion insertion.",
-        "If specified in the run input file, a reaction field, shift function",
-        "or user function can be used. For the user function a table file",
-        "can be specified with the option [TT]-table[tt].",
++       "[TT]genion[tt] randomly replaces solvent molecules with monoatomic ions.",
+        "The group of solvent molecules should be continuous and all molecules",
+        "should have the same number of atoms.",
+        "The user should add the ion molecules to the topology file or use",
+        "the [TT]-p[tt] option to automatically modify the topology.[PAR]",
+        "The ion molecule type, residue and atom names in all force fields",
+        "are the capitalized element names without sign. This molecule name",
+        "should be given with [TT]-pname[tt] or [TT]-nname[tt], and the",
+        "[TT][molecules][tt] section of your topology updated accordingly,",
+        "either by hand or with [TT]-p[tt]. Do not use an atom name instead!",
+        "[PAR]Ions which can have multiple charge states get the multiplicity",
+        "added, without sign, for the uncommon states only.[PAR]",
-        "With the option [TT]-pot[tt] the potential can be written as B-factors",
-        "in a [TT].pdb[tt] file (for visualisation using e.g. Rasmol).",
-        "The unit of the potential is 1000 kJ/(mol e), the scaling be changed",
-        "with the [TT]-scale[tt] option.[PAR]",
+        "For larger ions, e.g. sulfate, we recommend using [TT]genbox[tt]."
+    };
+    const char *bugs[] = {
-        "Calculation of the potential is not reliable, therefore the [TT]-random[tt] option is now turned on by default.",
-        "If you specify a salt concentration existing ions are not taken into account. In effect you therefore specify the amount of salt to be added."
++       "If you specify a salt concentration, existing ions are not taken into "
++       "account. In effect you therefore specify the amount of salt to be added.",
+    };
+    static int p_num = 0, n_num = 0, p_q = 1, n_q = -1;
+    static const char *p_name = "NA", *n_name = "CL";
-    static real rmin = 0.6, scale = 0.001, conc = 0;
++   static real rmin = 0.6, conc = 0;
+    static int seed = 1993;
-    static gmx_bool bRandom = TRUE, bNeutral = FALSE;
++   static gmx_bool bNeutral = FALSE;
+    static t_pargs pa[] = {
+        { "-np", FALSE, etINT, {&p_num}, "Number of positive ions" },
+        { "-pname", FALSE, etSTR, {&p_name}, "Name of the positive ion" },
+        { "-pq", FALSE, etINT, {&p_q}, "Charge of the positive ion" },
+        { "-nn", FALSE, etINT, {&n_num}, "Number of negative ions" },
+        { "-nname", FALSE, etSTR, {&n_name}, "Name of the negative ion" },
+        { "-nq", FALSE, etINT, {&n_q}, "Charge of the negative ion" },
+        { "-rmin", FALSE, etREAL, {&rmin}, "Minimum distance between ions" },
-        { "-random", FALSE, etBOOL, {&bRandom}, "Use random placement of ions instead of based on potential. 
The rmin option should still work" }, + { "-seed", FALSE, etINT, {&seed}, "Seed for random number generator" }, - { "-scale", FALSE, etREAL, {&scale}, "Scaling factor for the potential for [TT]-pot[tt]" }, + { "-conc", FALSE, etREAL, {&conc}, + "Specify salt concentration (mol/liter). This will add sufficient ions to reach up to the specified concentration as computed from the volume of the cell in the input [TT].tpr[tt] file. Overrides the [TT]-np[tt] and [TT]-nn[tt] options." }, + { "-neutral", FALSE, etBOOL, {&bNeutral}, "This option will add enough ions to neutralize the system. These ions are added on top of those specified with [TT]-np[tt]/[TT]-nn[tt] or [TT]-conc[tt]. "} + }; - gmx_mtop_t *mtop; - gmx_localtop_t *top; - t_inputrec inputrec; - t_commrec *cr; - t_mdatoms *mdatoms; - gmx_enerdata_t enerd; - t_graph *graph; - t_forcerec *fr; ++ t_topology top; + rvec *x, *v; - real *pot, vol, qtot; ++ real vol, qtot; + matrix box; + t_atoms atoms; + t_pbc pbc; - int *repl; ++ int *repl, ePBC; + atom_id *index; - char *grpname; - gmx_bool *bSet, bPDB; ++ char *grpname, title[STRLEN]; ++ gmx_bool *bSet; + int i, nw, nwa, nsa, nsalt, iqtot; - FILE *fplog; + output_env_t oenv; + t_filenm fnm[] = { + { efTPX, NULL, NULL, ffREAD }, - { efXVG, "-table", "table", ffOPTRD }, + { efNDX, NULL, NULL, ffOPTRD }, + { efSTO, "-o", NULL, ffWRITE }, - { efLOG, "-g", "genion", ffWRITE }, - { efPDB, "-pot", "pot", ffOPTWR }, + { efTOP, "-p", "topol", ffOPTRW } + }; +#define NFILE asize(fnm) + + parse_common_args(&argc, argv, PCA_BE_NICE, NFILE, fnm, asize(pa), pa, + asize(desc), desc, asize(bugs), bugs, &oenv); - bPDB = ftp2bSet(efPDB, NFILE, fnm); - if (bRandom && bPDB) - { - fprintf(stderr, "Not computing potential with random option!\n"); - bPDB = FALSE; - } + + /* Check input for something sensible */ + if ((p_num < 0) || (n_num < 0)) + { + gmx_fatal(FARGS, "Negative number of ions to add?"); + } + - snew(mtop, 1); - snew(top, 1); - fplog = init_calcpot(ftp2fn(efLOG, NFILE, fnm), ftp2fn(efTPX, NFILE, fnm), - opt2fn("-table", NFILE, fnm), mtop, top, &inputrec, &cr, - &graph, &mdatoms, &fr, &enerd, &pot, box, &x, oenv); ++ if (conc > 0 && (p_num > 0 || n_num > 0)) ++ { ++ fprintf(stderr, "WARNING: -conc specified, overriding -nn and -np.\n"); ++ } + - atoms = gmx_mtop_global_atoms(mtop); ++ /* Read atom positions and charges */ ++ read_tps_conf(ftp2fn(efTPX, NFILE, fnm), title, &top, &ePBC, &x, &v, box, FALSE); ++ atoms = top.atoms; + ++ /* Compute total charge */ + qtot = 0; + for (i = 0; (i < atoms.nr); i++) + { + qtot += atoms.atom[i].q; + } + iqtot = gmx_nint(qtot); + + + if (conc > 0) + { + /* Compute number of ions to be added */ + vol = det(box); + nsalt = gmx_nint(conc*vol*AVOGADRO/1e24); + p_num = abs(nsalt*n_q); + n_num = abs(nsalt*p_q); + } + if (bNeutral) + { + int qdelta = p_num*p_q + n_num*n_q + iqtot; + + /* Check if the system is neutralizable + * is (qdelta == p_q*p_num + n_q*n_num) solvable for p_num and n_num? 
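+         * An integer solution exists exactly when qdelta is divisible by
+         * gcd(n_q, p_q) (Bezout's identity), which is what the check below
+         * tests; the while loops then add positive or negative ions until
+         * the residual charge reaches zero.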
*/ + int gcd = greatest_common_divisor(n_q, p_q); + if ((qdelta % gcd) != 0) + { + gmx_fatal(FARGS, "Can't neutralize this system using -nq %d and" + " -pq %d.\n", n_q, p_q); + } + + while (qdelta != 0) + { + while (qdelta < 0) + { + p_num++; + qdelta += p_q; + } + while (qdelta > 0) + { + n_num++; + qdelta += n_q; + } + } + } + + if ((p_num == 0) && (n_num == 0)) + { - if (!bPDB) - { - fprintf(stderr, "No ions to add and no potential to calculate.\n"); - exit(0); - } - nw = 0; - nsa = 0; /* to keep gcc happy */ ++ fprintf(stderr, "No ions to add.\n"); ++ exit(0); + } + else + { + printf("Will try to add %d %s ions and %d %s ions.\n", + p_num, p_name, n_num, n_name); + printf("Select a continuous group of solvent molecules\n"); + get_index(&atoms, ftp2fn_null(efNDX, NFILE, fnm), 1, &nwa, &index, &grpname); + for (i = 1; i < nwa; i++) + { + if (index[i] != index[i-1]+1) + { + gmx_fatal(FARGS, "The solvent group %s is not continuous: " + "index[%d]=%d, index[%d]=%d", + grpname, i, index[i-1]+1, i+1, index[i]+1); + } + } + nsa = 1; + while ((nsa < nwa) && + (atoms.atom[index[nsa]].resind == + atoms.atom[index[nsa-1]].resind)) + { + nsa++; + } + if (nwa % nsa) + { + gmx_fatal(FARGS, "Your solvent group size (%d) is not a multiple of %d", + nwa, nsa); + } + nw = nwa/nsa; + fprintf(stderr, "Number of (%d-atomic) solvent molecules: %d\n", nsa, nw); + if (p_num+n_num > nw) + { + gmx_fatal(FARGS, "Not enough solvent for adding ions"); + } + } + + if (opt2bSet("-p", NFILE, fnm)) + { + update_topol(opt2fn("-p", NFILE, fnm), p_num, n_num, p_name, n_name, grpname); + } + + snew(bSet, nw); + snew(repl, nw); + + snew(v, atoms.nr); + snew(atoms.pdbinfo, atoms.nr); + - set_pbc(&pbc, inputrec.ePBC, box); ++ set_pbc(&pbc, ePBC, box); + + /* Now loop over the ions that have to be placed */ - do ++ while (p_num-- > 0) + { - if (!bRandom) - { - calc_pot(fplog, cr, mtop, &inputrec, top, x, fr, &enerd, mdatoms, pot, box, graph); - if (bPDB || debug) - { - char buf[STRLEN]; - - if (debug) - { - sprintf(buf, "%d_%s", p_num+n_num, ftp2fn(efPDB, NFILE, fnm)); - } - else - { - strcpy(buf, ftp2fn(efPDB, NFILE, fnm)); - } - for (i = 0; (i < atoms.nr); i++) - { - atoms.pdbinfo[i].bfac = pot[i]*scale; - } - write_sto_conf(buf, "Potential calculated by genion", - &atoms, x, v, inputrec.ePBC, box); - bPDB = FALSE; - } - } - if ((p_num > 0) && (p_num >= n_num)) - { - insert_ion(nsa, &nw, bSet, repl, index, pot, x, &pbc, - 1, p_q, p_name, mdatoms, rmin, bRandom, &seed); - p_num--; - } - else if (n_num > 0) - { - insert_ion(nsa, &nw, bSet, repl, index, pot, x, &pbc, - -1, n_q, n_name, mdatoms, rmin, bRandom, &seed); - n_num--; - } ++ insert_ion(nsa, &nw, bSet, repl, index, x, &pbc, ++ 1, p_q, p_name, &atoms, rmin, &seed); ++ } ++ while (n_num-- > 0) ++ { ++ insert_ion(nsa, &nw, bSet, repl, index, x, &pbc, ++ -1, n_q, n_name, &atoms, rmin, &seed); + } - while (p_num+n_num > 0); + fprintf(stderr, "\n"); + + if (nw) + { + sort_ions(nsa, nw, repl, index, &atoms, x, p_name, n_name); + } + + sfree(atoms.pdbinfo); + atoms.pdbinfo = NULL; - write_sto_conf(ftp2fn(efSTO, NFILE, fnm), *mtop->name, &atoms, x, NULL, - inputrec.ePBC, box); ++ write_sto_conf(ftp2fn(efSTO, NFILE, fnm), *top.name, &atoms, x, NULL, ePBC, ++ box); + + thanx(stderr); + - gmx_log_close(fplog); - + return 0; +} diff --cc src/gromacs/gmxlib/copyrite.c index 7c64b5cdc9,0000000000..fef70c2bc4 mode 100644,000000..100644 --- a/src/gromacs/gmxlib/copyrite.c +++ b/src/gromacs/gmxlib/copyrite.c @@@ -1,702 -1,0 +1,708 @@@ +/* + * + * This source code is part of + * + * G R 
O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * GROningen Mixture of Alchemy and Childrens' Stories + */ +#include "copyrite.h" + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include + +#ifdef HAVE_LIBMKL +#include +#endif + +/* This file is completely threadsafe - keep it that way! */ + +#include "gromacs/legacyheaders/futil.h" +#include "gromacs/legacyheaders/macros.h" +#include "gromacs/legacyheaders/random.h" +#include "gromacs/legacyheaders/smalloc.h" +#include "gromacs/legacyheaders/statutil.h" +#include "gromacs/legacyheaders/strdb.h" +#include "gromacs/legacyheaders/string2.h" +#include "gromacs/legacyheaders/vec.h" + +#include "gromacs/fft/fft.h" + +#include "buildinfo.h" + +static gmx_bool be_cool(void) +{ + /* Yes, it is bad to check the environment variable every call, + * but we dont call this routine often, and it avoids using + * a mutex for locking the variable... 
+ */ +#ifdef GMX_COOL_QUOTES + return (getenv("GMX_NO_QUOTES") == NULL); +#else + /*be uncool*/ + return FALSE; +#endif +} + +static void space(FILE *out, int n) +{ + fprintf(out, "%*s", n, ""); +} + +static void sp_print(FILE *out, const char *s) +{ + int slen; + + slen = strlen(s); + space(out, (80-slen)/2); + fprintf(out, "%s\n", s); +} + +static void ster_print(FILE *out, const char *s) +{ + int slen; + char buf[128]; + + snprintf(buf, 128, ":-) %s (-:", s); + slen = strlen(buf); + space(out, (80-slen)/2); + fprintf(out, "%s\n", buf); +} + + +static void pukeit(const char *db, const char *defstring, char *retstring, + int retsize, int *cqnum) +{ + FILE *fp; + char **help; + int i, nhlp; + int seed; + + if (be_cool() && ((fp = low_libopen(db, FALSE)) != NULL)) + { + nhlp = fget_lines(fp, &help); + /* for libraries we can use the low-level close routines */ + ffclose(fp); + seed = time(NULL); + *cqnum = nhlp*rando(&seed); + if (strlen(help[*cqnum]) >= STRLEN) + { + help[*cqnum][STRLEN-1] = '\0'; + } + strncpy(retstring, help[*cqnum], retsize); + for (i = 0; (i < nhlp); i++) + { + sfree(help[i]); + } + sfree(help); + } + else + { + strncpy(retstring, defstring, retsize); + } +} + +void bromacs(char *retstring, int retsize) +{ + int dum; + + pukeit("bromacs.dat", + "Groningen Machine for Chemical Simulation", + retstring, retsize, &dum); +} + +void cool_quote(char *retstring, int retsize, int *cqnum) +{ + char *tmpstr; + char *s, *ptr; + int tmpcq, *p; + + if (cqnum != NULL) + { + p = cqnum; + } + else + { + p = &tmpcq; + } + + /* protect audience from explicit lyrics */ + snew(tmpstr, retsize+1); + pukeit("gurgle.dat", "Thanx for Using GROMACS - Have a Nice Day", + tmpstr, retsize-2, p); + + if ((ptr = strchr(tmpstr, '_')) != NULL) + { + *ptr = '\0'; + ptr++; + sprintf(retstring, "\"%s\" %s", tmpstr, ptr); + } + else + { + strcpy(retstring, tmpstr); + } + sfree(tmpstr); +} + +void CopyRight(FILE *out, const char *szProgram) +{ + static const char * CopyrightText[] = { + "Written by Emile Apol, Rossen Apostolov, Herman J.C. Berendsen,", + "Aldert van Buuren, Pär Bjelkmar, Rudi van Drunen, Anton Feenstra, ", + "Gerrit Groenhof, Peter Kasson, Per Larsson, Pieter Meulenhoff, ", + "Teemu Murtola, Szilard Pall, Sander Pronk, Roland Schulz, ", + "Michael Shirts, Alfons Sijbers, Peter Tieleman,\n", + "Berk Hess, David van der Spoel, and Erik Lindahl.\n", + "Copyright (c) 1991-2000, University of Groningen, The Netherlands.", + "Copyright (c) 2001-2010, The GROMACS development team at", + "Uppsala University & The Royal Institute of Technology, Sweden.", + "check out http://www.gromacs.org for more information.\n" + }; + + static const char * LicenseText[] = { + "This program is free software; you can redistribute it and/or", + "modify it under the terms of the GNU Lesser General Public License", + "as published by the Free Software Foundation; either version 2.1", + "of the License, or (at your option) any later version." + }; + + /* Dont change szProgram arbitrarily - it must be argv[0], i.e. the + * name of a file. Otherwise, we won't be able to find the library dir. + */ ++ +#define NCR (int)asize(CopyrightText) +/* TODO: Is this exception still needed? 
*/ +#ifdef GMX_FAHCORE +#define NLICENSE 0 /*FAH has an exception permission from LGPL to allow digital signatures in Gromacs*/ +#else +#define NLICENSE (int)asize(LicenseText) +#endif + + char buf[256], tmpstr[1024]; + int i; + +#ifdef GMX_FAHCORE + set_program_name("Gromacs"); +#else + set_program_name(szProgram); +#endif + + ster_print(out, "G R O M A C S"); + fprintf(out, "\n"); + + bromacs(tmpstr, 1023); + sp_print(out, tmpstr); + fprintf(out, "\n"); + + ster_print(out, GromacsVersion()); + fprintf(out, "\n"); + ++ if (getenv("GMX_NO_CREDITS")) ++ { ++ return; ++ } ++ + /* fprintf(out,"\n");*/ + + /* sp_print(out,"PLEASE NOTE: THIS IS A BETA VERSION\n"); + + fprintf(out,"\n"); */ + + for (i = 0; (i < NCR); i++) + { + sp_print(out, CopyrightText[i]); + } + for (i = 0; (i < NLICENSE); i++) + { + sp_print(out, LicenseText[i]); + } + + fprintf(out, "\n"); + + snprintf(buf, 256, "%s", Program()); +#ifdef GMX_DOUBLE + strcat(buf, " (double precision)"); +#endif + ster_print(out, buf); + fprintf(out, "\n"); +} + + +void thanx(FILE *fp) +{ + char cq[1024]; + int cqnum; + + /* protect the audience from suggestive discussions */ + cool_quote(cq, 1023, &cqnum); + + if (be_cool()) + { + fprintf(fp, "\ngcq#%d: %s\n\n", cqnum, cq); + } + else + { + fprintf(fp, "\n%s\n\n", cq); + } +} + +typedef struct { + const char *key; + const char *author; + const char *title; + const char *journal; + int volume, year; + const char *pages; +} t_citerec; + +void please_cite(FILE *fp, const char *key) +{ + static const t_citerec citedb[] = { + { "Allen1987a", + "M. P. Allen and D. J. Tildesley", + "Computer simulation of liquids", + "Oxford Science Publications", + 1, 1987, "1" }, + { "Berendsen95a", + "H. J. C. Berendsen, D. van der Spoel and R. van Drunen", + "GROMACS: A message-passing parallel molecular dynamics implementation", + "Comp. Phys. Comm.", + 91, 1995, "43-56" }, + { "Berendsen84a", + "H. J. C. Berendsen, J. P. M. Postma, A. DiNola and J. R. Haak", + "Molecular dynamics with coupling to an external bath", + "J. Chem. Phys.", + 81, 1984, "3684-3690" }, + { "Ryckaert77a", + "J. P. Ryckaert and G. Ciccotti and H. J. C. Berendsen", + "Numerical Integration of the Cartesian Equations of Motion of a System with Constraints; Molecular Dynamics of n-Alkanes", + "J. Comp. Phys.", + 23, 1977, "327-341" }, + { "Miyamoto92a", + "S. Miyamoto and P. A. Kollman", + "SETTLE: An Analytical Version of the SHAKE and RATTLE Algorithms for Rigid Water Models", + "J. Comp. Chem.", + 13, 1992, "952-962" }, + { "Cromer1968a", + "D. T. Cromer & J. B. Mann", + "X-ray scattering factors computed from numerical Hartree-Fock wave functions", + "Acta Cryst. A", + 24, 1968, "321" }, + { "Barth95a", + "E. Barth and K. Kuczera and B. Leimkuhler and R. D. Skeel", + "Algorithms for Constrained Molecular Dynamics", + "J. Comp. Chem.", + 16, 1995, "1192-1209" }, + { "Essmann95a", + "U. Essmann, L. Perera, M. L. Berkowitz, T. Darden, H. Lee and L. G. Pedersen ", + "A smooth particle mesh Ewald method", + "J. Chem. Phys.", + 103, 1995, "8577-8592" }, + { "Torda89a", + "A. E. Torda and R. M. Scheek and W. F. van Gunsteren", + "Time-dependent distance restraints in molecular dynamics simulations", + "Chem. Phys. Lett.", + 157, 1989, "289-294" }, + { "Tironi95a", + "I. G. Tironi and R. Sperb and P. E. Smith and W. F. van Gunsteren", + "Generalized reaction field method for molecular dynamics simulations", + "J. Chem. Phys", + 102, 1995, "5451-5459" }, + { "Hess97a", + "B. Hess and H. Bekker and H. J. C. Berendsen and J. G. E. M. 
Fraaije", + "LINCS: A Linear Constraint Solver for molecular simulations", + "J. Comp. Chem.", + 18, 1997, "1463-1472" }, + { "Hess2008a", + "B. Hess", + "P-LINCS: A Parallel Linear Constraint Solver for molecular simulation", + "J. Chem. Theory Comput.", + 4, 2008, "116-122" }, + { "Hess2008b", + "B. Hess and C. Kutzner and D. van der Spoel and E. Lindahl", + "GROMACS 4: Algorithms for highly efficient, load-balanced, and scalable molecular simulation", + "J. Chem. Theory Comput.", + 4, 2008, "435-447" }, + { "Hub2010", + "J. S. Hub, B. L. de Groot and D. van der Spoel", + "g_wham - A free weighted histogram analysis implementation including robust error and autocorrelation estimates", + "J. Chem. Theory Comput.", + 6, 2010, "3713-3720"}, + { "In-Chul99a", + "Y. In-Chul and M. L. Berkowitz", + "Ewald summation for systems with slab geometry", + "J. Chem. Phys.", + 111, 1999, "3155-3162" }, + { "DeGroot97a", + "B. L. de Groot and D. M. F. van Aalten and R. M. Scheek and A. Amadei and G. Vriend and H. J. C. Berendsen", + "Prediction of Protein Conformational Freedom From Distance Constrains", + "Proteins", + 29, 1997, "240-251" }, + { "Spoel98a", + "D. van der Spoel and P. J. van Maaren and H. J. C. Berendsen", + "A systematic study of water models for molecular simulation. Derivation of models optimized for use with a reaction-field.", + "J. Chem. Phys.", + 108, 1998, "10220-10230" }, + { "Wishart98a", + "D. S. Wishart and A. M. Nip", + "Protein Chemical Shift Analysis: A Practical Guide", + "Biochem. Cell Biol.", + 76, 1998, "153-163" }, + { "Maiorov95", + "V. N. Maiorov and G. M. Crippen", + "Size-Independent Comparison of Protein Three-Dimensional Structures", + "PROTEINS: Struct. Funct. Gen.", + 22, 1995, "273-283" }, + { "Feenstra99", + "K. A. Feenstra and B. Hess and H. J. C. Berendsen", + "Improving Efficiency of Large Time-scale Molecular Dynamics Simulations of Hydrogen-rich Systems", + "J. Comput. Chem.", + 20, 1999, "786-798" }, + { "Timneanu2004a", + "N. Timneanu and C. Caleman and J. Hajdu and D. van der Spoel", + "Auger Electron Cascades in Water and Ice", + "Chem. Phys.", + 299, 2004, "277-283" }, + { "Pascal2011a", + "T. A. Pascal and S. T. Lin and W. A. Goddard III", + "Thermodynamics of liquids: standard molar entropies and heat capacities of common solvents from 2PT molecular dynamics", + "Phys. Chem. Chem. Phys.", + 13, 2011, "169-181" }, + { "Caleman2011b", + "C. Caleman and P. J. van Maaren and M. Hong and J. S. Hub and L. T. da Costa and D. van der Spoel", + "Force Field Benchmark of Organic Liquids: Density, Enthalpy of Vaporization, Heat Capacities, Surface Tension, Isothermal Compressibility, Volumetric Expansion Coefficient, and Dielectric Constant", + "J. Chem. Theo. Comp.", + 8, 2012, "61" }, + { "Lindahl2001a", + "E. Lindahl and B. Hess and D. van der Spoel", + "GROMACS 3.0: A package for molecular simulation and trajectory analysis", + "J. Mol. Mod.", + 7, 2001, "306-317" }, + { "Wang2001a", + "J. Wang and W. Wang and S. Huo and M. Lee and P. A. Kollman", + "Solvation model based on weighted solvent accessible surface area", + "J. Phys. Chem. B", + 105, 2001, "5055-5067" }, + { "Eisenberg86a", + "D. Eisenberg and A. D. McLachlan", + "Solvation energy in protein folding and binding", + "Nature", + 319, 1986, "199-203" }, + { "Bondi1964a", + "A. Bondi", + "van der Waals Volumes and Radii", + "J. Phys. 
Chem.", + 68, 1964, "441-451" }, + { "Eisenhaber95", + "Frank Eisenhaber and Philip Lijnzaad and Patrick Argos and Chris Sander and Michael Scharf", + "The Double Cube Lattice Method: Efficient Approaches to Numerical Integration of Surface Area and Volume and to Dot Surface Contouring of Molecular Assemblies", + "J. Comp. Chem.", + 16, 1995, "273-284" }, + { "Hess2002", + "B. Hess, H. Saint-Martin and H.J.C. Berendsen", + "Flexible constraints: an adiabatic treatment of quantum degrees of freedom, with application to the flexible and polarizable MCDHO model for water", + "J. Chem. Phys.", + 116, 2002, "9602-9610" }, + { "Hetenyi2002b", + "Csaba Hetenyi and David van der Spoel", + "Efficient docking of peptides to proteins without prior knowledge of the binding site.", + "Prot. Sci.", + 11, 2002, "1729-1737" }, + { "Hess2003", + "B. Hess and R.M. Scheek", + "Orientation restraints in molecular dynamics simulations using time and ensemble averaging", + "J. Magn. Res.", + 164, 2003, "19-27" }, + { "Rappe1991a", + "A. K. Rappe and W. A. Goddard III", + "Charge Equillibration for Molecular Dynamics Simulations", + "J. Phys. Chem.", + 95, 1991, "3358-3363" }, + { "Mu2005a", + "Y. Mu, P. H. Nguyen and G. Stock", + "Energy landscape of a small peptide revelaed by dihedral angle principal component analysis", + "Prot. Struct. Funct. Bioinf.", + 58, 2005, "45-52" }, + { "Okabe2001a", + "T. Okabe and M. Kawata and Y. Okamoto and M. Mikami", + "Replica-exchange {M}onte {C}arlo method for the isobaric-isothermal ensemble", + "Chem. Phys. Lett.", + 335, 2001, "435-439" }, + { "Hukushima96a", + "K. Hukushima and K. Nemoto", + "Exchange Monte Carlo Method and Application to Spin Glass Simulations", + "J. Phys. Soc. Jpn.", + 65, 1996, "1604-1608" }, + { "Tropp80a", + "J. Tropp", + "Dipolar Relaxation and Nuclear Overhauser effects in nonrigid molecules: The effect of fluctuating internuclear distances", + "J. Chem. Phys.", + 72, 1980, "6035-6043" }, + { "Bultinck2002a", + "P. Bultinck and W. Langenaeker and P. Lahorte and F. De Proft and P. Geerlings and M. Waroquier and J. P. Tollenaere", + "The electronegativity equalization method I: Parametrization and validation for atomic charge calculations", + "J. Phys. Chem. A", + 106, 2002, "7887-7894" }, + { "Yang2006b", + "Q. Y. Yang and K. A. Sharp", + "Atomic charge parameters for the finite difference Poisson-Boltzmann method using electronegativity neutralization", + "J. Chem. Theory Comput.", + 2, 2006, "1152-1167" }, + { "Spoel2005a", + "D. van der Spoel, E. Lindahl, B. Hess, G. Groenhof, A. E. Mark and H. J. C. Berendsen", + "GROMACS: Fast, Flexible and Free", + "J. Comp. Chem.", + 26, 2005, "1701-1719" }, + { "Spoel2006b", + "D. van der Spoel, P. J. van Maaren, P. Larsson and N. Timneanu", + "Thermodynamics of hydrogen bonding in hydrophilic and hydrophobic media", + "J. Phys. Chem. B", + 110, 2006, "4393-4398" }, + { "Spoel2006d", + "D. van der Spoel and M. M. Seibert", + "Protein folding kinetics and thermodynamics from atomistic simulations", + "Phys. Rev. Letters", + 96, 2006, "238102" }, + { "Palmer94a", + "B. J. Palmer", + "Transverse-current autocorrelation-function calculations of the shear viscosity for molecular liquids", + "Phys. Rev. E", + 49, 1994, "359-366" }, + { "Bussi2007a", + "G. Bussi, D. Donadio and M. Parrinello", + "Canonical sampling through velocity rescaling", + "J. Chem. Phys.", + 126, 2007, "014101" }, + { "Hub2006", + "J. S. Hub and B. L. de Groot", + "Does CO2 permeate through Aquaporin-1?", + "Biophys. 
J.", + 91, 2006, "842-848" }, + { "Hub2008", + "J. S. Hub and B. L. de Groot", + "Mechanism of selectivity in aquaporins and aquaglyceroporins", + "PNAS", + 105, 2008, "1198-1203" }, + { "Friedrich2009", + "M. S. Friedrichs, P. Eastman, V. Vaidyanathan, M. Houston, S. LeGrand, A. L. Beberg, D. L. Ensign, C. M. Bruns, and V. S. Pande", + "Accelerating Molecular Dynamic Simulation on Graphics Processing Units", + "J. Comp. Chem.", + 30, 2009, "864-872" }, + { "Engin2010", + "O. Engin, A. Villa, M. Sayar and B. Hess", + "Driving Forces for Adsorption of Amphiphilic Peptides to Air-Water Interface", + "J. Phys. Chem. B", + 114, 2010, "11093" }, + { "Fritsch12", + "S. Fritsch, C. Junghans and K. Kremer", + "Adaptive molecular simulation study on structure formation of toluene around C60 using Gromacs", + "J. Chem. Theo. Comp.", + 8, 2012, "398" }, + { "Junghans10", + "C. Junghans and S. Poblete", + "A reference implementation of the adaptive resolution scheme in ESPResSo", + "Comp. Phys. Comm.", + 181, 2010, "1449" }, + { "Wang2010", + "H. Wang, F. Dommert, C.Holm", + "Optimizing working parameters of the smooth particle mesh Ewald algorithm in terms of accuracy and efficiency", + "J. Chem. Phys. B", + 133, 2010, "034117" }, + { "Sugita1999a", + "Y. Sugita, Y. Okamoto", + "Replica-exchange molecular dynamics method for protein folding", + "Chem. Phys. Lett.", + 314, 1999, "141-151" }, + { "Kutzner2011", + "C. Kutzner and J. Czub and H. Grubmuller", + "Keep it Flexible: Driving Macromolecular Rotary Motions in Atomistic Simulations with GROMACS", + "J. Chem. Theory Comput.", + 7, 2011, "1381-1393" }, + { "Hoefling2011", + "M. Hoefling, N. Lima, D. Haenni, C.A.M. Seidel, B. Schuler, H. Grubmuller", + "Structural Heterogeneity and Quantitative FRET Efficiency Distributions of Polyprolines through a Hybrid Atomistic Simulation and Monte Carlo Approach", + "PLoS ONE", + 6, 2011, "e19791" }, + { "Hockney1988", + "R. W. Hockney and J. W. Eastwood", + "Computer simulation using particles", + "IOP, Bristol", + 1, 1988, "1" }, + { "Ballenegger2012", + "V. Ballenegger, J.J. Cerda, and C. Holm", + "How to Convert SPME to P3M: Influence Functions and Error Estimates", + "J. Chem. Theory Comput.", + 8, 2012, "936-947" }, + { "Garmay2012", + "Garmay Yu, Shvetsov A, Karelov D, Lebedev D, Radulescu A, Petukhov M, Isaev-Ivanov V", + "Correlated motion of protein subdomains and large-scale conformational flexibility of RecA protein filament", + "Journal of Physics: Conference Series", + 340, 2012, "012094" } + }; +#define NSTR (int)asize(citedb) + + int j, index; + char *author; + char *title; +#define LINE_WIDTH 79 + + if (fp == NULL) + { + return; + } + + for (index = 0; (index < NSTR) && (strcmp(citedb[index].key, key) != 0); index++) + { + ; + } + + fprintf(fp, "\n++++ PLEASE READ AND CITE THE FOLLOWING REFERENCE ++++\n"); + if (index < NSTR) + { + /* Insert newlines */ + author = wrap_lines(citedb[index].author, LINE_WIDTH, 0, FALSE); + title = wrap_lines(citedb[index].title, LINE_WIDTH, 0, FALSE); + fprintf(fp, "%s\n%s\n%s %d (%d) pp. %s\n", + author, title, citedb[index].journal, + citedb[index].volume, citedb[index].year, + citedb[index].pages); + sfree(author); + sfree(title); + } + else + { + fprintf(fp, "Entry %s not found in citation database\n", key); + } + fprintf(fp, "-------- -------- --- Thank You --- -------- --------\n\n"); + fflush(fp); +} + +#ifdef GMX_GIT_VERSION_INFO +/* Version information generated at compile time. 
*/ +#include "gromacs/utility/gitversion.h" +#else +/* Fall back to statically defined version. */ +static const char _gmx_ver_string[] = "VERSION " VERSION; +#endif + +const char *GromacsVersion() +{ + return _gmx_ver_string; +} + +void gmx_print_version_info_gpu(FILE *fp); + +void gmx_print_version_info(FILE *fp) +{ + fprintf(fp, "Gromacs version: %s\n", _gmx_ver_string); +#ifdef GMX_GIT_VERSION_INFO + fprintf(fp, "GIT SHA1 hash: %s\n", _gmx_full_git_hash); + /* Only print out the branch information if present. + * The generating script checks whether the branch point actually + * coincides with the hash reported above, and produces an empty string + * in such cases. */ + if (_gmx_central_base_hash[0] != 0) + { + fprintf(fp, "Branched from: %s\n", _gmx_central_base_hash); + } +#endif + +#ifdef GMX_DOUBLE + fprintf(fp, "Precision: double\n"); +#else + fprintf(fp, "Precision: single\n"); +#endif + fprintf(fp, "Memory model: %lu bit\n", 8*sizeof(void *)); + +#ifdef GMX_THREAD_MPI + fprintf(fp, "MPI library: thread_mpi\n"); +#elif defined(GMX_MPI) + fprintf(fp, "MPI library: MPI\n"); +#else + fprintf(fp, "MPI library: none\n"); +#endif +#ifdef GMX_OPENMP + fprintf(fp, "OpenMP support: enabled\n"); +#else + fprintf(fp, "OpenMP support: disabled\n"); +#endif +#ifdef GMX_GPU + fprintf(fp, "GPU support: enabled\n"); +#else + fprintf(fp, "GPU support: disabled\n"); +#endif + /* A preprocessor trick to avoid duplicating logic from vec.h */ +#define gmx_stringify2(x) #x +#define gmx_stringify(x) gmx_stringify2(x) + fprintf(fp, "invsqrt routine: %s\n", gmx_stringify(gmx_invsqrt(x))); + fprintf(fp, "CPU acceleration: %s\n", GMX_CPU_ACCELERATION_STRING); + + fprintf(fp, "FFT library: %s\n", gmx_fft_get_version_info()); +#ifdef GMX_LARGEFILES + fprintf(fp, "Large file support: enabled\n"); +#else + fprintf(fp, "Large file support: disabled\n"); +#endif +#ifdef HAVE_RDTSCP + fprintf(fp, "RDTSCP usage: enabled\n"); +#else + fprintf(fp, "RDTSCP usage: disabled\n"); +#endif + + fprintf(fp, "Built on: %s\n", BUILD_TIME); + fprintf(fp, "Built by: %s\n", BUILD_USER); + fprintf(fp, "Build OS/arch: %s\n", BUILD_HOST); + fprintf(fp, "Build CPU vendor: %s\n", BUILD_CPU_VENDOR); + fprintf(fp, "Build CPU brand: %s\n", BUILD_CPU_BRAND); + fprintf(fp, "Build CPU family: %d Model: %d Stepping: %d\n", + BUILD_CPU_FAMILY, BUILD_CPU_MODEL, BUILD_CPU_STEPPING); + /* TODO: The below strings can be quite long, so it would be nice to wrap + * them. Can wait for later, as the master branch has ready code to do all + * that. */ + fprintf(fp, "Build CPU features: %s\n", BUILD_CPU_FEATURES); + fprintf(fp, "C compiler: %s\n", BUILD_C_COMPILER); + fprintf(fp, "C compiler flags: %s\n", BUILD_CFLAGS); + fprintf(fp, "C++ compiler: %s\n", BUILD_CXX_COMPILER); + fprintf(fp, "C++ compiler flags: %s\n", BUILD_CXXFLAGS); +#ifdef HAVE_LIBMKL + /* MKL might be used for LAPACK/BLAS even if FFTs use FFTW, so keep it separate */ + fprintf(fp, "Linked with Intel MKL version %d.%d.%d.\n", + __INTEL_MKL__, __INTEL_MKL_MINOR__, __INTEL_MKL_UPDATE__); +#endif +#ifdef GMX_GPU + gmx_print_version_info_gpu(fp); +#endif + +} diff --cc src/gromacs/gmxlib/gmx_cpuid.c index 93a2e27181,0000000000..c12ce798a4 mode 100644,000000..100644 --- a/src/gromacs/gmxlib/gmx_cpuid.c +++ b/src/gromacs/gmxlib/gmx_cpuid.c @@@ -1,1172 -1,0 +1,1186 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of GROMACS. 
+ * Copyright (c) 2012-
+ *
+ * Written by the Gromacs development team under coordination of
+ * David van der Spoel, Berk Hess, and Erik Lindahl.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org
+ *
+ * And Hey:
+ * Gnomes, ROck Monsters And Chili Sauce
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#ifdef HAVE_SCHED_H
+#define _GNU_SOURCE
+#include <sched.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#ifdef _MSC_VER
+/* MSVC definition for __cpuid() */
+#include <intrin.h>
+/* sysinfo functions */
+#include <windows.h>
+#endif
+#ifdef HAVE_UNISTD_H
+/* sysconf() definition */
+#include <unistd.h>
+#endif
+
+#include "gmx_cpuid.h"
+
+
+
+/* For convenience, and to enable configure-time invocation, we keep all architectures
+ * in a single file, but to avoid repeated ifdefs we set the overall architecture here.
+ */
+#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+/* OK, it is x86, but can we execute cpuid? */
+#if defined(GMX_X86_GCC_INLINE_ASM) || ( defined(_MSC_VER) && ( (_MSC_VER > 1500) || (_MSC_VER==1500 & _MSC_FULL_VER >= 150030729)))
+# define GMX_CPUID_X86
+#endif
+#endif
+
+/* Global constant character strings corresponding to our enumerated types */
+const char *
+gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS] =
+{
+    "CannotDetect",
+    "Unknown",
+    "GenuineIntel",
+    "AuthenticAMD",
+    "Fujitsu",
+    "IBM"
+};
+
+const char *
+gmx_cpuid_feature_string[GMX_CPUID_NFEATURES] =
+{
+    "CannotDetect",
+    "aes",
+    "apic",
+    "avx",
+    "avx2",
+    "clfsh",
+    "cmov",
+    "cx8",
+    "cx16",
+    "f16c",
+    "fma",
+    "fma4",
+    "htt",
+    "lahf_lm",
+    "misalignsse",
+    "mmx",
+    "msr",
+    "nonstop_tsc",
+    "pcid",
+    "pclmuldq",
+    "pdcm",
+    "pdpe1gb",
+    "popcnt",
+    "pse",
+    "rdrnd",
+    "rdtscp",
+    "sse2",
+    "sse3",
+    "sse4a",
+    "sse4.1",
+    "sse4.2",
+    "ssse3",
+    "tdt",
+    "x2apic",
+    "xop"
+};
+
+const char *
+gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS] =
+{
+    "CannotDetect",
+    "None",
+    "SSE2",
+    "SSE4.1",
+    "AVX_128_FMA",
+    "AVX_256",
+    "Sparc64 HPC-ACE"
+};
+
+/* Max length of brand string */
+#define GMX_CPUID_BRAND_MAXLEN 256
+
+
+/* Contents of the abstract datatype */
+struct gmx_cpuid
+{
+    enum gmx_cpuid_vendor      vendor;
+    char                       brand[GMX_CPUID_BRAND_MAXLEN];
+    int                        family;
+    int                        model;
+    int                        stepping;
+    /* Not using gmx_bool here, since this file must be possible to compile without simple.h */
+    char                       feature[GMX_CPUID_NFEATURES];
+
+    /* Basic CPU topology information. For x86 this is a bit complicated since the topology differs between
+     * operating systems and sometimes even settings. For most other architectures you can likely just check
+     * the documentation and then write static information to these arrays rather than detecting on-the-fly.
+     */
+    int                        have_cpu_topology;
+    int                        nproc;               /* total number of logical processors from OS */
+    int                        npackages;
+    int                        ncores_per_package;
+    int                        nhwthreads_per_core;
+    int *                      package_id;
+    int *                      core_id;             /* Local core id in each package */
+    int *                      hwthread_id;         /* Local hwthread id in each core */
+    int *                      locality_order;      /* Processor indices sorted in locality order */
+};
+
+
+/* Simple routines to access the data structure.
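+ * Illustrative use of the accessors together with init/done (all of these
+ * are defined in this file; error handling omitted for brevity):
+ *
+ *     gmx_cpuid_t cpuid;
+ *     gmx_cpuid_init(&cpuid);
+ *     printf("%s, family %d model %d stepping %d\n",
+ *            gmx_cpuid_brand(cpuid), gmx_cpuid_family(cpuid),
+ *            gmx_cpuid_model(cpuid), gmx_cpuid_stepping(cpuid));
+ *     gmx_cpuid_done(cpuid);
+ *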
The initialization routine is + * further down since that needs to call other static routines in this file. + */ +enum gmx_cpuid_vendor +gmx_cpuid_vendor (gmx_cpuid_t cpuid) +{ + return cpuid->vendor; +} + + +const char * +gmx_cpuid_brand (gmx_cpuid_t cpuid) +{ + return cpuid->brand; +} + +int +gmx_cpuid_family (gmx_cpuid_t cpuid) +{ + return cpuid->family; +} + +int +gmx_cpuid_model (gmx_cpuid_t cpuid) +{ + return cpuid->model; +} + +int +gmx_cpuid_stepping (gmx_cpuid_t cpuid) +{ + return cpuid->stepping; +} + +int +gmx_cpuid_feature (gmx_cpuid_t cpuid, + enum gmx_cpuid_feature feature) +{ + return (cpuid->feature[feature] != 0); +} + + + + +/* What type of acceleration was compiled in, if any? + * This is set from Cmake. Note that the SSE2 and SSE4_1 macros are set for + * AVX too, so it is important that they appear last in the list. + */ +#ifdef GMX_X86_AVX_256 +static const +enum gmx_cpuid_acceleration + compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_256; +#elif defined GMX_X86_AVX_128_FMA +static const +enum gmx_cpuid_acceleration + compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA; +#elif defined GMX_X86_SSE4_1 +static const +enum gmx_cpuid_acceleration + compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE4_1; +#elif defined GMX_X86_SSE2 +static const +enum gmx_cpuid_acceleration + compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE2; +#elif defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE +static const +enum gmx_cpuid_acceleration + compiled_acc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE; +#else +static const +enum gmx_cpuid_acceleration + compiled_acc = GMX_CPUID_ACCELERATION_NONE; +#endif + + +#ifdef GMX_CPUID_X86 + +/* Execute CPUID on x86 class CPUs. level sets function to exec, and the + * contents of register output is returned. See Intel/AMD docs for details. + * + * This version supports extended information where we can also have an input + * value in the ecx register. This is ignored for most levels, but some of them + * (e.g. level 0xB on Intel) use it. + */ +static int +execute_x86cpuid(unsigned int level, + unsigned int ecxval, + unsigned int * eax, + unsigned int * ebx, + unsigned int * ecx, + unsigned int * edx) +{ + int rc = 0; + + /* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2) + * if the compiler handles GNU-style inline assembly. + */ + +#if (defined _MSC_VER) + int CPUInfo[4]; + +#if (_MSC_VER > 1500) || (_MSC_VER == 1500 & _MSC_FULL_VER >= 150030729) + /* MSVC 9.0 SP1 or later */ + __cpuidex(CPUInfo, level, ecxval); + rc = 0; +#else + __cpuid(CPUInfo, level); + /* Set an error code if the user wanted a non-zero ecxval, since we did not have cpuidex */ + rc = (ecxval > 0) ? -1 : 0; +#endif + *eax = CPUInfo[0]; + *ebx = CPUInfo[1]; + *ecx = CPUInfo[2]; + *edx = CPUInfo[3]; + +#elif (defined GMX_X86_GCC_INLINE_ASM) + /* for now this means GMX_X86_GCC_INLINE_ASM should be defined, + * but there might be more options added in the future. + */ + *eax = level; + *ecx = ecxval; + *ebx = 0; + *edx = 0; +#if defined(__i386__) && defined(__PIC__) + /* Avoid clobbering the global offset table in 32-bit pic code (ebx register) */ + __asm__ __volatile__ ("xchgl %%ebx, %1 \n\t" + "cpuid \n\t" + "xchgl %%ebx, %1 \n\t" + : "+a" (*eax), "+r" (*ebx), "+c" (*ecx), "+d" (*edx)); +#else + /* i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want :-) */ + __asm__ __volatile__ ("cpuid \n\t" + : "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx)); +#endif + rc = 0; +#else + /* Death and horror! 
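+     * (For reference, the typical call pattern used throughout this file is
+     *      unsigned int eax, ebx, ecx, edx;
+     *      execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx);
+     *  after which ebx:edx:ecx spell out the 12-character vendor string,
+     *  e.g. "GenuineIntel" - see cpuid_check_vendor() below.)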
+ * Apparently this is an x86 platform where we don't know how to call cpuid. + * + * This is REALLY bad, since we will lose all Gromacs acceleration. + */ + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + + rc = -1; +#endif + return rc; +} + + +/* Identify CPU features common to Intel & AMD - mainly brand string, + * version and some features. Vendor has already been detected outside this. + */ +static int +cpuid_check_common_x86(gmx_cpuid_t cpuid) +{ + int fn, max_stdfn, max_extfn; + unsigned int eax, ebx, ecx, edx; + char str[GMX_CPUID_BRAND_MAXLEN]; + char * p; + + /* Find largest standard/extended function input value */ + execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx); + max_stdfn = eax; + execute_x86cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx); + max_extfn = eax; + + p = str; + if (max_extfn >= 0x80000005) + { + /* Get CPU brand string */ + for (fn = 0x80000002; fn < 0x80000005; fn++) + { + execute_x86cpuid(fn, 0, &eax, &ebx, &ecx, &edx); + memcpy(p, &eax, 4); + memcpy(p+4, &ebx, 4); + memcpy(p+8, &ecx, 4); + memcpy(p+12, &edx, 4); + p += 16; + } + *p = '\0'; + + /* Remove empty initial space */ + p = str; + while (isspace(*(p))) + { + p++; + } + strncpy(cpuid->brand, p, GMX_CPUID_BRAND_MAXLEN); + } + else + { + strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_BRAND_MAXLEN); + } + + /* Find basic CPU properties */ + if (max_stdfn >= 1) + { + execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx); + + cpuid->family = ((eax & 0x0FF00000) >> 20) + ((eax & 0x00000F00) >> 8); + /* Note that extended model should be shifted left 4, so only shift right 12 iso 16. */ + cpuid->model = ((eax & 0x000F0000) >> 12) + ((eax & 0x000000F0) >> 4); + cpuid->stepping = (eax & 0x0000000F); + + /* Feature flags common to AMD and intel */ + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE3] = (ecx & (1 << 0)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_PCLMULDQ] = (ecx & (1 << 1)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_SSSE3] = (ecx & (1 << 9)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_FMA] = (ecx & (1 << 12)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_CX16] = (ecx & (1 << 13)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4_1] = (ecx & (1 << 19)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4_2] = (ecx & (1 << 20)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_POPCNT] = (ecx & (1 << 23)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_AES] = (ecx & (1 << 25)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_AVX] = (ecx & (1 << 28)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_F16C] = (ecx & (1 << 29)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_RDRND] = (ecx & (1 << 30)) != 0; + + cpuid->feature[GMX_CPUID_FEATURE_X86_PSE] = (edx & (1 << 3)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_MSR] = (edx & (1 << 5)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_CX8] = (edx & (1 << 8)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_APIC] = (edx & (1 << 9)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_CMOV] = (edx & (1 << 15)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_CLFSH] = (edx & (1 << 19)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_MMX] = (edx & (1 << 23)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_SSE2] = (edx & (1 << 26)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] = (edx & (1 << 28)) != 0; + } + else + { + cpuid->family = -1; + cpuid->model = -1; + cpuid->stepping = -1; + } + + if (max_extfn >= 0x80000001) + { + execute_x86cpuid(0x80000001, 0, &eax, &ebx, &ecx, &edx); + cpuid->feature[GMX_CPUID_FEATURE_X86_LAHF_LM] = (ecx & (1 << 0)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_PDPE1GB] = (edx 
& (1 << 26)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_RDTSCP] = (edx & (1 << 27)) != 0; + } + + if (max_extfn >= 0x80000007) + { + execute_x86cpuid(0x80000007, 0, &eax, &ebx, &ecx, &edx); + cpuid->feature[GMX_CPUID_FEATURE_X86_NONSTOP_TSC] = (edx & (1 << 8)) != 0; + } + return 0; +} + +/* This routine returns the number of unique different elements found in the array, + * and renumbers these starting from 0. For example, the array {0,1,2,8,9,10,8,9,10,0,1,2} + * will be rewritten to {0,1,2,3,4,5,3,4,5,0,1,2}, and it returns 6 for the + * number of unique elements. + */ +static int +cpuid_renumber_elements(int *data, int n) +{ + int *unique; + int i, j, nunique, found; + + unique = malloc(sizeof(int)*n); + + nunique = 0; + for (i = 0; i < n; i++) + { + for (j = 0, found = 0; j < nunique && !found; j++) + { + found = (data[i] == unique[j]); + } + if (!found) + { + /* Insert in sorted order! */ + for (j = nunique++; j > 0 && unique[j-1] > data[i]; j--) + { + unique[j] = unique[j-1]; + } + unique[j] = data[i]; + } + } + /* renumber */ + for (i = 0; i < n; i++) + { + for (j = 0; j < nunique; j++) + { + if (data[i] == unique[j]) + { + data[i] = j; + } + } + } + return nunique; +} + +/* APIC IDs, or everything you wanted to know about your x86 cores but were afraid to ask... + * + * Raw APIC IDs are unfortunately somewhat dirty. For technical reasons they are assigned + * in power-of-2 chunks, and even then there are no guarantees about specific numbers - all + * we know is that the part for each thread/core/package is unique, and how many bits are + * reserved for that part. + * This routine does internal renumbering so we get continuous indices, and also + * decodes the actual number of packages,cores-per-package and hwthreads-per-core. ++ * Returns: 0 on success, non-zero on failure. + */ - static void ++static int +cpuid_x86_decode_apic_id(gmx_cpuid_t cpuid, int *apic_id, int core_bits, int hwthread_bits) +{ + int i, idx; + int hwthread_mask, core_mask_after_shift; + + cpuid->hwthread_id = malloc(sizeof(int)*cpuid->nproc); + cpuid->core_id = malloc(sizeof(int)*cpuid->nproc); + cpuid->package_id = malloc(sizeof(int)*cpuid->nproc); + cpuid->locality_order = malloc(sizeof(int)*cpuid->nproc); + + hwthread_mask = (1 << hwthread_bits) - 1; + core_mask_after_shift = (1 << core_bits) - 1; + + for (i = 0; i < cpuid->nproc; i++) + { + cpuid->hwthread_id[i] = apic_id[i] & hwthread_mask; + cpuid->core_id[i] = (apic_id[i] >> hwthread_bits) & core_mask_after_shift; + cpuid->package_id[i] = apic_id[i] >> (core_bits + hwthread_bits); + } + + cpuid->npackages = cpuid_renumber_elements(cpuid->package_id, cpuid->nproc); + cpuid->ncores_per_package = cpuid_renumber_elements(cpuid->core_id, cpuid->nproc); + cpuid->nhwthreads_per_core = cpuid_renumber_elements(cpuid->hwthread_id, cpuid->nproc); + ++ /* now check for consistency */ ++ if ( (cpuid->npackages * cpuid->ncores_per_package * ++ cpuid->nhwthreads_per_core) != cpuid->nproc ) ++ { ++ /* the packages/cores-per-package/hwthreads-per-core counts are ++ inconsistent. */ ++ return -1; ++ } ++ + /* Create a locality order array, i.e. first all resources in package0, which in turn + * are sorted so we first have all resources in core0, where threads are sorted in order, etc. 
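+ * Worked example (hypothetical box with 2 packages, 2 cores/package and
+ * 2 hwthreads/core, i.e. core_bits = 1 and hwthread_bits = 1):
+ *     apic_id 5 = 0b101  ->  hwthread 1, core 0, package 1,
+ * and its locality index becomes
+ *     idx = (package*ncores_per_package + core)*nhwthreads_per_core + hwthread
+ *         = (1*2 + 0)*2 + 1 = 5.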
+ */
++
+    for (i = 0; i < cpuid->nproc; i++)
+    {
+        idx = (cpuid->package_id[i]*cpuid->ncores_per_package + cpuid->core_id[i])*cpuid->nhwthreads_per_core + cpuid->hwthread_id[i];
+        cpuid->locality_order[idx] = i;
+    }
++    return 0;
+}
+
+
+/* Detection of AMD-specific CPU features */
+static int
+cpuid_check_amd_x86(gmx_cpuid_t cpuid)
+{
-    int max_stdfn, max_extfn;
++    int max_stdfn, max_extfn, ret;
+    unsigned int eax, ebx, ecx, edx;
+    int hwthread_bits, core_bits;
+    int * apic_id;
+
+    cpuid_check_common_x86(cpuid);
+
+    execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx);
+    max_stdfn = eax;
+
+    execute_x86cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx);
+    max_extfn = eax;
+
+    if (max_extfn >= 0x80000001)
+    {
+        execute_x86cpuid(0x80000001, 0, &eax, &ebx, &ecx, &edx);
+
+        cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4A] = (ecx & (1 << 6)) != 0;
+        cpuid->feature[GMX_CPUID_FEATURE_X86_MISALIGNSSE] = (ecx & (1 << 7)) != 0;
+        cpuid->feature[GMX_CPUID_FEATURE_X86_XOP] = (ecx & (1 << 11)) != 0;
+        cpuid->feature[GMX_CPUID_FEATURE_X86_FMA4] = (ecx & (1 << 16)) != 0;
+    }
+
+    /* Query APIC information on AMD */
+    if (max_extfn >= 0x80000008)
+    {
+#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
+        /* Linux */
+        unsigned int i;
+        cpu_set_t cpuset, save_cpuset;
+        cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN);
+        apic_id = malloc(sizeof(int)*cpuid->nproc);
+        sched_getaffinity(0, sizeof(cpu_set_t), &save_cpuset);
+        /* Get APIC id from each core */
+        CPU_ZERO(&cpuset);
+        for (i = 0; i < cpuid->nproc; i++)
+        {
+            CPU_SET(i, &cpuset);
+            sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
+            execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
+            apic_id[i] = ebx >> 24;
+            CPU_CLR(i, &cpuset);
+        }
+        /* Reset affinity to the value it had when calling this routine */
+        sched_setaffinity(0, sizeof(cpu_set_t), &save_cpuset);
+#define CPUID_HAVE_APIC
+#elif defined GMX_NATIVE_WINDOWS
+        /* Windows */
+        DWORD_PTR i;
+        SYSTEM_INFO sysinfo;
+        unsigned int save_affinity, affinity;
+        GetSystemInfo( &sysinfo );
+        cpuid->nproc = sysinfo.dwNumberOfProcessors;
+        apic_id = malloc(sizeof(int)*cpuid->nproc);
+        /* Get previous affinity mask */
+        save_affinity = SetThreadAffinityMask(GetCurrentThread(), 1);
+        for (i = 0; i < cpuid->nproc; i++)
+        {
+            SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1)<<i));
+            execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
+            apic_id[i] = ebx >> 24;
+        }
+        SetThreadAffinityMask(GetCurrentThread(), save_affinity);
+#define CPUID_HAVE_APIC
+#endif
+#ifdef CPUID_HAVE_APIC
+        /* AMD does not support SMT yet - there are no hwthread bits in apic ID */
+        hwthread_bits = 0;
+        /* Get number of core bits in apic ID - try modern extended method first */
+        execute_x86cpuid(0x80000008, 0, &eax, &ebx, &ecx, &edx);
+        core_bits = (ecx >> 12) & 0xf;
+        if (core_bits == 0)
+        {
+            /* Legacy method for old single/dual core AMD CPUs */
+            int i = ecx & 0xF;
+            for (core_bits = 0; (i>>core_bits) > 0; core_bits++)
+            {
+                ;
+            }
+        }
-        cpuid_x86_decode_apic_id(cpuid, apic_id, core_bits, hwthread_bits);
-        cpuid->have_cpu_topology = 1;
++        ret = cpuid_x86_decode_apic_id(cpuid, apic_id, core_bits,
++                                       hwthread_bits);
++        cpuid->have_cpu_topology = (ret == 0);
+#endif
+    }
+    return 0;
+}
+
+/* Detection of Intel-specific CPU features */
+static int
+cpuid_check_intel_x86(gmx_cpuid_t cpuid)
+{
-    unsigned int max_stdfn, max_extfn;
++    unsigned int max_stdfn, max_extfn, ret;
+    unsigned int eax, ebx, ecx, edx;
+    unsigned int max_logical_cores, max_physical_cores;
+    int hwthread_bits, core_bits;
+    int * apic_id;
+
+    cpuid_check_common_x86(cpuid);
+
+    execute_x86cpuid(0x0, 0, &eax,
&ebx, &ecx, &edx); + max_stdfn = eax; + + execute_x86cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx); + max_extfn = eax; + + if (max_stdfn >= 1) + { + execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx); + cpuid->feature[GMX_CPUID_FEATURE_X86_PDCM] = (ecx & (1 << 15)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_PCID] = (ecx & (1 << 17)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_X2APIC] = (ecx & (1 << 21)) != 0; + cpuid->feature[GMX_CPUID_FEATURE_X86_TDT] = (ecx & (1 << 24)) != 0; + } + + if (max_stdfn >= 7) + { + execute_x86cpuid(0x7, 0, &eax, &ebx, &ecx, &edx); + cpuid->feature[GMX_CPUID_FEATURE_X86_AVX2] = (ebx & (1 << 5)) != 0; + } + + /* Check whether Hyper-Threading is enabled, not only supported */ + if (cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] && max_stdfn >= 4) + { + execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx); + max_logical_cores = (ebx >> 16) & 0x0FF; + execute_x86cpuid(0x4, 0, &eax, &ebx, &ecx, &edx); + max_physical_cores = ((eax >> 26) & 0x3F) + 1; + + /* Clear HTT flag if we only have 1 logical core per physical */ + if (max_logical_cores/max_physical_cores < 2) + { + cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] = 0; + } + } + + if (max_stdfn >= 0xB) + { + /* Query x2 APIC information from cores */ +#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__) + /* Linux */ + unsigned int i; + cpu_set_t cpuset, save_cpuset; + cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN); + apic_id = malloc(sizeof(int)*cpuid->nproc); + sched_getaffinity(0, sizeof(cpu_set_t), &save_cpuset); + /* Get x2APIC ID from each hardware thread */ + CPU_ZERO(&cpuset); + for (i = 0; i < cpuid->nproc; i++) + { + CPU_SET(i, &cpuset); + sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); + execute_x86cpuid(0xB, 0, &eax, &ebx, &ecx, &edx); + apic_id[i] = edx; + CPU_CLR(i, &cpuset); + } + /* Reset affinity to the value it had when calling this routine */ + sched_setaffinity(0, sizeof(cpu_set_t), &save_cpuset); +#define CPUID_HAVE_APIC +#elif defined GMX_NATIVE_WINDOWS + /* Windows */ + DWORD_PTR i; + SYSTEM_INFO sysinfo; + unsigned int save_affinity, affinity; + GetSystemInfo( &sysinfo ); + cpuid->nproc = sysinfo.dwNumberOfProcessors; + apic_id = malloc(sizeof(int)*cpuid->nproc); + /* Get previous affinity mask */ + save_affinity = SetThreadAffinityMask(GetCurrentThread(), 1); + for (i = 0; i < cpuid->nproc; i++) + { + SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1)<have_cpu_topology = 1; ++ ret = cpuid_x86_decode_apic_id(cpuid, apic_id, core_bits, ++ hwthread_bits); ++ cpuid->have_cpu_topology = (ret == 0); +#endif + } + return 0; +} +#endif /* GMX_CPUID_X86 */ + + + + +static void +chomp_substring_before_colon(const char *in, char *s, int maxlength) +{ + char *p; + strncpy(s,in,maxlength); + p = strchr(s,':'); + if(p!=NULL) + { + *p='\0'; + while(isspace(*(--p)) && (p>=s)) + { + *p='\0'; + } + } + else + { + *s='\0'; + } +} + +static void +chomp_substring_after_colon(const char *in, char *s, int maxlength) +{ + char *p; + if( (p = strchr(in,':'))!=NULL) + { + p++; + while(isspace(*p)) p++; + strncpy(s,p,maxlength); + p = s+strlen(s); + while(isspace(*(--p)) && (p>=s)) + { + *p='\0'; + } + } + else + { + *s='\0'; + } +} + +/* Try to find the vendor of the current CPU, so we know what specific + * detection routine to call. 
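+ * On x86 this executes CPUID level 0; on other Linux platforms it falls
+ * back to scanning /proc/cpuinfo for a line such as (illustrative):
+ *     vendor_id       : GenuineIntel
+ * which the chomp_substring_*_colon() helpers above split into a key and
+ * a value that is matched against gmx_cpuid_vendor_string[].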
+ */ +static enum gmx_cpuid_vendor +cpuid_check_vendor(void) +{ + enum gmx_cpuid_vendor i, vendor; + /* Register data used on x86 */ + unsigned int eax, ebx, ecx, edx; + char vendorstring[13]; + FILE * fp; + char buffer[255],buffer2[255]; + + /* Set default first */ + vendor = GMX_CPUID_VENDOR_UNKNOWN; + +#ifdef GMX_CPUID_X86 + execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx); + + memcpy(vendorstring, &ebx, 4); + memcpy(vendorstring+4, &edx, 4); + memcpy(vendorstring+8, &ecx, 4); + + vendorstring[12] = '\0'; + + for (i = GMX_CPUID_VENDOR_UNKNOWN; i < GMX_CPUID_NVENDORS; i++) + { + if (!strncmp(vendorstring, gmx_cpuid_vendor_string[i], 12)) + { + vendor = i; + } + } +#elif defined(__linux__) || defined(__linux) + /* General Linux. Try to get CPU vendor from /proc/cpuinfo */ + if( (fp = fopen("/proc/cpuinfo","r")) != NULL) + { + while( (vendor == GMX_CPUID_VENDOR_UNKNOWN) && (fgets(buffer,sizeof(buffer),fp) != NULL)) + { + chomp_substring_before_colon(buffer,buffer2,sizeof(buffer2)); + /* Intel/AMD use "vendor_id", IBM "vendor". Fujitsu "manufacture". Add others if you have them! */ + if( !strcmp(buffer2,"vendor_id") || !strcmp(buffer2,"vendor") || !strcmp(buffer2,"manufacture") ) + { + chomp_substring_after_colon(buffer,buffer2,sizeof(buffer2)); + for(i=GMX_CPUID_VENDOR_UNKNOWN; ihave_cpu_topology) + { + *nprocessors = cpuid->nproc; + *npackages = cpuid->npackages; + *ncores_per_package = cpuid->ncores_per_package; + *nhwthreads_per_core = cpuid->nhwthreads_per_core; + *package_id = cpuid->package_id; + *core_id = cpuid->core_id; + *hwthread_id = cpuid->hwthread_id; + *locality_order = cpuid->locality_order; + rc = 0; + } + else + { + rc = -1; + } + return rc; +} + + +enum gmx_cpuid_x86_smt +gmx_cpuid_x86_smt(gmx_cpuid_t cpuid) +{ + enum gmx_cpuid_x86_smt rc; + + if (cpuid->have_cpu_topology) + { + rc = (cpuid->nhwthreads_per_core > 1) ? GMX_CPUID_X86_SMT_ENABLED : GMX_CPUID_X86_SMT_DISABLED; + } + else if (cpuid->vendor == GMX_CPUID_VENDOR_AMD || gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_HTT) == 0) + { + rc = GMX_CPUID_X86_SMT_DISABLED; + } + else + { + rc = GMX_CPUID_X86_SMT_CANNOTDETECT; + } + return rc; +} + + +int +gmx_cpuid_init (gmx_cpuid_t * pcpuid) +{ + gmx_cpuid_t cpuid; + int i; + FILE * fp; + char buffer[255],buffer2[255]; + int found_brand; + + cpuid = malloc(sizeof(*cpuid)); + + *pcpuid = cpuid; + + for (i = 0; i < GMX_CPUID_NFEATURES; i++) + { + cpuid->feature[i] = 0; + } + + cpuid->have_cpu_topology = 0; + cpuid->nproc = 0; + cpuid->npackages = 0; + cpuid->ncores_per_package = 0; + cpuid->nhwthreads_per_core = 0; + cpuid->package_id = NULL; + cpuid->core_id = NULL; + cpuid->hwthread_id = NULL; + cpuid->locality_order = NULL; + + cpuid->vendor = cpuid_check_vendor(); + + switch (cpuid->vendor) + { +#ifdef GMX_CPUID_X86 + case GMX_CPUID_VENDOR_INTEL: + cpuid_check_intel_x86(cpuid); + break; + case GMX_CPUID_VENDOR_AMD: + cpuid_check_amd_x86(cpuid); + break; +#endif + default: + /* Default value */ + strncpy(cpuid->brand,"Unknown CPU brand",GMX_CPUID_BRAND_MAXLEN); +#if defined(__linux__) || defined(__linux) + /* General Linux. Try to get CPU type from /proc/cpuinfo */ + if( (fp = fopen("/proc/cpuinfo","r")) != NULL) + { + found_brand = 0; + while( (found_brand==0) && (fgets(buffer,sizeof(buffer),fp) !=NULL)) + { + chomp_substring_before_colon(buffer,buffer2,sizeof(buffer2)); + /* Intel uses "model name", Fujitsu and IBM "cpu". 
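+                 * An illustrative /proc/cpuinfo line that this matches:
+                 *     model name      : Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz
+                 * The text after the colon is copied into cpuid->brand.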
*/ + if( !strcmp(buffer2,"model name") || !strcmp(buffer2,"cpu")) + { + chomp_substring_after_colon(buffer,cpuid->brand,GMX_CPUID_BRAND_MAXLEN); + found_brand = 1; + } + } + } + fclose(fp); +#endif + cpuid->family = 0; + cpuid->model = 0; + cpuid->stepping = 0; + + for(i=0; ifeature[i]=0; + } + cpuid->feature[GMX_CPUID_FEATURE_CANNOTDETECT] = 1; + break; + } + return 0; +} + + + +void +gmx_cpuid_done (gmx_cpuid_t cpuid) +{ + free(cpuid); +} + + +int +gmx_cpuid_formatstring (gmx_cpuid_t cpuid, + char * str, + int n) +{ + int c; + int i; + enum gmx_cpuid_feature feature; + +#ifdef _MSC_VER + _snprintf(str, n, + "Vendor: %s\n" + "Brand: %s\n" + "Family: %2d Model: %2d Stepping: %2d\n" + "Features:", + gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)], + gmx_cpuid_brand(cpuid), + gmx_cpuid_family(cpuid), gmx_cpuid_model(cpuid), gmx_cpuid_stepping(cpuid)); +#else + snprintf(str, n, + "Vendor: %s\n" + "Brand: %s\n" + "Family: %2d Model: %2d Stepping: %2d\n" + "Features:", + gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)], + gmx_cpuid_brand(cpuid), + gmx_cpuid_family(cpuid), gmx_cpuid_model(cpuid), gmx_cpuid_stepping(cpuid)); +#endif + + str[n-1] = '\0'; + c = strlen(str); + n -= c; + str += c; + + for (feature = GMX_CPUID_FEATURE_CANNOTDETECT; feature < GMX_CPUID_NFEATURES; feature++) + { + if (gmx_cpuid_feature(cpuid, feature) == 1) + { +#ifdef _MSC_VER + _snprintf(str, n, " %s", gmx_cpuid_feature_string[feature]); +#else + snprintf(str, n, " %s", gmx_cpuid_feature_string[feature]); +#endif + str[n-1] = '\0'; + c = strlen(str); + n -= c; + str += c; + } + } +#ifdef _MSC_VER + _snprintf(str, n, "\n"); +#else + snprintf(str, n, "\n"); +#endif + str[n-1] = '\0'; + + return 0; +} + + + +enum gmx_cpuid_acceleration +gmx_cpuid_acceleration_suggest (gmx_cpuid_t cpuid) +{ + enum gmx_cpuid_acceleration tmpacc; + + tmpacc = GMX_CPUID_ACCELERATION_NONE; + + if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_INTEL) + { + if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX)) + { + tmpacc = GMX_CPUID_ACCELERATION_X86_AVX_256; + } + else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1)) + { + tmpacc = GMX_CPUID_ACCELERATION_X86_SSE4_1; + } + else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE2)) + { + tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2; + } + } + else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_AMD) + { + if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX)) + { + tmpacc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA; + } + else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1)) + { + tmpacc = GMX_CPUID_ACCELERATION_X86_SSE4_1; + } + else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE2)) + { + tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2; + } + } + else if(gmx_cpuid_vendor(cpuid)==GMX_CPUID_VENDOR_FUJITSU) + { + if(strstr(gmx_cpuid_brand(cpuid),"SPARC64")) + { + tmpacc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE; + } + } + return tmpacc; +} + + + +int +gmx_cpuid_acceleration_check(gmx_cpuid_t cpuid, + FILE * log) +{ + int rc; + char str[1024]; + enum gmx_cpuid_acceleration acc; + + acc = gmx_cpuid_acceleration_suggest(cpuid); + + rc = (acc != compiled_acc); + + gmx_cpuid_formatstring(cpuid, str, 1023); + str[1023] = '\0'; + + if (log != NULL) + { + fprintf(log, + "\nDetecting CPU-specific acceleration.\nPresent hardware specification:\n" + "%s" + "Acceleration most likely to fit this hardware: %s\n" + "Acceleration selected at GROMACS compile time: %s\n\n", + str, + gmx_cpuid_acceleration_string[acc], + gmx_cpuid_acceleration_string[compiled_acc]); + } + + if (rc != 0) + { + 
if (log != NULL) + { + fprintf(log, "\nBinary not matching hardware - you might be losing performance.\n" + "Acceleration most likely to fit this hardware: %s\n" + "Acceleration selected at GROMACS compile time: %s\n\n", + gmx_cpuid_acceleration_string[acc], + gmx_cpuid_acceleration_string[compiled_acc]); + } + printf("Compiled acceleration: %s (Gromacs could use %s on this machine, which is better)\n", + gmx_cpuid_acceleration_string[compiled_acc], + gmx_cpuid_acceleration_string[acc]); + } + return rc; +} + + +#ifdef GMX_CPUID_STANDALONE +/* Stand-alone program to enable queries of CPU features from Cmake. + * Note that you need to check inline ASM capabilities before compiling and set + * -DGMX_X86_GCC_INLINE_ASM for the cpuid instruction to work... + */ +int +main(int argc, char **argv) +{ + gmx_cpuid_t cpuid; + enum gmx_cpuid_acceleration acc; + int i, cnt; + + if (argc < 2) + { + fprintf(stdout, + "Usage:\n\n%s [flags]\n\n" + "Available flags:\n" + "-vendor Print CPU vendor.\n" + "-brand Print CPU brand string.\n" + "-family Print CPU family version.\n" + "-model Print CPU model version.\n" + "-stepping Print CPU stepping version.\n" + "-features Print CPU feature flags.\n" + "-acceleration Print suggested GROMACS acceleration.\n", + argv[0]); + exit(0); + } + + gmx_cpuid_init(&cpuid); + + if (!strncmp(argv[1], "-vendor", 3)) + { + printf("%s\n", gmx_cpuid_vendor_string[cpuid->vendor]); + } + else if (!strncmp(argv[1], "-brand", 3)) + { + printf("%s\n", cpuid->brand); + } + else if (!strncmp(argv[1], "-family", 3)) + { + printf("%d\n", cpuid->family); + } + else if (!strncmp(argv[1], "-model", 3)) + { + printf("%d\n", cpuid->model); + } + else if (!strncmp(argv[1], "-stepping", 3)) + { + printf("%d\n", cpuid->stepping); + } + else if (!strncmp(argv[1], "-features", 3)) + { + cnt = 0; + for (i = 0; i < GMX_CPUID_NFEATURES; i++) + { + if (cpuid->feature[i] == 1) + { + if (cnt++ > 0) + { + printf(" "); + } + printf("%s", gmx_cpuid_feature_string[i]); + } + } + printf("\n"); + } + else if (!strncmp(argv[1], "-acceleration", 3)) + { + acc = gmx_cpuid_acceleration_suggest(cpuid); + fprintf(stdout, "%s\n", gmx_cpuid_acceleration_string[acc]); + } + + gmx_cpuid_done(cpuid); + + + return 0; +} + +#endif diff --cc src/gromacs/legacyheaders/gmx_math_x86_sse4_1_double.h index 7aeb7cb841,0000000000..2498f11c4e mode 100644,000000..100644 --- a/src/gromacs/legacyheaders/gmx_math_x86_sse4_1_double.h +++ b/src/gromacs/legacyheaders/gmx_math_x86_sse4_1_double.h @@@ -1,1482 -1,0 +1,1477 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of GROMACS. + * Copyright (c) 2012- + * + * Written by the Gromacs development team under coordination of + * David van der Spoel, Berk Hess, and Erik Lindahl. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. 
Check out http://www.gromacs.org
+ *
+ * And Hey:
+ * Gnomes, ROck Monsters And Chili Sauce
+ */
+#ifndef _gmx_math_x86_sse4_1_double_h_
+#define _gmx_math_x86_sse4_1_double_h_
+
+#include <math.h>
+#include <smmintrin.h>
+
+#include "gmx_x86_sse4_1.h"
+
+
+
+#ifndef M_PI
+# define M_PI 3.14159265358979323846264338327950288
+#endif
+
+/************************
+ *                      *
+ * Simple math routines *
+ *                      *
+ ************************/
+
+/* 1.0/sqrt(x) */
+static gmx_inline __m128d
+gmx_mm_invsqrt_pd(__m128d x)
+{
+    const __m128d half  = _mm_set1_pd(0.5);
+    const __m128d three = _mm_set1_pd(3.0);
+
+    /* Lookup instruction only exists in single precision, convert back and forth... */
+    __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
+
+    /* Perform two Newton-Raphson steps to recover full double precision */
+    lu = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
+    return _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
+}
+
+/* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
+static void
+gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsqrt2)
+{
+    const __m128d half   = _mm_set1_pd(0.5);
+    const __m128d three  = _mm_set1_pd(3.0);
+    const __m128  halff  = _mm_set1_ps(0.5f);
+    const __m128  threef = _mm_set1_ps(3.0f);
+
+    __m128  xf, luf;
+    __m128d lu1, lu2;
+
+    /* Do first N-R step in float for 2x throughput */
+    xf  = _mm_shuffle_ps(_mm_cvtpd_ps(x1), _mm_cvtpd_ps(x2), _MM_SHUFFLE(1, 0, 1, 0));
+    luf = _mm_rsqrt_ps(xf);
+    luf = _mm_mul_ps(halff, _mm_mul_ps(_mm_sub_ps(threef, _mm_mul_ps(_mm_mul_ps(luf, luf), xf)), luf));
+
+    lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf, luf, _MM_SHUFFLE(3, 2, 3, 2)));
+    lu1 = _mm_cvtps_pd(luf);
+
+    *invsqrt1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), x1)), lu1));
+    *invsqrt2 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu2, lu2), x2)), lu2));
+}
+
+/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
+static gmx_inline __m128d
+gmx_mm_sqrt_pd(__m128d x)
+{
+    __m128d mask;
+    __m128d res;
+
+    mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
+    res  = _mm_andnot_pd(mask, gmx_mm_invsqrt_pd(x));
+
+    res  = _mm_mul_pd(x, res);
+
+    return res;
+}
+
+/* 1.0/x */
+static gmx_inline __m128d
+gmx_mm_inv_pd(__m128d x)
+{
+    const __m128d two = _mm_set1_pd(2.0);
+
+    /* Lookup instruction only exists in single precision, convert back and forth... */
+    __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
+
+    /* Perform two N-R steps for double precision */
+    lu = _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
+    return _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
+}
+
+static gmx_inline __m128d
+gmx_mm_abs_pd(__m128d x)
+{
+    const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
+
+    return _mm_and_pd(x, signmask);
+}
+
+
+/*
+ * 2^x function.
+ *
+ * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
+ * according to the same algorithm as used in the Cephes/netlib math routines.
+ */
+static __m128d
+gmx_mm_exp2_pd(__m128d x)
+{
+    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -1022.
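+     * Overall strategy of the routine: split x = intpart + z with |z| <= 0.5,
+     * build 2^intpart by writing intpart+1023 straight into the IEEE-754
+     * exponent field (the _mm_slli_epi64(...,52) below), and evaluate 2^z by
+     * the rational approximation; e.g. 2^3.3 = 2^3 * 2^0.3.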
*/ + const __m128d arglimit = _mm_set1_pd(1022.0); + const __m128i expbase = _mm_set1_epi32(1023); + + const __m128d P2 = _mm_set1_pd(2.30933477057345225087e-2); + const __m128d P1 = _mm_set1_pd(2.02020656693165307700e1); + const __m128d P0 = _mm_set1_pd(1.51390680115615096133e3); + /* Q2 == 1.0 */ + const __m128d Q1 = _mm_set1_pd(2.33184211722314911771e2); + const __m128d Q0 = _mm_set1_pd(4.36821166879210612817e3); + const __m128d one = _mm_set1_pd(1.0); + const __m128d two = _mm_set1_pd(2.0); + + __m128d valuemask; + __m128i iexppart; + __m128d fexppart; + __m128d intpart; + __m128d z, z2; + __m128d PolyP, PolyQ; + + iexppart = _mm_cvtpd_epi32(x); + intpart = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT); + + /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent. + * To be able to shift it into the exponent for a double precision number we first need to + * shuffle so that the lower half contains the first element, and the upper half the second. + * This should really be done as a zero-extension, but since the next instructions will shift + * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out. + * (thus we just use element 2 from iexppart). + */ + iexppart = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0)); + + /* Do the shift operation on the 64-bit registers */ + iexppart = _mm_add_epi32(iexppart, expbase); + iexppart = _mm_slli_epi64(iexppart, 52); + + valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x)); + fexppart = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart)); + + z = _mm_sub_pd(x, intpart); + z2 = _mm_mul_pd(z, z); + + PolyP = _mm_mul_pd(P2, z2); + PolyP = _mm_add_pd(PolyP, P1); + PolyQ = _mm_add_pd(z2, Q1); + PolyP = _mm_mul_pd(PolyP, z2); + PolyQ = _mm_mul_pd(PolyQ, z2); + PolyP = _mm_add_pd(PolyP, P0); + PolyQ = _mm_add_pd(PolyQ, Q0); + PolyP = _mm_mul_pd(PolyP, z); + + z = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP))); + z = _mm_add_pd(one, _mm_mul_pd(two, z)); + + z = _mm_mul_pd(z, fexppart); + + return z; +} + +/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x, + * but there will then be a small rounding error since we lose some precision due to the + * multiplication. This will then be magnified a lot by the exponential. + * + * Instead, we calculate the fractional part directly as a Padé approximation of + * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction + * remaining after 2^y, which avoids the precision-loss. + */ +static __m128d +gmx_mm_exp_pd(__m128d exparg) +{ + const __m128d argscale = _mm_set1_pd(1.4426950408889634073599); + /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. 
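+     * (For double precision the limiting unbiased exponent is in fact -1022,
+     *  which is why arglimit is 1022.0 below. argscale is log2(e), so
+     *  exp(x) = 2^(x*argscale); invargscale0 + invargscale1 together equal
+     *  ln(2) split into high and low parts, so that z = x - intpart*ln(2)
+     *  can be formed without cancellation error.)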
*/ + const __m128d arglimit = _mm_set1_pd(1022.0); + const __m128i expbase = _mm_set1_epi32(1023); + + const __m128d invargscale0 = _mm_set1_pd(6.93145751953125e-1); + const __m128d invargscale1 = _mm_set1_pd(1.42860682030941723212e-6); + + const __m128d P2 = _mm_set1_pd(1.26177193074810590878e-4); + const __m128d P1 = _mm_set1_pd(3.02994407707441961300e-2); + /* P0 == 1.0 */ + const __m128d Q3 = _mm_set1_pd(3.00198505138664455042E-6); + const __m128d Q2 = _mm_set1_pd(2.52448340349684104192E-3); + const __m128d Q1 = _mm_set1_pd(2.27265548208155028766E-1); + /* Q0 == 2.0 */ + const __m128d one = _mm_set1_pd(1.0); + const __m128d two = _mm_set1_pd(2.0); + + __m128d valuemask; + __m128i iexppart; + __m128d fexppart; + __m128d intpart; + __m128d x, z, z2; + __m128d PolyP, PolyQ; + + x = _mm_mul_pd(exparg, argscale); + + iexppart = _mm_cvtpd_epi32(x); + intpart = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT); + + /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent. + * To be able to shift it into the exponent for a double precision number we first need to + * shuffle so that the lower half contains the first element, and the upper half the second. + * This should really be done as a zero-extension, but since the next instructions will shift + * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out. + * (thus we just use element 2 from iexppart). + */ + iexppart = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0)); + + /* Do the shift operation on the 64-bit registers */ + iexppart = _mm_add_epi32(iexppart, expbase); + iexppart = _mm_slli_epi64(iexppart, 52); + + valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x)); + fexppart = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart)); + + z = _mm_sub_pd(exparg, _mm_mul_pd(invargscale0, intpart)); + z = _mm_sub_pd(z, _mm_mul_pd(invargscale1, intpart)); + + z2 = _mm_mul_pd(z, z); + + PolyQ = _mm_mul_pd(Q3, z2); + PolyQ = _mm_add_pd(PolyQ, Q2); + PolyP = _mm_mul_pd(P2, z2); + PolyQ = _mm_mul_pd(PolyQ, z2); + PolyP = _mm_add_pd(PolyP, P1); + PolyQ = _mm_add_pd(PolyQ, Q1); + PolyP = _mm_mul_pd(PolyP, z2); + PolyQ = _mm_mul_pd(PolyQ, z2); + PolyP = _mm_add_pd(PolyP, one); + PolyQ = _mm_add_pd(PolyQ, two); + + PolyP = _mm_mul_pd(PolyP, z); + + z = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP))); + z = _mm_add_pd(one, _mm_mul_pd(two, z)); + + z = _mm_mul_pd(z, fexppart); + + return z; +} + + + +static __m128d +gmx_mm_log_pd(__m128d x) +{ + /* Same algorithm as cephes library */ + const __m128d expmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) ); + + const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */ + + const __m128d half = _mm_set1_pd(0.5); + const __m128d one = _mm_set1_pd(1.0); + const __m128d two = _mm_set1_pd(2.0); + const __m128d invsq2 = _mm_set1_pd(1.0/sqrt(2.0)); + + const __m128d corr1 = _mm_set1_pd(-2.121944400546905827679e-4); + const __m128d corr2 = _mm_set1_pd(0.693359375); + + const __m128d P5 = _mm_set1_pd(1.01875663804580931796e-4); + const __m128d P4 = _mm_set1_pd(4.97494994976747001425e-1); + const __m128d P3 = _mm_set1_pd(4.70579119878881725854e0); + const __m128d P2 = _mm_set1_pd(1.44989225341610930846e1); + const __m128d P1 = _mm_set1_pd(1.79368678507819816313e1); + const __m128d P0 = _mm_set1_pd(7.70838733755885391666e0); + + const __m128d Q4 = _mm_set1_pd(1.12873587189167450590e1); + const __m128d Q3 = _mm_set1_pd(4.52279145837532221105e1); + const __m128d Q2 = 
_mm_set1_pd(8.29875266912776603211e1); + const __m128d Q1 = _mm_set1_pd(7.11544750618563894466e1); + const __m128d Q0 = _mm_set1_pd(2.31251620126765340583e1); + + const __m128d R2 = _mm_set1_pd(-7.89580278884799154124e-1); + const __m128d R1 = _mm_set1_pd(1.63866645699558079767e1); + const __m128d R0 = _mm_set1_pd(-6.41409952958715622951e1); + + const __m128d S2 = _mm_set1_pd(-3.56722798256324312549E1); + const __m128d S1 = _mm_set1_pd(3.12093766372244180303E2); + const __m128d S0 = _mm_set1_pd(-7.69691943550460008604E2); + + __m128d fexp; + __m128i iexp; + + __m128d mask1, mask2; + __m128d corr, t1, t2, q; + __m128d zA, yA, xA, zB, yB, xB, z; + __m128d polyR, polyS; + __m128d polyP1, polyP2, polyQ1, polyQ2; + + /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */ + fexp = _mm_and_pd(x, expmask); + iexp = gmx_mm_castpd_si128(fexp); + iexp = _mm_srli_epi64(iexp, 52); + iexp = _mm_sub_epi32(iexp, expbase_m1); + iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1, 1, 2, 0) ); + fexp = _mm_cvtepi32_pd(iexp); + + x = _mm_andnot_pd(expmask, x); + x = _mm_or_pd(x, one); + x = _mm_mul_pd(x, half); + + mask1 = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp), two); + mask2 = _mm_cmplt_pd(x, invsq2); + + fexp = _mm_sub_pd(fexp, _mm_and_pd(mask2, one)); + + /* If mask1 is set ('A') */ + zA = _mm_sub_pd(x, half); + t1 = _mm_blendv_pd( zA, x, mask2 ); + zA = _mm_sub_pd(t1, half); + t2 = _mm_blendv_pd( x, zA, mask2 ); + yA = _mm_mul_pd(half, _mm_add_pd(t2, one)); + + xA = _mm_mul_pd(zA, gmx_mm_inv_pd(yA)); + zA = _mm_mul_pd(xA, xA); + + /* EVALUATE POLY */ + polyR = _mm_mul_pd(R2, zA); + polyR = _mm_add_pd(polyR, R1); + polyR = _mm_mul_pd(polyR, zA); + polyR = _mm_add_pd(polyR, R0); + + polyS = _mm_add_pd(zA, S2); + polyS = _mm_mul_pd(polyS, zA); + polyS = _mm_add_pd(polyS, S1); + polyS = _mm_mul_pd(polyS, zA); + polyS = _mm_add_pd(polyS, S0); + + q = _mm_mul_pd(polyR, gmx_mm_inv_pd(polyS)); + zA = _mm_mul_pd(_mm_mul_pd(xA, zA), q); + + zA = _mm_add_pd(zA, _mm_mul_pd(corr1, fexp)); + zA = _mm_add_pd(zA, xA); + zA = _mm_add_pd(zA, _mm_mul_pd(corr2, fexp)); + + /* If mask1 is not set ('B') */ + corr = _mm_and_pd(mask2, x); + xB = _mm_add_pd(x, corr); + xB = _mm_sub_pd(xB, one); + zB = _mm_mul_pd(xB, xB); + + polyP1 = _mm_mul_pd(P5, zB); + polyP2 = _mm_mul_pd(P4, zB); + polyP1 = _mm_add_pd(polyP1, P3); + polyP2 = _mm_add_pd(polyP2, P2); + polyP1 = _mm_mul_pd(polyP1, zB); + polyP2 = _mm_mul_pd(polyP2, zB); + polyP1 = _mm_add_pd(polyP1, P1); + polyP2 = _mm_add_pd(polyP2, P0); + polyP1 = _mm_mul_pd(polyP1, xB); + polyP1 = _mm_add_pd(polyP1, polyP2); + + polyQ2 = _mm_mul_pd(Q4, zB); + polyQ1 = _mm_add_pd(zB, Q3); + polyQ2 = _mm_add_pd(polyQ2, Q2); + polyQ1 = _mm_mul_pd(polyQ1, zB); + polyQ2 = _mm_mul_pd(polyQ2, zB); + polyQ1 = _mm_add_pd(polyQ1, Q1); + polyQ2 = _mm_add_pd(polyQ2, Q0); + polyQ1 = _mm_mul_pd(polyQ1, xB); + polyQ1 = _mm_add_pd(polyQ1, polyQ2); + + fexp = _mm_and_pd(fexp, _mm_cmpneq_pd(fexp, _mm_setzero_pd())); + + q = _mm_mul_pd(polyP1, gmx_mm_inv_pd(polyQ1)); + yB = _mm_mul_pd(_mm_mul_pd(xB, zB), q); + + yB = _mm_add_pd(yB, _mm_mul_pd(corr1, fexp)); + yB = _mm_sub_pd(yB, _mm_mul_pd(half, zB)); + zB = _mm_add_pd(xB, yB); + zB = _mm_add_pd(zB, _mm_mul_pd(corr2, fexp)); + + z = _mm_blendv_pd( zB, zA, mask1 ); + + return z; +} + + +static __m128d +gmx_mm_erf_pd(__m128d x) +{ + /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */ + const __m128d CAP4 = _mm_set1_pd(-0.431780540597889301512e-4); + const __m128d CAP3 = 
_mm_set1_pd(-0.00578562306260059236059); + const __m128d CAP2 = _mm_set1_pd(-0.028593586920219752446); + const __m128d CAP1 = _mm_set1_pd(-0.315924962948621698209); + const __m128d CAP0 = _mm_set1_pd(0.14952975608477029151); + + const __m128d CAQ5 = _mm_set1_pd(-0.374089300177174709737e-5); + const __m128d CAQ4 = _mm_set1_pd(0.00015126584532155383535); + const __m128d CAQ3 = _mm_set1_pd(0.00536692680669480725423); + const __m128d CAQ2 = _mm_set1_pd(0.0668686825594046122636); + const __m128d CAQ1 = _mm_set1_pd(0.402604990869284362773); + /* CAQ0 == 1.0 */ + const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875); + + /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */ + const __m128d CBP6 = _mm_set1_pd(2.49650423685462752497647637088e-10); + const __m128d CBP5 = _mm_set1_pd(0.00119770193298159629350136085658); + const __m128d CBP4 = _mm_set1_pd(0.0164944422378370965881008942733); + const __m128d CBP3 = _mm_set1_pd(0.0984581468691775932063932439252); + const __m128d CBP2 = _mm_set1_pd(0.317364595806937763843589437418); + const __m128d CBP1 = _mm_set1_pd(0.554167062641455850932670067075); + const __m128d CBP0 = _mm_set1_pd(0.427583576155807163756925301060); + const __m128d CBQ7 = _mm_set1_pd(0.00212288829699830145976198384930); + const __m128d CBQ6 = _mm_set1_pd(0.0334810979522685300554606393425); + const __m128d CBQ5 = _mm_set1_pd(0.2361713785181450957579508850717); + const __m128d CBQ4 = _mm_set1_pd(0.955364736493055670530981883072); + const __m128d CBQ3 = _mm_set1_pd(2.36815675631420037315349279199); + const __m128d CBQ2 = _mm_set1_pd(3.55261649184083035537184223542); + const __m128d CBQ1 = _mm_set1_pd(2.93501136050160872574376997993); + /* CBQ0 == 1.0 */ + + /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */ + const __m128d CCP6 = _mm_set1_pd(-2.8175401114513378771); + const __m128d CCP5 = _mm_set1_pd(-3.22729451764143718517); + const __m128d CCP4 = _mm_set1_pd(-2.5518551727311523996); + const __m128d CCP3 = _mm_set1_pd(-0.687717681153649930619); + const __m128d CCP2 = _mm_set1_pd(-0.212652252872804219852); + const __m128d CCP1 = _mm_set1_pd(0.0175389834052493308818); + const __m128d CCP0 = _mm_set1_pd(0.00628057170626964891937); + + const __m128d CCQ6 = _mm_set1_pd(5.48409182238641741584); + const __m128d CCQ5 = _mm_set1_pd(13.5064170191802889145); + const __m128d CCQ4 = _mm_set1_pd(22.9367376522880577224); + const __m128d CCQ3 = _mm_set1_pd(15.930646027911794143); + const __m128d CCQ2 = _mm_set1_pd(11.0567237927800161565); + const __m128d CCQ1 = _mm_set1_pd(2.79257750980575282228); + /* CCQ0 == 1.0 */ + const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125); + + const __m128d one = _mm_set1_pd(1.0); + const __m128d two = _mm_set1_pd(2.0); + + const __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) ); + + __m128d xabs, x2, x4, t, t2, w, w2; + __m128d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1; + __m128d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1; + __m128d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1; + __m128d res_erf, res_erfcB, res_erfcC, res_erfc, res; + __m128d mask, expmx2; + + /* Calculate erf() */ + xabs = gmx_mm_abs_pd(x); + x2 = _mm_mul_pd(x, x); + x4 = _mm_mul_pd(x2, x2); + + PolyAP0 = _mm_mul_pd(CAP4, x4); + PolyAP1 = _mm_mul_pd(CAP3, x4); + PolyAP0 = _mm_add_pd(PolyAP0, CAP2); + PolyAP1 = _mm_add_pd(PolyAP1, CAP1); + PolyAP0 = _mm_mul_pd(PolyAP0, x4); + PolyAP1 = _mm_mul_pd(PolyAP1, x2); + PolyAP0 = _mm_add_pd(PolyAP0, CAP0); + PolyAP0 = 
_mm_add_pd(PolyAP0, PolyAP1); + + PolyAQ1 = _mm_mul_pd(CAQ5, x4); + PolyAQ0 = _mm_mul_pd(CAQ4, x4); + PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ3); + PolyAQ0 = _mm_add_pd(PolyAQ0, CAQ2); + PolyAQ1 = _mm_mul_pd(PolyAQ1, x4); + PolyAQ0 = _mm_mul_pd(PolyAQ0, x4); + PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ1); + PolyAQ0 = _mm_add_pd(PolyAQ0, one); + PolyAQ1 = _mm_mul_pd(PolyAQ1, x2); + PolyAQ0 = _mm_add_pd(PolyAQ0, PolyAQ1); + + res_erf = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0)); + res_erf = _mm_add_pd(CAoffset, res_erf); + res_erf = _mm_mul_pd(x, res_erf); + + /* Calculate erfc() in range [1,4.5] */ + t = _mm_sub_pd(xabs, one); + t2 = _mm_mul_pd(t, t); + + PolyBP0 = _mm_mul_pd(CBP6, t2); + PolyBP1 = _mm_mul_pd(CBP5, t2); + PolyBP0 = _mm_add_pd(PolyBP0, CBP4); + PolyBP1 = _mm_add_pd(PolyBP1, CBP3); + PolyBP0 = _mm_mul_pd(PolyBP0, t2); + PolyBP1 = _mm_mul_pd(PolyBP1, t2); + PolyBP0 = _mm_add_pd(PolyBP0, CBP2); + PolyBP1 = _mm_add_pd(PolyBP1, CBP1); + PolyBP0 = _mm_mul_pd(PolyBP0, t2); + PolyBP1 = _mm_mul_pd(PolyBP1, t); + PolyBP0 = _mm_add_pd(PolyBP0, CBP0); + PolyBP0 = _mm_add_pd(PolyBP0, PolyBP1); + + PolyBQ1 = _mm_mul_pd(CBQ7, t2); + PolyBQ0 = _mm_mul_pd(CBQ6, t2); + PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5); + PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4); + PolyBQ1 = _mm_mul_pd(PolyBQ1, t2); + PolyBQ0 = _mm_mul_pd(PolyBQ0, t2); + PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3); + PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2); + PolyBQ1 = _mm_mul_pd(PolyBQ1, t2); + PolyBQ0 = _mm_mul_pd(PolyBQ0, t2); + PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1); + PolyBQ0 = _mm_add_pd(PolyBQ0, one); + PolyBQ1 = _mm_mul_pd(PolyBQ1, t); + PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1); + + res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0)); + + res_erfcB = _mm_mul_pd(res_erfcB, xabs); + + /* Calculate erfc() in range [4.5,inf] */ + w = gmx_mm_inv_pd(xabs); + w2 = _mm_mul_pd(w, w); + + PolyCP0 = _mm_mul_pd(CCP6, w2); + PolyCP1 = _mm_mul_pd(CCP5, w2); + PolyCP0 = _mm_add_pd(PolyCP0, CCP4); + PolyCP1 = _mm_add_pd(PolyCP1, CCP3); + PolyCP0 = _mm_mul_pd(PolyCP0, w2); + PolyCP1 = _mm_mul_pd(PolyCP1, w2); + PolyCP0 = _mm_add_pd(PolyCP0, CCP2); + PolyCP1 = _mm_add_pd(PolyCP1, CCP1); + PolyCP0 = _mm_mul_pd(PolyCP0, w2); + PolyCP1 = _mm_mul_pd(PolyCP1, w); + PolyCP0 = _mm_add_pd(PolyCP0, CCP0); + PolyCP0 = _mm_add_pd(PolyCP0, PolyCP1); + + PolyCQ0 = _mm_mul_pd(CCQ6, w2); + PolyCQ1 = _mm_mul_pd(CCQ5, w2); + PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ4); + PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ3); + PolyCQ0 = _mm_mul_pd(PolyCQ0, w2); + PolyCQ1 = _mm_mul_pd(PolyCQ1, w2); + PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ2); + PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ1); + PolyCQ0 = _mm_mul_pd(PolyCQ0, w2); + PolyCQ1 = _mm_mul_pd(PolyCQ1, w); + PolyCQ0 = _mm_add_pd(PolyCQ0, one); + PolyCQ0 = _mm_add_pd(PolyCQ0, PolyCQ1); + + expmx2 = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) ); + + res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0)); + res_erfcC = _mm_add_pd(res_erfcC, CCoffset); + res_erfcC = _mm_mul_pd(res_erfcC, w); + + mask = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5)); + res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask); + + res_erfc = _mm_mul_pd(res_erfc, expmx2); + + /* erfc(x<0) = 2-erfc(|x|) */ + mask = _mm_cmplt_pd(x, _mm_setzero_pd()); + res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask); + + /* Select erf() or erfc() */ + mask = _mm_cmplt_pd(xabs, one); + res = _mm_blendv_pd(_mm_sub_pd(one, res_erfc), res_erf, mask); + + return res; +} + + +static __m128d +gmx_mm_erfc_pd(__m128d x) +{ + /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */ + 
const __m128d CAP4 = _mm_set1_pd(-0.431780540597889301512e-4); + const __m128d CAP3 = _mm_set1_pd(-0.00578562306260059236059); + const __m128d CAP2 = _mm_set1_pd(-0.028593586920219752446); + const __m128d CAP1 = _mm_set1_pd(-0.315924962948621698209); + const __m128d CAP0 = _mm_set1_pd(0.14952975608477029151); + + const __m128d CAQ5 = _mm_set1_pd(-0.374089300177174709737e-5); + const __m128d CAQ4 = _mm_set1_pd(0.00015126584532155383535); + const __m128d CAQ3 = _mm_set1_pd(0.00536692680669480725423); + const __m128d CAQ2 = _mm_set1_pd(0.0668686825594046122636); + const __m128d CAQ1 = _mm_set1_pd(0.402604990869284362773); + /* CAQ0 == 1.0 */ + const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875); + + /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */ + const __m128d CBP6 = _mm_set1_pd(2.49650423685462752497647637088e-10); + const __m128d CBP5 = _mm_set1_pd(0.00119770193298159629350136085658); + const __m128d CBP4 = _mm_set1_pd(0.0164944422378370965881008942733); + const __m128d CBP3 = _mm_set1_pd(0.0984581468691775932063932439252); + const __m128d CBP2 = _mm_set1_pd(0.317364595806937763843589437418); + const __m128d CBP1 = _mm_set1_pd(0.554167062641455850932670067075); + const __m128d CBP0 = _mm_set1_pd(0.427583576155807163756925301060); + const __m128d CBQ7 = _mm_set1_pd(0.00212288829699830145976198384930); + const __m128d CBQ6 = _mm_set1_pd(0.0334810979522685300554606393425); + const __m128d CBQ5 = _mm_set1_pd(0.2361713785181450957579508850717); + const __m128d CBQ4 = _mm_set1_pd(0.955364736493055670530981883072); + const __m128d CBQ3 = _mm_set1_pd(2.36815675631420037315349279199); + const __m128d CBQ2 = _mm_set1_pd(3.55261649184083035537184223542); + const __m128d CBQ1 = _mm_set1_pd(2.93501136050160872574376997993); + /* CBQ0 == 1.0 */ + + /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */ + const __m128d CCP6 = _mm_set1_pd(-2.8175401114513378771); + const __m128d CCP5 = _mm_set1_pd(-3.22729451764143718517); + const __m128d CCP4 = _mm_set1_pd(-2.5518551727311523996); + const __m128d CCP3 = _mm_set1_pd(-0.687717681153649930619); + const __m128d CCP2 = _mm_set1_pd(-0.212652252872804219852); + const __m128d CCP1 = _mm_set1_pd(0.0175389834052493308818); + const __m128d CCP0 = _mm_set1_pd(0.00628057170626964891937); + + const __m128d CCQ6 = _mm_set1_pd(5.48409182238641741584); + const __m128d CCQ5 = _mm_set1_pd(13.5064170191802889145); + const __m128d CCQ4 = _mm_set1_pd(22.9367376522880577224); + const __m128d CCQ3 = _mm_set1_pd(15.930646027911794143); + const __m128d CCQ2 = _mm_set1_pd(11.0567237927800161565); + const __m128d CCQ1 = _mm_set1_pd(2.79257750980575282228); + /* CCQ0 == 1.0 */ + const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125); + + const __m128d one = _mm_set1_pd(1.0); + const __m128d two = _mm_set1_pd(2.0); + + const __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) ); + + __m128d xabs, x2, x4, t, t2, w, w2; + __m128d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1; + __m128d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1; + __m128d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1; + __m128d res_erf, res_erfcB, res_erfcC, res_erfc, res; + __m128d mask, expmx2; + + /* Calculate erf() */ + xabs = gmx_mm_abs_pd(x); + x2 = _mm_mul_pd(x, x); + x4 = _mm_mul_pd(x2, x2); + + PolyAP0 = _mm_mul_pd(CAP4, x4); + PolyAP1 = _mm_mul_pd(CAP3, x4); + PolyAP0 = _mm_add_pd(PolyAP0, CAP2); + PolyAP1 = _mm_add_pd(PolyAP1, CAP1); + PolyAP0 = _mm_mul_pd(PolyAP0, x4); + 
PolyAP1 = _mm_mul_pd(PolyAP1, x2); + PolyAP0 = _mm_add_pd(PolyAP0, CAP0); + PolyAP0 = _mm_add_pd(PolyAP0, PolyAP1); + + PolyAQ1 = _mm_mul_pd(CAQ5, x4); + PolyAQ0 = _mm_mul_pd(CAQ4, x4); + PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ3); + PolyAQ0 = _mm_add_pd(PolyAQ0, CAQ2); + PolyAQ1 = _mm_mul_pd(PolyAQ1, x4); + PolyAQ0 = _mm_mul_pd(PolyAQ0, x4); + PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ1); + PolyAQ0 = _mm_add_pd(PolyAQ0, one); + PolyAQ1 = _mm_mul_pd(PolyAQ1, x2); + PolyAQ0 = _mm_add_pd(PolyAQ0, PolyAQ1); + + res_erf = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0)); + res_erf = _mm_add_pd(CAoffset, res_erf); + res_erf = _mm_mul_pd(x, res_erf); + + /* Calculate erfc() in range [1,4.5] */ + t = _mm_sub_pd(xabs, one); + t2 = _mm_mul_pd(t, t); + + PolyBP0 = _mm_mul_pd(CBP6, t2); + PolyBP1 = _mm_mul_pd(CBP5, t2); + PolyBP0 = _mm_add_pd(PolyBP0, CBP4); + PolyBP1 = _mm_add_pd(PolyBP1, CBP3); + PolyBP0 = _mm_mul_pd(PolyBP0, t2); + PolyBP1 = _mm_mul_pd(PolyBP1, t2); + PolyBP0 = _mm_add_pd(PolyBP0, CBP2); + PolyBP1 = _mm_add_pd(PolyBP1, CBP1); + PolyBP0 = _mm_mul_pd(PolyBP0, t2); + PolyBP1 = _mm_mul_pd(PolyBP1, t); + PolyBP0 = _mm_add_pd(PolyBP0, CBP0); + PolyBP0 = _mm_add_pd(PolyBP0, PolyBP1); + + PolyBQ1 = _mm_mul_pd(CBQ7, t2); + PolyBQ0 = _mm_mul_pd(CBQ6, t2); + PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5); + PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4); + PolyBQ1 = _mm_mul_pd(PolyBQ1, t2); + PolyBQ0 = _mm_mul_pd(PolyBQ0, t2); + PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3); + PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2); + PolyBQ1 = _mm_mul_pd(PolyBQ1, t2); + PolyBQ0 = _mm_mul_pd(PolyBQ0, t2); + PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1); + PolyBQ0 = _mm_add_pd(PolyBQ0, one); + PolyBQ1 = _mm_mul_pd(PolyBQ1, t); + PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1); + + res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0)); + + res_erfcB = _mm_mul_pd(res_erfcB, xabs); + + /* Calculate erfc() in range [4.5,inf] */ + w = gmx_mm_inv_pd(xabs); + w2 = _mm_mul_pd(w, w); + + PolyCP0 = _mm_mul_pd(CCP6, w2); + PolyCP1 = _mm_mul_pd(CCP5, w2); + PolyCP0 = _mm_add_pd(PolyCP0, CCP4); + PolyCP1 = _mm_add_pd(PolyCP1, CCP3); + PolyCP0 = _mm_mul_pd(PolyCP0, w2); + PolyCP1 = _mm_mul_pd(PolyCP1, w2); + PolyCP0 = _mm_add_pd(PolyCP0, CCP2); + PolyCP1 = _mm_add_pd(PolyCP1, CCP1); + PolyCP0 = _mm_mul_pd(PolyCP0, w2); + PolyCP1 = _mm_mul_pd(PolyCP1, w); + PolyCP0 = _mm_add_pd(PolyCP0, CCP0); + PolyCP0 = _mm_add_pd(PolyCP0, PolyCP1); + + PolyCQ0 = _mm_mul_pd(CCQ6, w2); + PolyCQ1 = _mm_mul_pd(CCQ5, w2); + PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ4); + PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ3); + PolyCQ0 = _mm_mul_pd(PolyCQ0, w2); + PolyCQ1 = _mm_mul_pd(PolyCQ1, w2); + PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ2); + PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ1); + PolyCQ0 = _mm_mul_pd(PolyCQ0, w2); + PolyCQ1 = _mm_mul_pd(PolyCQ1, w); + PolyCQ0 = _mm_add_pd(PolyCQ0, one); + PolyCQ0 = _mm_add_pd(PolyCQ0, PolyCQ1); + + expmx2 = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) ); + + res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0)); + res_erfcC = _mm_add_pd(res_erfcC, CCoffset); + res_erfcC = _mm_mul_pd(res_erfcC, w); + + mask = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5)); + res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask); + + res_erfc = _mm_mul_pd(res_erfc, expmx2); + + /* erfc(x<0) = 2-erfc(|x|) */ + mask = _mm_cmplt_pd(x, _mm_setzero_pd()); + res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask); + + /* Select erf() or erfc() */ + mask = _mm_cmplt_pd(xabs, one); + res = _mm_blendv_pd(res_erfc, _mm_sub_pd(one, res_erf), mask); + + return res; +} + + +/* Calculate the force correction due to PME analytically. 
+ *
+ * This routine is meant to enable analytical evaluation of the
+ * direct-space PME electrostatic force to avoid tables.
+ *
+ * The direct-space potential should be Erfc(beta*r)/r, but there
+ * are some problems evaluating that:
+ *
+ * First, the error function is difficult (read: expensive) to
+ * approximate accurately for intermediate to large arguments, and
+ * this happens already in ranges of beta*r that occur in simulations.
+ * Second, we now try to avoid calculating potentials in GROMACS and
+ * use forces directly.
+ *
+ * We can simplify things slightly by noting that the PME part is really
+ * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
+ *
+ * V= 1/r - Erf(beta*r)/r
+ *
+ * The first term we already have from the inverse square root, so
+ * that we can leave out of this routine.
+ *
+ * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
+ * the argument beta*r will be in the range 0.15 to ~4. Use your
+ * favorite plotting program to realize how well-behaved Erf(z)/z is
+ * in this range!
+ *
+ * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
+ * However, it turns out it is more efficient to approximate f(z)/z and
+ * then only use even powers. This is another minor optimization, since
+ * we actually WANT f(z)/z, because it is going to be multiplied by
+ * the vector between the two atoms to get the vectorial force. The
+ * fastest flops are the ones we can avoid calculating!
+ *
+ * So, here's how it should be used:
+ *
+ * 1. Calculate r^2.
+ * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
+ * 3. Evaluate this routine with z^2 as the argument.
+ * 4. The return value is the expression:
+ *
+ *
+ *       2*exp(-z^2)     erf(z)
+ *       ------------ - --------
+ *       sqrt(Pi)*z^2     z^3
+ *
+ * 5. Multiply the entire expression by beta^3. This will get you
+ *
+ *       beta^3*2*exp(-z^2)     beta^3*erf(z)
+ *       ------------------  - ---------------
+ *          sqrt(Pi)*z^2            z^3
+ *
+ *    or, switching back to r (z=r*beta):
+ *
+ *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
+ *       ----------------------- - -----------
+ *            sqrt(Pi)*r^2             r^3
+ *
+ *
+ *    With a bit of math exercise you should be able to confirm that
+ *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
+ *
+ * 6. Add the result to 1/r^3, multiply by the product of the charges,
+ *    and you have your force (divided by r). A final multiplication
+ *    with the vector connecting the two particles and you have your
+ *    vectorial force to add to the particles.
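+ *
+ * As a hedged illustration only (the variable names are hypothetical and
+ * not part of this header), the recipe above could look like this for one
+ * SSE pair interaction, with rsq = r^2, rinv = 1/r from gmx_mm_invsqrt_pd,
+ * qq the charge product, and beta2/beta3 holding beta^2 and beta^3:
+ *
+ *     __m128d z2    = _mm_mul_pd(beta2, rsq);                   // steps 1-2
+ *     __m128d corr  = gmx_mm_pmecorrF_pd(z2);                   // steps 3-4
+ *     __m128d rinv3 = _mm_mul_pd(rinv, _mm_mul_pd(rinv, rinv));
+ *     __m128d fscal = _mm_mul_pd(qq,
+ *                         _mm_add_pd(rinv3,
+ *                                    _mm_mul_pd(beta3, corr))); // steps 5-6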
+ *
+ */
+static __m128d
+gmx_mm_pmecorrF_pd(__m128d z2)
+{
+    const __m128d FN10 = _mm_set1_pd(-8.0072854618360083154e-14);
+    const __m128d FN9  = _mm_set1_pd(1.1859116242260148027e-11);
+    const __m128d FN8  = _mm_set1_pd(-8.1490406329798423616e-10);
+    const __m128d FN7  = _mm_set1_pd(3.4404793543907847655e-8);
+    const __m128d FN6  = _mm_set1_pd(-9.9471420832602741006e-7);
+    const __m128d FN5  = _mm_set1_pd(0.000020740315999115847456);
+    const __m128d FN4  = _mm_set1_pd(-0.00031991745139313364005);
+    const __m128d FN3  = _mm_set1_pd(0.0035074449373659008203);
+    const __m128d FN2  = _mm_set1_pd(-0.031750380176100813405);
+    const __m128d FN1  = _mm_set1_pd(0.13884101728898463426);
+    const __m128d FN0  = _mm_set1_pd(-0.75225277815249618847);
+
+    const __m128d FD5  = _mm_set1_pd(0.000016009278224355026701);
+    const __m128d FD4  = _mm_set1_pd(0.00051055686934806966046);
+    const __m128d FD3  = _mm_set1_pd(0.0081803507497974289008);
+    const __m128d FD2  = _mm_set1_pd(0.077181146026670287235);
+    const __m128d FD1  = _mm_set1_pd(0.41543303143712535988);
+    const __m128d FD0  = _mm_set1_pd(1.0);
+
+    __m128d       z4;
+    __m128d       polyFN0, polyFN1, polyFD0, polyFD1;
+
+    z4        = _mm_mul_pd(z2, z2);
+
+    polyFD1   = _mm_mul_pd(FD5, z4);
+    polyFD0   = _mm_mul_pd(FD4, z4);
+    polyFD1   = _mm_add_pd(polyFD1, FD3);
+    polyFD0   = _mm_add_pd(polyFD0, FD2);
+    polyFD1   = _mm_mul_pd(polyFD1, z4);
+    polyFD0   = _mm_mul_pd(polyFD0, z4);
+    polyFD1   = _mm_add_pd(polyFD1, FD1);
+    polyFD0   = _mm_add_pd(polyFD0, FD0);
+    polyFD1   = _mm_mul_pd(polyFD1, z2);
+    polyFD0   = _mm_add_pd(polyFD0, polyFD1);
+
+    polyFD0   = gmx_mm_inv_pd(polyFD0);
+
+    polyFN0   = _mm_mul_pd(FN10, z4);
+    polyFN1   = _mm_mul_pd(FN9, z4);
+    polyFN0   = _mm_add_pd(polyFN0, FN8);
+    polyFN1   = _mm_add_pd(polyFN1, FN7);
+    polyFN0   = _mm_mul_pd(polyFN0, z4);
+    polyFN1   = _mm_mul_pd(polyFN1, z4);
+    polyFN0   = _mm_add_pd(polyFN0, FN6);
+    polyFN1   = _mm_add_pd(polyFN1, FN5);
+    polyFN0   = _mm_mul_pd(polyFN0, z4);
+    polyFN1   = _mm_mul_pd(polyFN1, z4);
+    polyFN0   = _mm_add_pd(polyFN0, FN4);
+    polyFN1   = _mm_add_pd(polyFN1, FN3);
+    polyFN0   = _mm_mul_pd(polyFN0, z4);
+    polyFN1   = _mm_mul_pd(polyFN1, z4);
+    polyFN0   = _mm_add_pd(polyFN0, FN2);
+    polyFN1   = _mm_add_pd(polyFN1, FN1);
+    polyFN0   = _mm_mul_pd(polyFN0, z4);
+    polyFN1   = _mm_mul_pd(polyFN1, z2);
+    polyFN0   = _mm_add_pd(polyFN0, FN0);
+    polyFN0   = _mm_add_pd(polyFN0, polyFN1);
+
+    return _mm_mul_pd(polyFN0, polyFD0);
+}
+
+
+
+
+/* Calculate the potential correction due to PME analytically.
+ *
+ * See gmx_mm_pmecorrF_pd() above for details about the approximation.
+ *
+ * This routine calculates Erf(z)/z, although you should provide z^2
+ * as the input argument.
+ *
+ * Here's how it should be used:
+ *
+ * 1. Calculate r^2.
+ * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
+ * 3. Evaluate this routine with z^2 as the argument.
+ * 4. The return value is the expression:
+ *
+ *
+ *        erf(z)
+ *       --------
+ *           z
+ *
+ * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
+ *
+ *       erf(r*beta)
+ *       -----------
+ *            r
+ *
+ * 6. Subtract the result from 1/r, multiply by the product of the charges,
+ *    and you have your potential.
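+ *
+ * Again a hedged sketch only, with the same hypothetical variable names as
+ * in the force-correction comment above and beta holding the splitting
+ * parameter itself:
+ *
+ *     __m128d z2    = _mm_mul_pd(beta2, rsq);                   // steps 1-2
+ *     __m128d corrV = gmx_mm_pmecorrV_pd(z2);                   // steps 3-4, erf(z)/z
+ *     __m128d velec = _mm_mul_pd(qq,
+ *                         _mm_sub_pd(rinv,
+ *                                    _mm_mul_pd(beta, corrV))); // steps 5-6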
+ * + */ +static __m128d +gmx_mm_pmecorrV_pd(__m128d z2) +{ + const __m128d VN9 = _mm_set1_pd(-9.3723776169321855475e-13); + const __m128d VN8 = _mm_set1_pd(1.2280156762674215741e-10); + const __m128d VN7 = _mm_set1_pd(-7.3562157912251309487e-9); + const __m128d VN6 = _mm_set1_pd(2.6215886208032517509e-7); + const __m128d VN5 = _mm_set1_pd(-4.9532491651265819499e-6); + const __m128d VN4 = _mm_set1_pd(0.00025907400778966060389); + const __m128d VN3 = _mm_set1_pd(0.0010585044856156469792); + const __m128d VN2 = _mm_set1_pd(0.045247661136833092885); + const __m128d VN1 = _mm_set1_pd(0.11643931522926034421); + const __m128d VN0 = _mm_set1_pd(1.1283791671726767970); + + const __m128d VD5 = _mm_set1_pd(0.000021784709867336150342); + const __m128d VD4 = _mm_set1_pd(0.00064293662010911388448); + const __m128d VD3 = _mm_set1_pd(0.0096311444822588683504); + const __m128d VD2 = _mm_set1_pd(0.085608012351550627051); + const __m128d VD1 = _mm_set1_pd(0.43652499166614811084); + const __m128d VD0 = _mm_set1_pd(1.0); + + __m128d z4; + __m128d polyVN0, polyVN1, polyVD0, polyVD1; + + z4 = _mm_mul_pd(z2, z2); + + polyVD1 = _mm_mul_pd(VD5, z4); + polyVD0 = _mm_mul_pd(VD4, z4); + polyVD1 = _mm_add_pd(polyVD1, VD3); + polyVD0 = _mm_add_pd(polyVD0, VD2); + polyVD1 = _mm_mul_pd(polyVD1, z4); + polyVD0 = _mm_mul_pd(polyVD0, z4); + polyVD1 = _mm_add_pd(polyVD1, VD1); + polyVD0 = _mm_add_pd(polyVD0, VD0); + polyVD1 = _mm_mul_pd(polyVD1, z2); + polyVD0 = _mm_add_pd(polyVD0, polyVD1); + + polyVD0 = gmx_mm_inv_pd(polyVD0); + + polyVN1 = _mm_mul_pd(VN9, z4); + polyVN0 = _mm_mul_pd(VN8, z4); + polyVN1 = _mm_add_pd(polyVN1, VN7); + polyVN0 = _mm_add_pd(polyVN0, VN6); + polyVN1 = _mm_mul_pd(polyVN1, z4); + polyVN0 = _mm_mul_pd(polyVN0, z4); + polyVN1 = _mm_add_pd(polyVN1, VN5); + polyVN0 = _mm_add_pd(polyVN0, VN4); + polyVN1 = _mm_mul_pd(polyVN1, z4); + polyVN0 = _mm_mul_pd(polyVN0, z4); + polyVN1 = _mm_add_pd(polyVN1, VN3); + polyVN0 = _mm_add_pd(polyVN0, VN2); + polyVN1 = _mm_mul_pd(polyVN1, z4); + polyVN0 = _mm_mul_pd(polyVN0, z4); + polyVN1 = _mm_add_pd(polyVN1, VN1); + polyVN0 = _mm_add_pd(polyVN0, VN0); + polyVN1 = _mm_mul_pd(polyVN1, z2); + polyVN0 = _mm_add_pd(polyVN0, polyVN1); + + return _mm_mul_pd(polyVN0, polyVD0); +} + + +static int +gmx_mm_sincos_pd(__m128d x, + __m128d *sinval, + __m128d *cosval) +{ +#ifdef _MSC_VER + __declspec(align(16)) + const double sintable[34] = + { + 1.00000000000000000e+00, 0.00000000000000000e+00, + 9.95184726672196929e-01, 9.80171403295606036e-02, + 9.80785280403230431e-01, 1.95090322016128248e-01, + 9.56940335732208824e-01, 2.90284677254462331e-01, + 9.23879532511286738e-01, 3.82683432365089782e-01, + 8.81921264348355050e-01, 4.71396736825997642e-01, + 8.31469612302545236e-01, 5.55570233019602178e-01, + 7.73010453362736993e-01, 6.34393284163645488e-01, + 7.07106781186547573e-01, 7.07106781186547462e-01, + 6.34393284163645599e-01, 7.73010453362736882e-01, + 5.55570233019602289e-01, 8.31469612302545125e-01, + 4.71396736825997809e-01, 8.81921264348354939e-01, + 3.82683432365089837e-01, 9.23879532511286738e-01, + 2.90284677254462276e-01, 9.56940335732208935e-01, + 1.95090322016128304e-01, 9.80785280403230431e-01, + 9.80171403295607702e-02, 9.95184726672196818e-01, + 0.0, 1.00000000000000000e+00 + }; +#else + const __m128d sintable[17] = + { + _mm_set_pd( 0.0, 1.0 ), + _mm_set_pd( sin( 1.0 * (M_PI/2.0) / 16.0), cos( 1.0 * (M_PI/2.0) / 16.0) ), + _mm_set_pd( sin( 2.0 * (M_PI/2.0) / 16.0), cos( 2.0 * (M_PI/2.0) / 16.0) ), + _mm_set_pd( sin( 3.0 * (M_PI/2.0) / 16.0), cos( 3.0 * 
(M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin(  4.0 * (M_PI/2.0) / 16.0), cos(  4.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin(  5.0 * (M_PI/2.0) / 16.0), cos(  5.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin(  6.0 * (M_PI/2.0) / 16.0), cos(  6.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin(  7.0 * (M_PI/2.0) / 16.0), cos(  7.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin(  8.0 * (M_PI/2.0) / 16.0), cos(  8.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin(  9.0 * (M_PI/2.0) / 16.0), cos(  9.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
+        _mm_set_pd( 1.0, 0.0 )
+    };
+#endif
+
+    const __m128d signmask       = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m128i signbit_epi32  = _mm_set1_epi32(0x80000000);
+
+    const __m128d tabscale      = _mm_set1_pd(32.0/M_PI);
+    const __m128d invtabscale0  = _mm_set1_pd(9.81747508049011230469e-02);
+    const __m128d invtabscale1  = _mm_set1_pd(1.96197799156550576057e-08);
+    const __m128i ione          = _mm_set1_epi32(1);
+    const __m128i i32           = _mm_set1_epi32(32);
+    const __m128i i16           = _mm_set1_epi32(16);
+    const __m128i tabmask       = _mm_set1_epi32(0x3F);
+    const __m128d sinP7         = _mm_set1_pd(-1.0/5040.0);
+    const __m128d sinP5         = _mm_set1_pd(1.0/120.0);
+    const __m128d sinP3         = _mm_set1_pd(-1.0/6.0);
+    const __m128d sinP1         = _mm_set1_pd(1.0);
+
+    const __m128d cosP6         = _mm_set1_pd(-1.0/720.0);
+    const __m128d cosP4         = _mm_set1_pd(1.0/24.0);
+    const __m128d cosP2         = _mm_set1_pd(-1.0/2.0);
+    const __m128d cosP0         = _mm_set1_pd(1.0);
+
+    __m128d       scalex;
+    __m128i       tabidx, corridx;
+    __m128d       xabs, z, z2, polySin, polyCos;
+    __m128d       xpoint;
+    __m128d       ypoint0, ypoint1;
+
+    __m128d       sinpoint, cospoint;
+    __m128d       xsign, ssign, csign;
+    __m128i       imask, sswapsign, cswapsign;
-    __m128d       minusone;
+
+    xsign    = _mm_andnot_pd(signmask, x);
+    xabs     = _mm_and_pd(x, signmask);
+
+    scalex   = _mm_mul_pd(tabscale, xabs);
+    tabidx   = _mm_cvtpd_epi32(scalex);
+
+    xpoint   = _mm_round_pd(scalex, _MM_FROUND_TO_NEAREST_INT);
+
+    /* Extended precision arithmetics */
+    z        = _mm_sub_pd(xabs, _mm_mul_pd(invtabscale0, xpoint));
+    z        = _mm_sub_pd(z, _mm_mul_pd(invtabscale1, xpoint));
+
+    /* Range reduction to 0..2*Pi */
+    tabidx   = _mm_and_si128(tabidx, tabmask);
+
+    /* tabidx is now in range [0..63] */
+    imask     = _mm_cmpgt_epi32(tabidx, i32);
+    sswapsign = imask;
+    cswapsign = imask;
+    corridx   = _mm_and_si128(imask, i32);
+    tabidx    = _mm_sub_epi32(tabidx, corridx);
+
+    /* tabidx is now in range [0..32] */
+    imask     = _mm_cmpgt_epi32(tabidx, i16);
+    cswapsign = _mm_xor_si128(cswapsign, imask);
+    corridx   = _mm_sub_epi32(i32, tabidx);
+    tabidx    = _mm_blendv_epi8(tabidx, corridx, imask);
+    /* tabidx is now in range [0..16] */
+    ssign     = _mm_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
+    csign     = _mm_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
+
+#ifdef _MSC_VER
+    ypoint0  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 0));
+    ypoint1  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 1));
+#else
+    ypoint0  = sintable[_mm_extract_epi32(tabidx, 0)];
+    ypoint1  = sintable[_mm_extract_epi32(tabidx, 1)];
+#endif
+    sinpoint = _mm_unpackhi_pd(ypoint0, ypoint1);
+    cospoint
= _mm_unpacklo_pd(ypoint0, ypoint1); + + sinpoint = _mm_mul_pd(sinpoint, ssign); + cospoint = _mm_mul_pd(cospoint, csign); + + z2 = _mm_mul_pd(z, z); + + polySin = _mm_mul_pd(sinP7, z2); + polySin = _mm_add_pd(polySin, sinP5); + polySin = _mm_mul_pd(polySin, z2); + polySin = _mm_add_pd(polySin, sinP3); + polySin = _mm_mul_pd(polySin, z2); + polySin = _mm_add_pd(polySin, sinP1); + polySin = _mm_mul_pd(polySin, z); + + polyCos = _mm_mul_pd(cosP6, z2); + polyCos = _mm_add_pd(polyCos, cosP4); + polyCos = _mm_mul_pd(polyCos, z2); + polyCos = _mm_add_pd(polyCos, cosP2); + polyCos = _mm_mul_pd(polyCos, z2); + polyCos = _mm_add_pd(polyCos, cosP0); + + *sinval = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint, polyCos), _mm_mul_pd(cospoint, polySin) ), xsign); + *cosval = _mm_sub_pd( _mm_mul_pd(cospoint, polyCos), _mm_mul_pd(sinpoint, polySin) ); + + return 0; +} + +/* + * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them + * will then call the sincos() routine and waste a factor 2 in performance! + */ +static __m128d +gmx_mm_sin_pd(__m128d x) +{ + __m128d s, c; + gmx_mm_sincos_pd(x, &s, &c); + return s; +} + +/* + * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them + * will then call the sincos() routine and waste a factor 2 in performance! + */ +static __m128d +gmx_mm_cos_pd(__m128d x) +{ + __m128d s, c; + gmx_mm_sincos_pd(x, &s, &c); + return c; +} + + + +static __m128d +gmx_mm_tan_pd(__m128d x) +{ + __m128d sinval, cosval; + __m128d tanval; + + gmx_mm_sincos_pd(x, &sinval, &cosval); + + tanval = _mm_mul_pd(sinval, gmx_mm_inv_pd(cosval)); + + return tanval; +} + + + +static __m128d +gmx_mm_asin_pd(__m128d x) +{ + /* Same algorithm as cephes library */ + const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) ); + const __m128d limit1 = _mm_set1_pd(0.625); + const __m128d limit2 = _mm_set1_pd(1e-8); + const __m128d one = _mm_set1_pd(1.0); - const __m128d halfpi = _mm_set1_pd(M_PI/2.0); + const __m128d quarterpi = _mm_set1_pd(M_PI/4.0); + const __m128d morebits = _mm_set1_pd(6.123233995736765886130e-17); + + const __m128d P5 = _mm_set1_pd(4.253011369004428248960e-3); + const __m128d P4 = _mm_set1_pd(-6.019598008014123785661e-1); + const __m128d P3 = _mm_set1_pd(5.444622390564711410273e0); + const __m128d P2 = _mm_set1_pd(-1.626247967210700244449e1); + const __m128d P1 = _mm_set1_pd(1.956261983317594739197e1); + const __m128d P0 = _mm_set1_pd(-8.198089802484824371615e0); + + const __m128d Q4 = _mm_set1_pd(-1.474091372988853791896e1); + const __m128d Q3 = _mm_set1_pd(7.049610280856842141659e1); + const __m128d Q2 = _mm_set1_pd(-1.471791292232726029859e2); + const __m128d Q1 = _mm_set1_pd(1.395105614657485689735e2); + const __m128d Q0 = _mm_set1_pd(-4.918853881490881290097e1); + + const __m128d R4 = _mm_set1_pd(2.967721961301243206100e-3); + const __m128d R3 = _mm_set1_pd(-5.634242780008963776856e-1); + const __m128d R2 = _mm_set1_pd(6.968710824104713396794e0); + const __m128d R1 = _mm_set1_pd(-2.556901049652824852289e1); + const __m128d R0 = _mm_set1_pd(2.853665548261061424989e1); + + const __m128d S3 = _mm_set1_pd(-2.194779531642920639778e1); + const __m128d S2 = _mm_set1_pd(1.470656354026814941758e2); + const __m128d S1 = _mm_set1_pd(-3.838770957603691357202e2); + const __m128d S0 = _mm_set1_pd(3.424398657913078477438e2); + + __m128d sign; + __m128d mask; + __m128d xabs; - __m128d zz, ww, z, q, w, y, zz2, ww2; ++ __m128d zz, ww, z, q, w, zz2, ww2; + __m128d PA, PB; + __m128d QA, QB; + 
__m128d RA, RB; + __m128d SA, SB; + __m128d nom, denom; + + sign = _mm_andnot_pd(signmask, x); + xabs = _mm_and_pd(x, signmask); + + mask = _mm_cmpgt_pd(xabs, limit1); + + zz = _mm_sub_pd(one, xabs); + ww = _mm_mul_pd(xabs, xabs); + zz2 = _mm_mul_pd(zz, zz); + ww2 = _mm_mul_pd(ww, ww); + + /* R */ + RA = _mm_mul_pd(R4, zz2); + RB = _mm_mul_pd(R3, zz2); + RA = _mm_add_pd(RA, R2); + RB = _mm_add_pd(RB, R1); + RA = _mm_mul_pd(RA, zz2); + RB = _mm_mul_pd(RB, zz); + RA = _mm_add_pd(RA, R0); + RA = _mm_add_pd(RA, RB); + + /* S, SA = zz2 */ + SB = _mm_mul_pd(S3, zz2); + SA = _mm_add_pd(zz2, S2); + SB = _mm_add_pd(SB, S1); + SA = _mm_mul_pd(SA, zz2); + SB = _mm_mul_pd(SB, zz); + SA = _mm_add_pd(SA, S0); + SA = _mm_add_pd(SA, SB); + + /* P */ + PA = _mm_mul_pd(P5, ww2); + PB = _mm_mul_pd(P4, ww2); + PA = _mm_add_pd(PA, P3); + PB = _mm_add_pd(PB, P2); + PA = _mm_mul_pd(PA, ww2); + PB = _mm_mul_pd(PB, ww2); + PA = _mm_add_pd(PA, P1); + PB = _mm_add_pd(PB, P0); + PA = _mm_mul_pd(PA, ww); + PA = _mm_add_pd(PA, PB); + + /* Q, QA = ww2 */ + QB = _mm_mul_pd(Q4, ww2); + QA = _mm_add_pd(ww2, Q3); + QB = _mm_add_pd(QB, Q2); + QA = _mm_mul_pd(QA, ww2); + QB = _mm_mul_pd(QB, ww2); + QA = _mm_add_pd(QA, Q1); + QB = _mm_add_pd(QB, Q0); + QA = _mm_mul_pd(QA, ww); + QA = _mm_add_pd(QA, QB); + + RA = _mm_mul_pd(RA, zz); + PA = _mm_mul_pd(PA, ww); + + nom = _mm_blendv_pd( PA, RA, mask ); + denom = _mm_blendv_pd( QA, SA, mask ); + + q = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) ); + + zz = _mm_add_pd(zz, zz); + zz = gmx_mm_sqrt_pd(zz); + z = _mm_sub_pd(quarterpi, zz); + zz = _mm_mul_pd(zz, q); + zz = _mm_sub_pd(zz, morebits); + z = _mm_sub_pd(z, zz); + z = _mm_add_pd(z, quarterpi); + + w = _mm_mul_pd(xabs, q); + w = _mm_add_pd(w, xabs); + + z = _mm_blendv_pd( w, z, mask ); + + mask = _mm_cmpgt_pd(xabs, limit2); + z = _mm_blendv_pd( xabs, z, mask ); + + z = _mm_xor_pd(z, sign); + + return z; +} + + +static __m128d +gmx_mm_acos_pd(__m128d x) +{ - const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) ); + const __m128d one = _mm_set1_pd(1.0); + const __m128d half = _mm_set1_pd(0.5); - const __m128d pi = _mm_set1_pd(M_PI); + const __m128d quarterpi0 = _mm_set1_pd(7.85398163397448309616e-1); + const __m128d quarterpi1 = _mm_set1_pd(6.123233995736765886130e-17); + + + __m128d mask1; + + __m128d z, z1, z2; + + mask1 = _mm_cmpgt_pd(x, half); + z1 = _mm_mul_pd(half, _mm_sub_pd(one, x)); + z1 = gmx_mm_sqrt_pd(z1); + z = _mm_blendv_pd( x, z1, mask1 ); + + z = gmx_mm_asin_pd(z); + + z1 = _mm_add_pd(z, z); + + z2 = _mm_sub_pd(quarterpi0, z); + z2 = _mm_add_pd(z2, quarterpi1); + z2 = _mm_add_pd(z2, quarterpi0); + + z = _mm_blendv_pd(z2, z1, mask1); + + return z; +} + +static __m128d +gmx_mm_atan_pd(__m128d x) +{ + /* Same algorithm as cephes library */ + const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) ); + const __m128d limit1 = _mm_set1_pd(0.66); + const __m128d limit2 = _mm_set1_pd(2.41421356237309504880); + const __m128d quarterpi = _mm_set1_pd(M_PI/4.0); + const __m128d halfpi = _mm_set1_pd(M_PI/2.0); + const __m128d mone = _mm_set1_pd(-1.0); + const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17); + const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17); + + const __m128d P4 = _mm_set1_pd(-8.750608600031904122785E-1); + const __m128d P3 = _mm_set1_pd(-1.615753718733365076637E1); + const __m128d P2 = _mm_set1_pd(-7.500855792314704667340E1); + const __m128d P1 = 
_mm_set1_pd(-1.228866684490136173410E2); + const __m128d P0 = _mm_set1_pd(-6.485021904942025371773E1); + + const __m128d Q4 = _mm_set1_pd(2.485846490142306297962E1); + const __m128d Q3 = _mm_set1_pd(1.650270098316988542046E2); + const __m128d Q2 = _mm_set1_pd(4.328810604912902668951E2); + const __m128d Q1 = _mm_set1_pd(4.853903996359136964868E2); + const __m128d Q0 = _mm_set1_pd(1.945506571482613964425E2); + + __m128d sign; + __m128d mask1, mask2; + __m128d y, t1, t2; + __m128d z, z2; + __m128d P_A, P_B, Q_A, Q_B; + + sign = _mm_andnot_pd(signmask, x); + x = _mm_and_pd(x, signmask); + + mask1 = _mm_cmpgt_pd(x, limit1); + mask2 = _mm_cmpgt_pd(x, limit2); + + t1 = _mm_mul_pd(_mm_add_pd(x, mone), gmx_mm_inv_pd(_mm_sub_pd(x, mone))); + t2 = _mm_mul_pd(mone, gmx_mm_inv_pd(x)); + + y = _mm_and_pd(mask1, quarterpi); + y = _mm_or_pd( _mm_and_pd(mask2, halfpi), _mm_andnot_pd(mask2, y) ); + + x = _mm_or_pd( _mm_and_pd(mask1, t1), _mm_andnot_pd(mask1, x) ); + x = _mm_or_pd( _mm_and_pd(mask2, t2), _mm_andnot_pd(mask2, x) ); + + z = _mm_mul_pd(x, x); + z2 = _mm_mul_pd(z, z); + + P_A = _mm_mul_pd(P4, z2); + P_B = _mm_mul_pd(P3, z2); + P_A = _mm_add_pd(P_A, P2); + P_B = _mm_add_pd(P_B, P1); + P_A = _mm_mul_pd(P_A, z2); + P_B = _mm_mul_pd(P_B, z); + P_A = _mm_add_pd(P_A, P0); + P_A = _mm_add_pd(P_A, P_B); + + /* Q_A = z2 */ + Q_B = _mm_mul_pd(Q4, z2); + Q_A = _mm_add_pd(z2, Q3); + Q_B = _mm_add_pd(Q_B, Q2); + Q_A = _mm_mul_pd(Q_A, z2); + Q_B = _mm_mul_pd(Q_B, z2); + Q_A = _mm_add_pd(Q_A, Q1); + Q_B = _mm_add_pd(Q_B, Q0); + Q_A = _mm_mul_pd(Q_A, z); + Q_A = _mm_add_pd(Q_A, Q_B); + + z = _mm_mul_pd(z, P_A); + z = _mm_mul_pd(z, gmx_mm_inv_pd(Q_A)); + z = _mm_mul_pd(z, x); + z = _mm_add_pd(z, x); + + t1 = _mm_and_pd(mask1, morebits1); + t1 = _mm_or_pd( _mm_and_pd(mask2, morebits2), _mm_andnot_pd(mask2, t1) ); + + z = _mm_add_pd(z, t1); + y = _mm_add_pd(y, z); + + y = _mm_xor_pd(y, sign); + + return y; +} + + +static __m128d +gmx_mm_atan2_pd(__m128d y, __m128d x) +{ + const __m128d pi = _mm_set1_pd(M_PI); + const __m128d minuspi = _mm_set1_pd(-M_PI); + const __m128d halfpi = _mm_set1_pd(M_PI/2.0); + const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0); + + __m128d z, z1, z3, z4; + __m128d w; + __m128d maskx_lt, maskx_eq; + __m128d masky_lt, masky_eq; + __m128d mask1, mask2, mask3, mask4, maskall; + + maskx_lt = _mm_cmplt_pd(x, _mm_setzero_pd()); + masky_lt = _mm_cmplt_pd(y, _mm_setzero_pd()); + maskx_eq = _mm_cmpeq_pd(x, _mm_setzero_pd()); + masky_eq = _mm_cmpeq_pd(y, _mm_setzero_pd()); + + z = _mm_mul_pd(y, gmx_mm_inv_pd(x)); + z = gmx_mm_atan_pd(z); + + mask1 = _mm_and_pd(maskx_eq, masky_lt); + mask2 = _mm_andnot_pd(maskx_lt, masky_eq); + mask3 = _mm_andnot_pd( _mm_or_pd(masky_lt, masky_eq), maskx_eq); + mask4 = _mm_and_pd(masky_eq, maskx_lt); + + maskall = _mm_or_pd( _mm_or_pd(mask1, mask2), _mm_or_pd(mask3, mask4) ); + + z = _mm_andnot_pd(maskall, z); + z1 = _mm_and_pd(mask1, minushalfpi); + z3 = _mm_and_pd(mask3, halfpi); + z4 = _mm_and_pd(mask4, pi); + + z = _mm_or_pd( _mm_or_pd(z, z1), _mm_or_pd(z3, z4) ); + + w = _mm_blendv_pd(pi, minuspi, masky_lt); + w = _mm_and_pd(w, maskx_lt); + + w = _mm_andnot_pd(maskall, w); + + z = _mm_add_pd(z, w); + + return z; +} + +#endif /*_gmx_math_x86_sse4_1_double_h_ */ diff --cc src/gromacs/legacyheaders/gmx_math_x86_sse4_1_single.h index 1d0018d2b5,0000000000..ae0659047f mode 100644,000000..100644 --- a/src/gromacs/legacyheaders/gmx_math_x86_sse4_1_single.h +++ b/src/gromacs/legacyheaders/gmx_math_x86_sse4_1_single.h @@@ -1,1165 -1,0 +1,1165 @@@ +/* -*- mode: c; 
tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This file is part of GROMACS.
+ * Copyright (c) 2012-
+ *
+ * Written by the Gromacs development team under coordination of
+ * David van der Spoel, Berk Hess, and Erik Lindahl.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org
+ *
+ * And Hey:
+ * Gnomes, ROck Monsters And Chili Sauce
+ */
+#ifndef _gmx_math_x86_sse4_1_single_h_
+#define _gmx_math_x86_sse4_1_single_h_
+
+#include <math.h>
+#include <stdio.h>
+
+#include "gmx_x86_sse4_1.h"
+
+
+
+#ifndef M_PI
+#    define M_PI 3.14159265358979323846264338327950288
+#endif
+
+
+
+
+/************************
+ *                      *
+ * Simple math routines *
+ *                      *
+ ************************/
+
+/* 1.0/sqrt(x) */
+static gmx_inline __m128
+gmx_mm_invsqrt_ps(__m128 x)
+{
+    const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
+    const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
+
+    __m128       lu    = _mm_rsqrt_ps(x);
+
+    /* One Newton-Raphson iteration, lu*(3 - x*lu*lu)/2, refines the
+     * ~12-bit _mm_rsqrt_ps() estimate to nearly full single precision. */
+    return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
+}
+
+/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
+static gmx_inline __m128
+gmx_mm_sqrt_ps(__m128 x)
+{
+    __m128 mask;
+    __m128 res;
+
+    mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
+    res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
+
+    res  = _mm_mul_ps(x, res);
+
+    return res;
+}
+
+/* 1.0/x */
+static gmx_inline __m128
+gmx_mm_inv_ps(__m128 x)
+{
+    const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
+
+    __m128       lu  = _mm_rcp_ps(x);
+
+    /* One Newton-Raphson iteration, lu*(2 - x*lu), refines the
+     * ~12-bit _mm_rcp_ps() estimate. */
+    return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
+}
+
+static gmx_inline __m128
+gmx_mm_abs_ps(__m128 x)
+{
+    const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
+
+    return _mm_and_ps(x, signmask);
+}
+
+
+
+static __m128
+gmx_mm_log_ps(__m128 x)
+{
+    /* Same algorithm as cephes library */
+    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
+    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
+    const __m128  half       = _mm_set1_ps(0.5f);
+    const __m128  one        = _mm_set1_ps(1.0f);
+    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
+    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
+    const __m128  corr2      = _mm_set1_ps(0.693359375f);
+
+    const __m128  CA_1       = _mm_set1_ps(0.070376836292f);
+    const __m128  CB_0       = _mm_set1_ps(1.6714950086782716f);
+    const __m128  CB_1       = _mm_set1_ps(-2.452088066061482f);
+    const __m128  CC_0       = _mm_set1_ps(1.5220770854701728f);
+    const __m128  CC_1       = _mm_set1_ps(-1.3422238433233642f);
+    const __m128  CD_0       = _mm_set1_ps(1.386218787509749f);
+    const __m128  CD_1       = _mm_set1_ps(0.35075468953796346f);
+    const __m128  CE_0       = _mm_set1_ps(1.3429983063133937f);
+    const __m128  CE_1       = _mm_set1_ps(1.807420826584643f);
+
-    __m128        fexp, fexp1;
++    __m128        fexp;
+    __m128i       iexp;
+    __m128        mask;
-    __m128        x1, x2;
++    __m128        x2;
+    __m128        y;
+    __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;
+
+    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!)
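+     * (for example, 6.0 = 0.75*2^3 is separated into fexp = 3 and mantissa 0.75)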
*/
+    fexp  = _mm_and_ps(x, expmask);
+    iexp  = gmx_mm_castps_si128(fexp);
+    iexp  = _mm_srli_epi32(iexp, 23);
+    iexp  = _mm_sub_epi32(iexp, expbase_m1);
+
+    x     = _mm_andnot_ps(expmask, x);
+    x     = _mm_or_ps(x, one);
+    x     = _mm_mul_ps(x, half);
+
+    mask  = _mm_cmplt_ps(x, invsq2);
+
+    x     = _mm_add_ps(x, _mm_and_ps(mask, x));
+    x     = _mm_sub_ps(x, one);
+    iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
+
+    x2    = _mm_mul_ps(x, x);
+
+    pA    = _mm_mul_ps(CA_1, x);
+    pB    = _mm_mul_ps(CB_1, x);
+    pC    = _mm_mul_ps(CC_1, x);
+    pD    = _mm_mul_ps(CD_1, x);
+    pE    = _mm_mul_ps(CE_1, x);
+    tB    = _mm_add_ps(CB_0, x2);
+    tC    = _mm_add_ps(CC_0, x2);
+    tD    = _mm_add_ps(CD_0, x2);
+    tE    = _mm_add_ps(CE_0, x2);
+    pB    = _mm_add_ps(pB, tB);
+    pC    = _mm_add_ps(pC, tC);
+    pD    = _mm_add_ps(pD, tD);
+    pE    = _mm_add_ps(pE, tE);
+
+    pA    = _mm_mul_ps(pA, pB);
+    pC    = _mm_mul_ps(pC, pD);
+    pE    = _mm_mul_ps(pE, x2);
+    pA    = _mm_mul_ps(pA, pC);
+    y     = _mm_mul_ps(pA, pE);
+
+    fexp  = _mm_cvtepi32_ps(iexp);
+    y     = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
+
+    y     = _mm_sub_ps(y, _mm_mul_ps(half, x2));
+    x2    = _mm_add_ps(x, y);
+
+    x2    = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
+
+    return x2;
+}
+
+
+/*
+ * 2^x function.
+ *
+ * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
+ * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
+ *
+ * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
+ *
+ * The largest-magnitude exponent we can represent in IEEE single-precision binary format
+ * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
+ * result to zero if the argument falls outside this range. For small numbers this is just fine, but
+ * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
+ * number instead. That would take a few extra cycles and not really help, since something is
+ * wrong if you are using single precision to work with numbers that cannot really be represented
+ * in single precision.
+ *
+ * The accuracy is at least 23 bits.
+ */
+static __m128
+gmx_mm_exp2_ps(__m128 x)
+{
+    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126.
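+     * After rounding to the nearest integer, e.g. x = 8.3 is evaluated as
+     * 2^8 * 2^0.3: the 2^8 factor is assembled directly in the exponent bits
+     * and 2^0.3 comes from the polynomial below. Arguments with |x| > 126
+     * instead return zero through valuemask.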
*/ + const __m128 arglimit = _mm_set1_ps(126.0f); + + const __m128i expbase = _mm_set1_epi32(127); + const __m128 CA6 = _mm_set1_ps(1.535336188319500E-004); + const __m128 CA5 = _mm_set1_ps(1.339887440266574E-003); + const __m128 CA4 = _mm_set1_ps(9.618437357674640E-003); + const __m128 CA3 = _mm_set1_ps(5.550332471162809E-002); + const __m128 CA2 = _mm_set1_ps(2.402264791363012E-001); + const __m128 CA1 = _mm_set1_ps(6.931472028550421E-001); + const __m128 CA0 = _mm_set1_ps(1.0f); + + __m128 valuemask; + __m128i iexppart; + __m128 fexppart; + __m128 intpart; + __m128 x2; + __m128 p0, p1; + + iexppart = _mm_cvtps_epi32(x); + intpart = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT); + iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23); + valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x)); + fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart)); + + x = _mm_sub_ps(x, intpart); + x2 = _mm_mul_ps(x, x); + + p0 = _mm_mul_ps(CA6, x2); + p1 = _mm_mul_ps(CA5, x2); + p0 = _mm_add_ps(p0, CA4); + p1 = _mm_add_ps(p1, CA3); + p0 = _mm_mul_ps(p0, x2); + p1 = _mm_mul_ps(p1, x2); + p0 = _mm_add_ps(p0, CA2); + p1 = _mm_add_ps(p1, CA1); + p0 = _mm_mul_ps(p0, x2); + p1 = _mm_mul_ps(p1, x); + p0 = _mm_add_ps(p0, CA0); + p0 = _mm_add_ps(p0, p1); + x = _mm_mul_ps(p0, fexppart); + + return x; +} + + +/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x, + * but there will then be a small rounding error since we lose some precision due to the + * multiplication. This will then be magnified a lot by the exponential. + * + * Instead, we calculate the fractional part directly as a minimax approximation of + * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction + * remaining after 2^y, which avoids the precision-loss. + * The final result is correct to within 1 LSB over the entire argument range. + */ +static __m128 +gmx_mm_exp_ps(__m128 x) +{ + const __m128 argscale = _mm_set1_ps(1.44269504088896341f); + /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. 
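+     * With arglimit = 126 this flushes exp() arguments of magnitude beyond
+     * roughly 126*ln(2) ~= 87 to zero via valuemask below.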
*/ + const __m128 arglimit = _mm_set1_ps(126.0f); + const __m128i expbase = _mm_set1_epi32(127); + + const __m128 invargscale0 = _mm_set1_ps(0.693359375f); + const __m128 invargscale1 = _mm_set1_ps(-2.12194440e-4f); + + const __m128 CC5 = _mm_set1_ps(1.9875691500e-4f); + const __m128 CC4 = _mm_set1_ps(1.3981999507e-3f); + const __m128 CC3 = _mm_set1_ps(8.3334519073e-3f); + const __m128 CC2 = _mm_set1_ps(4.1665795894e-2f); + const __m128 CC1 = _mm_set1_ps(1.6666665459e-1f); + const __m128 CC0 = _mm_set1_ps(5.0000001201e-1f); + const __m128 one = _mm_set1_ps(1.0f); + + __m128 y, x2; + __m128 p0, p1; + __m128 valuemask; + __m128i iexppart; + __m128 fexppart; + __m128 intpart; + + y = _mm_mul_ps(x, argscale); + + iexppart = _mm_cvtps_epi32(y); + intpart = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT); + + iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23); + valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y)); + fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart)); + + /* Extended precision arithmetics */ + x = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart)); + x = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart)); + + x2 = _mm_mul_ps(x, x); + + p1 = _mm_mul_ps(CC5, x2); + p0 = _mm_mul_ps(CC4, x2); + p1 = _mm_add_ps(p1, CC3); + p0 = _mm_add_ps(p0, CC2); + p1 = _mm_mul_ps(p1, x2); + p0 = _mm_mul_ps(p0, x2); + p1 = _mm_add_ps(p1, CC1); + p0 = _mm_add_ps(p0, CC0); + p1 = _mm_mul_ps(p1, x); + p0 = _mm_add_ps(p0, p1); + p0 = _mm_mul_ps(p0, x2); + x = _mm_add_ps(x, one); + x = _mm_add_ps(x, p0); + + x = _mm_mul_ps(x, fexppart); + + return x; +} + +/* FULL precision. Only errors in LSB */ +static __m128 +gmx_mm_erf_ps(__m128 x) +{ + /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */ + const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f); + const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f); + const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f); + const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f); + const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f); + const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f); + const __m128 CA0 = _mm_set1_ps(1.128379165726710f); + /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */ + const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f); + const __m128 CB8 = _mm_set1_ps(0.003909821287598495f); + const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f); + const __m128 CB6 = _mm_set1_ps(0.005685614362160572f); + const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f); + const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f); + const __m128 CB3 = _mm_set1_ps(0.04369575504816542f); + const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f); + const __m128 CB1 = _mm_set1_ps(0.2732120154030589f); + const __m128 CB0 = _mm_set1_ps(0.42758357702025784f); + /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */ + const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f); + const __m128 CC9 = _mm_set1_ps(0.21376355144663348f); + const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f); + const __m128 CC7 = _mm_set1_ps(0.016690861551248114f); + const __m128 CC6 = _mm_set1_ps(0.7560973182491192f); + const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f); + const __m128 CC4 = _mm_set1_ps(0.8411872321232948f); + const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f); + const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f); + const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f); + const __m128 CC0 = _mm_set1_ps(0.5642114853803148f); + + /* 
Coefficients for expansion of exp(x) in [0,0.1] */ + /* CD0 and CD1 are both 1.0, so no need to declare them separately */ + const __m128 CD2 = _mm_set1_ps(0.5000066608081202f); + const __m128 CD3 = _mm_set1_ps(0.1664795422874624f); + const __m128 CD4 = _mm_set1_ps(0.04379839977652482f); + + const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) ); + const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 two = _mm_set1_ps(2.0f); + + __m128 x2, x4, y; + __m128 z, q, t, t2, w, w2; + __m128 pA0, pA1, pB0, pB1, pC0, pC1; + __m128 expmx2, corr; + __m128 res_erf, res_erfc, res; + __m128 mask; + + /* Calculate erf() */ + x2 = _mm_mul_ps(x, x); + x4 = _mm_mul_ps(x2, x2); + + pA0 = _mm_mul_ps(CA6, x4); + pA1 = _mm_mul_ps(CA5, x4); + pA0 = _mm_add_ps(pA0, CA4); + pA1 = _mm_add_ps(pA1, CA3); + pA0 = _mm_mul_ps(pA0, x4); + pA1 = _mm_mul_ps(pA1, x4); + pA0 = _mm_add_ps(pA0, CA2); + pA1 = _mm_add_ps(pA1, CA1); + pA0 = _mm_mul_ps(pA0, x4); + pA1 = _mm_mul_ps(pA1, x2); + pA0 = _mm_add_ps(pA0, pA1); + pA0 = _mm_add_ps(pA0, CA0); + + res_erf = _mm_mul_ps(x, pA0); + + /* Calculate erfc */ + + y = gmx_mm_abs_ps(x); + t = gmx_mm_inv_ps(y); + w = _mm_sub_ps(t, one); + t2 = _mm_mul_ps(t, t); + w2 = _mm_mul_ps(w, w); + /* + * We cannot simply calculate exp(-x2) directly in single precision, since + * that will lose a couple of bits of precision due to the multiplication. + * Instead, we introduce x=z+w, where the last 12 bits of precision are in w. + * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)). + * + * The only drawback with this is that it requires TWO separate exponential + * evaluations, which would be horrible performance-wise. However, the argument + * for the second exp() call is always small, so there we simply use a + * low-order minimax expansion on [0,0.1]. 
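+     *
+     * In scalar terms, with z equal to y masked down to its leading mantissa
+     * bits and q = (z-y)*(z+y) = z^2-y^2, the identity used below is
+     *
+     *    exp(-y^2) = exp(-z^2)*exp(q),
+     *
+     * where z*z is exact in single precision and |q| is small enough that the
+     * short CD polynomial suffices for exp(q).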
+ */ + + z = _mm_and_ps(y, sieve); + q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) ); + + corr = _mm_mul_ps(CD4, q); + corr = _mm_add_ps(corr, CD3); + corr = _mm_mul_ps(corr, q); + corr = _mm_add_ps(corr, CD2); + corr = _mm_mul_ps(corr, q); + corr = _mm_add_ps(corr, one); + corr = _mm_mul_ps(corr, q); + corr = _mm_add_ps(corr, one); + + expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) ); + expmx2 = _mm_mul_ps(expmx2, corr); + + pB1 = _mm_mul_ps(CB9, w2); + pB0 = _mm_mul_ps(CB8, w2); + pB1 = _mm_add_ps(pB1, CB7); + pB0 = _mm_add_ps(pB0, CB6); + pB1 = _mm_mul_ps(pB1, w2); + pB0 = _mm_mul_ps(pB0, w2); + pB1 = _mm_add_ps(pB1, CB5); + pB0 = _mm_add_ps(pB0, CB4); + pB1 = _mm_mul_ps(pB1, w2); + pB0 = _mm_mul_ps(pB0, w2); + pB1 = _mm_add_ps(pB1, CB3); + pB0 = _mm_add_ps(pB0, CB2); + pB1 = _mm_mul_ps(pB1, w2); + pB0 = _mm_mul_ps(pB0, w2); + pB1 = _mm_add_ps(pB1, CB1); + pB1 = _mm_mul_ps(pB1, w); + pB0 = _mm_add_ps(pB0, pB1); + pB0 = _mm_add_ps(pB0, CB0); + + pC0 = _mm_mul_ps(CC10, t2); + pC1 = _mm_mul_ps(CC9, t2); + pC0 = _mm_add_ps(pC0, CC8); + pC1 = _mm_add_ps(pC1, CC7); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t2); + pC0 = _mm_add_ps(pC0, CC6); + pC1 = _mm_add_ps(pC1, CC5); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t2); + pC0 = _mm_add_ps(pC0, CC4); + pC1 = _mm_add_ps(pC1, CC3); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t2); + pC0 = _mm_add_ps(pC0, CC2); + pC1 = _mm_add_ps(pC1, CC1); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t); + pC0 = _mm_add_ps(pC0, pC1); + pC0 = _mm_add_ps(pC0, CC0); + pC0 = _mm_mul_ps(pC0, t); + + /* SELECT pB0 or pC0 for erfc() */ + mask = _mm_cmplt_ps(two, y); + res_erfc = _mm_blendv_ps(pB0, pC0, mask); + res_erfc = _mm_mul_ps(res_erfc, expmx2); + + /* erfc(x<0) = 2-erfc(|x|) */ + mask = _mm_cmplt_ps(x, _mm_setzero_ps()); + res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask); + + /* Select erf() or erfc() */ + mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f)); + res = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask); + + return res; +} + + +/* FULL precision. 
Only errors in LSB */ +static __m128 +gmx_mm_erfc_ps(__m128 x) +{ + /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */ + const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f); + const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f); + const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f); + const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f); + const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f); + const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f); + const __m128 CA0 = _mm_set1_ps(1.128379165726710f); + /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */ + const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f); + const __m128 CB8 = _mm_set1_ps(0.003909821287598495f); + const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f); + const __m128 CB6 = _mm_set1_ps(0.005685614362160572f); + const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f); + const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f); + const __m128 CB3 = _mm_set1_ps(0.04369575504816542f); + const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f); + const __m128 CB1 = _mm_set1_ps(0.2732120154030589f); + const __m128 CB0 = _mm_set1_ps(0.42758357702025784f); + /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */ + const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f); + const __m128 CC9 = _mm_set1_ps(0.21376355144663348f); + const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f); + const __m128 CC7 = _mm_set1_ps(0.016690861551248114f); + const __m128 CC6 = _mm_set1_ps(0.7560973182491192f); + const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f); + const __m128 CC4 = _mm_set1_ps(0.8411872321232948f); + const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f); + const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f); + const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f); + const __m128 CC0 = _mm_set1_ps(0.5642114853803148f); + + /* Coefficients for expansion of exp(x) in [0,0.1] */ + /* CD0 and CD1 are both 1.0, so no need to declare them separately */ + const __m128 CD2 = _mm_set1_ps(0.5000066608081202f); + const __m128 CD3 = _mm_set1_ps(0.1664795422874624f); + const __m128 CD4 = _mm_set1_ps(0.04379839977652482f); + + const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) ); + const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 two = _mm_set1_ps(2.0f); + + __m128 x2, x4, y; + __m128 z, q, t, t2, w, w2; + __m128 pA0, pA1, pB0, pB1, pC0, pC1; + __m128 expmx2, corr; + __m128 res_erf, res_erfc, res; + __m128 mask; + + /* Calculate erf() */ + x2 = _mm_mul_ps(x, x); + x4 = _mm_mul_ps(x2, x2); + + pA0 = _mm_mul_ps(CA6, x4); + pA1 = _mm_mul_ps(CA5, x4); + pA0 = _mm_add_ps(pA0, CA4); + pA1 = _mm_add_ps(pA1, CA3); + pA0 = _mm_mul_ps(pA0, x4); + pA1 = _mm_mul_ps(pA1, x4); + pA0 = _mm_add_ps(pA0, CA2); + pA1 = _mm_add_ps(pA1, CA1); + pA0 = _mm_mul_ps(pA0, x4); + pA1 = _mm_mul_ps(pA1, x2); + pA0 = _mm_add_ps(pA0, pA1); + pA0 = _mm_add_ps(pA0, CA0); + + res_erf = _mm_mul_ps(x, pA0); + + /* Calculate erfc */ + y = gmx_mm_abs_ps(x); + t = gmx_mm_inv_ps(y); + w = _mm_sub_ps(t, one); + t2 = _mm_mul_ps(t, t); + w2 = _mm_mul_ps(w, w); + /* + * We cannot simply calculate exp(-x2) directly in single precision, since + * that will lose a couple of bits of precision due to the multiplication. + * Instead, we introduce x=z+w, where the last 12 bits of precision are in w. + * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)). 
+ * + * The only drawback with this is that it requires TWO separate exponential + * evaluations, which would be horrible performance-wise. However, the argument + * for the second exp() call is always small, so there we simply use a + * low-order minimax expansion on [0,0.1]. + */ + + z = _mm_and_ps(y, sieve); + q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) ); + + corr = _mm_mul_ps(CD4, q); + corr = _mm_add_ps(corr, CD3); + corr = _mm_mul_ps(corr, q); + corr = _mm_add_ps(corr, CD2); + corr = _mm_mul_ps(corr, q); + corr = _mm_add_ps(corr, one); + corr = _mm_mul_ps(corr, q); + corr = _mm_add_ps(corr, one); + + expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) ); + expmx2 = _mm_mul_ps(expmx2, corr); + + pB1 = _mm_mul_ps(CB9, w2); + pB0 = _mm_mul_ps(CB8, w2); + pB1 = _mm_add_ps(pB1, CB7); + pB0 = _mm_add_ps(pB0, CB6); + pB1 = _mm_mul_ps(pB1, w2); + pB0 = _mm_mul_ps(pB0, w2); + pB1 = _mm_add_ps(pB1, CB5); + pB0 = _mm_add_ps(pB0, CB4); + pB1 = _mm_mul_ps(pB1, w2); + pB0 = _mm_mul_ps(pB0, w2); + pB1 = _mm_add_ps(pB1, CB3); + pB0 = _mm_add_ps(pB0, CB2); + pB1 = _mm_mul_ps(pB1, w2); + pB0 = _mm_mul_ps(pB0, w2); + pB1 = _mm_add_ps(pB1, CB1); + pB1 = _mm_mul_ps(pB1, w); + pB0 = _mm_add_ps(pB0, pB1); + pB0 = _mm_add_ps(pB0, CB0); + + pC0 = _mm_mul_ps(CC10, t2); + pC1 = _mm_mul_ps(CC9, t2); + pC0 = _mm_add_ps(pC0, CC8); + pC1 = _mm_add_ps(pC1, CC7); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t2); + pC0 = _mm_add_ps(pC0, CC6); + pC1 = _mm_add_ps(pC1, CC5); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t2); + pC0 = _mm_add_ps(pC0, CC4); + pC1 = _mm_add_ps(pC1, CC3); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t2); + pC0 = _mm_add_ps(pC0, CC2); + pC1 = _mm_add_ps(pC1, CC1); + pC0 = _mm_mul_ps(pC0, t2); + pC1 = _mm_mul_ps(pC1, t); + pC0 = _mm_add_ps(pC0, pC1); + pC0 = _mm_add_ps(pC0, CC0); + pC0 = _mm_mul_ps(pC0, t); + + /* SELECT pB0 or pC0 for erfc() */ + mask = _mm_cmplt_ps(two, y); + res_erfc = _mm_blendv_ps(pB0, pC0, mask); + res_erfc = _mm_mul_ps(res_erfc, expmx2); + + /* erfc(x<0) = 2-erfc(|x|) */ + mask = _mm_cmplt_ps(x, _mm_setzero_ps()); + res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask); + + /* Select erf() or erfc() */ + mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f)); + res = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask); + + return res; +} + + +/* Calculate the force correction due to PME analytically. + * + * This routine is meant to enable analytical evaluation of the + * direct-space PME electrostatic force to avoid tables. + * + * The direct-space potential should be Erfc(beta*r)/r, but there + * are some problems evaluating that: + * + * First, the error function is difficult (read: expensive) to + * approximate accurately for intermediate to large arguments, and + * this happens already in ranges of beta*r that occur in simulations. + * Second, we now try to avoid calculating potentials in Gromacs but + * use forces directly. + * + * We can simplify things slightly by noting that the PME part is really + * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e. + * + * V= 1/r - Erf(beta*r)/r + * + * The first term we already have from the inverse square root, so + * that we can leave out of this routine. + * + * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm, + * the argument beta*r will be in the range 0.15 to ~4. Use your + * favorite plotting program to realize how well-behaved Erf(z)/z is + * in this range! + * + * We approximate f(z)=erf(z)/z with a rational minimax polynomial. 
+ * However, it turns out it is more efficient to approximate f(z)/z and + * then only use even powers. This is another minor optimization, since + * we actually WANT f(z)/z, because it is going to be multiplied by + * the vector between the two atoms to get the vectorial force. The + * fastest flops are the ones we can avoid calculating! + * + * So, here's how it should be used: + * + * 1. Calculate r^2. + * 2. Multiply by beta^2, so you get z^2=beta^2*r^2. + * 3. Evaluate this routine with z^2 as the argument. + * 4. The return value is the expression: + * + * + * 2*exp(-z^2) erf(z) + * ------------ - -------- + * sqrt(Pi)*z^2 z^3 + * + * 5. Multiply the entire expression by beta^3. This will get you + * + * beta^3*2*exp(-z^2) beta^3*erf(z) + * ------------------ - --------------- + * sqrt(Pi)*z^2 z^3 + * + * or, switching back to r (z=r*beta): + * + * 2*beta*exp(-r^2*beta^2) erf(r*beta) + * ----------------------- - ----------- + * sqrt(Pi)*r^2 r^3 + * + * + * With a bit of math exercise you should be able to confirm that + * this is exactly D[Erf[beta*r]/r,r] divided by r another time. + * + * 6. Add the result to 1/r^3, multiply by the product of the charges, + * and you have your force (divided by r). A final multiplication + * with the vector connecting the two particles and you have your + * vectorial force to add to the particles. + * + */ +static gmx_inline __m128 +gmx_mm_pmecorrF_ps(__m128 z2) +{ + const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f); + const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f); + const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f); + const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f); + const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f); + const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f); + const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f); + + const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f); + const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f); + const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f); + const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f); + const __m128 FD0 = _mm_set1_ps(1.0f); + + __m128 z4; + __m128 polyFN0, polyFN1, polyFD0, polyFD1; + + z4 = _mm_mul_ps(z2, z2); + + polyFD0 = _mm_mul_ps(FD4, z4); + polyFD1 = _mm_mul_ps(FD3, z4); + polyFD0 = _mm_add_ps(polyFD0, FD2); + polyFD1 = _mm_add_ps(polyFD1, FD1); + polyFD0 = _mm_mul_ps(polyFD0, z4); + polyFD1 = _mm_mul_ps(polyFD1, z2); + polyFD0 = _mm_add_ps(polyFD0, FD0); + polyFD0 = _mm_add_ps(polyFD0, polyFD1); + + polyFD0 = gmx_mm_inv_ps(polyFD0); + + polyFN0 = _mm_mul_ps(FN6, z4); + polyFN1 = _mm_mul_ps(FN5, z4); + polyFN0 = _mm_add_ps(polyFN0, FN4); + polyFN1 = _mm_add_ps(polyFN1, FN3); + polyFN0 = _mm_mul_ps(polyFN0, z4); + polyFN1 = _mm_mul_ps(polyFN1, z4); + polyFN0 = _mm_add_ps(polyFN0, FN2); + polyFN1 = _mm_add_ps(polyFN1, FN1); + polyFN0 = _mm_mul_ps(polyFN0, z4); + polyFN1 = _mm_mul_ps(polyFN1, z2); + polyFN0 = _mm_add_ps(polyFN0, FN0); + polyFN0 = _mm_add_ps(polyFN0, polyFN1); + + return _mm_mul_ps(polyFN0, polyFD0); +} + + +/* Calculate the potential correction due to PME analytically. + * + * See gmx_mm_pmecorrF_ps() for details about the approximation. + * + * This routine calculates Erf(z)/z, although you should provide z^2 + * as the input argument. + * + * Here's how it should be used: + * + * 1. Calculate r^2. + * 2. Multiply by beta^2, so you get z^2=beta^2*r^2. + * 3. Evaluate this routine with z^2 as the argument. + * 4. 
The return value is the expression: + * + * + * erf(z) + * -------- + * z + * + * 5. Multiply the entire expression by beta and switch back to r (z=r*beta): + * + * erf(r*beta) + * ----------- + * r + * + * 6. Subtract the result from 1/r, multiply by the product of the charges, + * and you have your potential. + */ +static gmx_inline __m128 +gmx_mm_pmecorrV_ps(__m128 z2) +{ + const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f); + const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f); + const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f); + const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f); + const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f); + const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f); + const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f); + + const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f); + const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f); + const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f); + const __m128 VD0 = _mm_set1_ps(1.0f); + + __m128 z4; + __m128 polyVN0, polyVN1, polyVD0, polyVD1; + + z4 = _mm_mul_ps(z2, z2); + + polyVD1 = _mm_mul_ps(VD3, z4); + polyVD0 = _mm_mul_ps(VD2, z4); + polyVD1 = _mm_add_ps(polyVD1, VD1); + polyVD0 = _mm_add_ps(polyVD0, VD0); + polyVD1 = _mm_mul_ps(polyVD1, z2); + polyVD0 = _mm_add_ps(polyVD0, polyVD1); + + polyVD0 = gmx_mm_inv_ps(polyVD0); + + polyVN0 = _mm_mul_ps(VN6, z4); + polyVN1 = _mm_mul_ps(VN5, z4); + polyVN0 = _mm_add_ps(polyVN0, VN4); + polyVN1 = _mm_add_ps(polyVN1, VN3); + polyVN0 = _mm_mul_ps(polyVN0, z4); + polyVN1 = _mm_mul_ps(polyVN1, z4); + polyVN0 = _mm_add_ps(polyVN0, VN2); + polyVN1 = _mm_add_ps(polyVN1, VN1); + polyVN0 = _mm_mul_ps(polyVN0, z4); + polyVN1 = _mm_mul_ps(polyVN1, z2); + polyVN0 = _mm_add_ps(polyVN0, VN0); + polyVN0 = _mm_add_ps(polyVN0, polyVN1); + + return _mm_mul_ps(polyVN0, polyVD0); +} + + +static int +gmx_mm_sincos_ps(__m128 x, + __m128 *sinval, + __m128 *cosval) +{ + const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI); + const __m128 half = _mm_set1_ps(0.5); + const __m128 one = _mm_set1_ps(1.0); + + const __m128i izero = _mm_set1_epi32(0); + const __m128i ione = _mm_set1_epi32(1); + const __m128i itwo = _mm_set1_epi32(2); + const __m128i ithree = _mm_set1_epi32(3); + const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); + + const __m128 CA1 = _mm_set1_ps(1.5703125f); + const __m128 CA2 = _mm_set1_ps(4.837512969970703125e-4f); + const __m128 CA3 = _mm_set1_ps(7.54978995489188216e-8f); + + const __m128 CC0 = _mm_set1_ps(-0.0013602249f); + const __m128 CC1 = _mm_set1_ps(0.0416566950f); + const __m128 CC2 = _mm_set1_ps(-0.4999990225f); + const __m128 CS0 = _mm_set1_ps(-0.0001950727f); + const __m128 CS1 = _mm_set1_ps(0.0083320758f); + const __m128 CS2 = _mm_set1_ps(-0.1666665247f); + + __m128 y, y2; + __m128 z; + __m128i iz; + __m128i offset_sin, offset_cos; + __m128 tmp1, tmp2; + __m128 mask_sin, mask_cos; + __m128 tmp_sin, tmp_cos; + + y = _mm_mul_ps(x, two_over_pi); + y = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half)); + + iz = _mm_cvttps_epi32(y); + z = _mm_round_ps(y, _MM_FROUND_TO_ZERO); + + offset_sin = _mm_and_si128(iz, ithree); + offset_cos = _mm_add_epi32(iz, ione); + + /* Extended precision arithmetic to achieve full precision */ + y = _mm_mul_ps(z, CA1); + tmp1 = _mm_mul_ps(z, CA2); + tmp2 = _mm_mul_ps(z, CA3); + y = _mm_sub_ps(x, y); + y = _mm_sub_ps(y, tmp1); + y = _mm_sub_ps(y, tmp2); + + y2 = _mm_mul_ps(y, y); + + tmp1 = _mm_mul_ps(CC0, y2); + tmp1 = _mm_add_ps(tmp1, CC1); + tmp2 = 
_mm_mul_ps(CS0, y2); + tmp2 = _mm_add_ps(tmp2, CS1); + tmp1 = _mm_mul_ps(tmp1, y2); + tmp1 = _mm_add_ps(tmp1, CC2); + tmp2 = _mm_mul_ps(tmp2, y2); + tmp2 = _mm_add_ps(tmp2, CS2); + + tmp1 = _mm_mul_ps(tmp1, y2); + tmp1 = _mm_add_ps(tmp1, one); + + tmp2 = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2)); + tmp2 = _mm_add_ps(tmp2, y); + + mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero)); + mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero)); + + tmp_sin = _mm_blendv_ps(tmp1, tmp2, mask_sin); + tmp_cos = _mm_blendv_ps(tmp1, tmp2, mask_cos); + + mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero)); + mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero)); + + tmp1 = _mm_xor_ps(signbit, tmp_sin); + tmp2 = _mm_xor_ps(signbit, tmp_cos); + + *sinval = _mm_blendv_ps(tmp1, tmp_sin, mask_sin); + *cosval = _mm_blendv_ps(tmp2, tmp_cos, mask_cos); + + return 0; +} + +/* + * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them + * will then call the sincos() routine and waste a factor 2 in performance! + */ +static __m128 +gmx_mm_sin_ps(__m128 x) +{ + __m128 s, c; + gmx_mm_sincos_ps(x, &s, &c); + return s; +} + +/* + * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them + * will then call the sincos() routine and waste a factor 2 in performance! + */ +static __m128 +gmx_mm_cos_ps(__m128 x) +{ + __m128 s, c; + gmx_mm_sincos_ps(x, &s, &c); + return c; +} + + +static __m128 +gmx_mm_tan_ps(__m128 x) +{ + __m128 sinval, cosval; + __m128 tanval; + + gmx_mm_sincos_ps(x, &sinval, &cosval); + + tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval)); + + return tanval; +} + + +static __m128 +gmx_mm_asin_ps(__m128 x) +{ + /* Same algorithm as cephes library */ + const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) ); + const __m128 limitlow = _mm_set1_ps(1e-4f); + const __m128 half = _mm_set1_ps(0.5f); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 halfpi = _mm_set1_ps(M_PI/2.0f); + + const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f); + const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f); + const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f); + const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f); + const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f); + + __m128 sign; + __m128 mask; + __m128 xabs; + __m128 z, z1, z2, q, q1, q2; + __m128 pA, pB; + + sign = _mm_andnot_ps(signmask, x); + xabs = _mm_and_ps(x, signmask); + + mask = _mm_cmpgt_ps(xabs, half); + + z1 = _mm_mul_ps(half, _mm_sub_ps(one, xabs)); + q1 = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1)); + q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1); + + q2 = xabs; + z2 = _mm_mul_ps(q2, q2); + + z = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) ); + q = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) ); + + z2 = _mm_mul_ps(z, z); + + pA = _mm_mul_ps(CC5, z2); + pB = _mm_mul_ps(CC4, z2); + + pA = _mm_add_ps(pA, CC3); + pB = _mm_add_ps(pB, CC2); + + pA = _mm_mul_ps(pA, z2); + pB = _mm_mul_ps(pB, z2); + + pA = _mm_add_ps(pA, CC1); + pA = _mm_mul_ps(pA, z); + + z = _mm_add_ps(pA, pB); + z = _mm_mul_ps(z, q); + z = _mm_add_ps(z, q); + + q2 = _mm_sub_ps(halfpi, z); + q2 = _mm_sub_ps(q2, z); + + z = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) ); + + mask = _mm_cmpgt_ps(xabs, limitlow); + z = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) ); + + z = _mm_xor_ps(z, sign); + + return z; +} + + +static __m128 +gmx_mm_acos_ps(__m128 x) +{ 
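+    /* Editor's note: computed via asin(). For |x| <= 0.5 the blends below select acos(x) = pi/2 - asin(x); + * for |x| > 0.5 they select acos(x) = 2*asin(sqrt((1-|x|)/2)) when x is positive, mirrored to + * pi - 2*asin(sqrt((1-|x|)/2)) when x is negative. + */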
+ const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) ); + const __m128 one_ps = _mm_set1_ps(1.0f); + const __m128 half_ps = _mm_set1_ps(0.5f); + const __m128 pi_ps = _mm_set1_ps(M_PI); + const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f); + + __m128 mask1; + __m128 mask2; + __m128 xabs; + __m128 z, z1, z2, z3; + + xabs = _mm_and_ps(x, signmask); + mask1 = _mm_cmpgt_ps(xabs, half_ps); + mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps()); + + z = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs)); + z = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z)); + z = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z); + + z = _mm_blendv_ps(x, z, mask1); + z = gmx_mm_asin_ps(z); + + z2 = _mm_add_ps(z, z); + z1 = _mm_sub_ps(pi_ps, z2); + z3 = _mm_sub_ps(halfpi_ps, z); + + z = _mm_blendv_ps(z1, z2, mask2); + z = _mm_blendv_ps(z3, z, mask1); + + return z; +} + + +static __m128 +gmx_mm_atan_ps(__m128 x) +{ + /* Same algorithm as cephes library */ + const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) ); + const __m128 limit1 = _mm_set1_ps(0.414213562373095f); + const __m128 limit2 = _mm_set1_ps(2.414213562373095f); + const __m128 quarterpi = _mm_set1_ps(0.785398163397448f); + const __m128 halfpi = _mm_set1_ps(1.570796326794896f); + const __m128 mone = _mm_set1_ps(-1.0f); + const __m128 CC3 = _mm_set1_ps(-3.33329491539E-1f); + const __m128 CC5 = _mm_set1_ps(1.99777106478E-1f); + const __m128 CC7 = _mm_set1_ps(-1.38776856032E-1); + const __m128 CC9 = _mm_set1_ps(8.05374449538e-2f); + + __m128 sign; + __m128 mask1, mask2; + __m128 y, z1, z2; + __m128 x2, x4; + __m128 sum1, sum2; + + sign = _mm_andnot_ps(signmask, x); + x = _mm_and_ps(x, signmask); + + mask1 = _mm_cmpgt_ps(x, limit1); + mask2 = _mm_cmpgt_ps(x, limit2); + + z1 = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone))); + z2 = _mm_mul_ps(mone, gmx_mm_inv_ps(x)); + + y = _mm_and_ps(mask1, quarterpi); + y = _mm_blendv_ps(y, halfpi, mask2); + + x = _mm_blendv_ps(x, z1, mask1); + x = _mm_blendv_ps(x, z2, mask2); + + x2 = _mm_mul_ps(x, x); + x4 = _mm_mul_ps(x2, x2); + + sum1 = _mm_mul_ps(CC9, x4); + sum2 = _mm_mul_ps(CC7, x4); + sum1 = _mm_add_ps(sum1, CC5); + sum2 = _mm_add_ps(sum2, CC3); + sum1 = _mm_mul_ps(sum1, x4); + sum2 = _mm_mul_ps(sum2, x2); + + sum1 = _mm_add_ps(sum1, sum2); + sum1 = _mm_sub_ps(sum1, mone); + sum1 = _mm_mul_ps(sum1, x); + y = _mm_add_ps(y, sum1); + + y = _mm_xor_ps(y, sign); + + return y; +} + + +static __m128 +gmx_mm_atan2_ps(__m128 y, __m128 x) +{ + const __m128 pi = _mm_set1_ps(M_PI); + const __m128 minuspi = _mm_set1_ps(-M_PI); + const __m128 halfpi = _mm_set1_ps(M_PI/2.0); + const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0); + + __m128 z, z1, z3, z4; + __m128 w; + __m128 maskx_lt, maskx_eq; + __m128 masky_lt, masky_eq; + __m128 mask1, mask2, mask3, mask4, maskall; + + maskx_lt = _mm_cmplt_ps(x, _mm_setzero_ps()); + masky_lt = _mm_cmplt_ps(y, _mm_setzero_ps()); + maskx_eq = _mm_cmpeq_ps(x, _mm_setzero_ps()); + masky_eq = _mm_cmpeq_ps(y, _mm_setzero_ps()); + + z = _mm_mul_ps(y, gmx_mm_inv_ps(x)); + z = gmx_mm_atan_ps(z); + + mask1 = _mm_and_ps(maskx_eq, masky_lt); + mask2 = _mm_andnot_ps(maskx_lt, masky_eq); + mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq); + mask4 = _mm_and_ps(masky_eq, maskx_lt); + + maskall = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) ); + + z = _mm_andnot_ps(maskall, z); + z1 = _mm_and_ps(mask1, minushalfpi); + z3 = _mm_and_ps(mask3, halfpi); + z4 = _mm_and_ps(mask4, pi); + + z = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) ); + + mask1 = 
_mm_andnot_ps(masky_lt, maskx_lt); + mask2 = _mm_and_ps(maskx_lt, masky_lt); + + w = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) ); + w = _mm_andnot_ps(maskall, w); + + z = _mm_add_ps(z, w); + + return z; +} + + + +#endif /* _gmx_math_x86_sse4_1_single_h_ */ diff --cc src/gromacs/mdlib/domdec.c index f7b0479d70,0000000000..a713fafc3d mode 100644,000000..100644 --- a/src/gromacs/mdlib/domdec.c +++ b/src/gromacs/mdlib/domdec.c @@@ -1,9723 -1,0 +1,9726 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This file is part of Gromacs Copyright (c) 1991-2008 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org + * + * And Hey: + * Gnomes, ROck Monsters And Chili Sauce + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <assert.h> +#include "typedefs.h" +#include "smalloc.h" +#include "gmx_fatal.h" +#include "gmx_fatal_collective.h" +#include "vec.h" +#include "domdec.h" +#include "domdec_network.h" +#include "nrnb.h" +#include "pbc.h" +#include "chargegroup.h" +#include "constr.h" +#include "mdatoms.h" +#include "names.h" +#include "pdbio.h" +#include "futil.h" +#include "force.h" +#include "pme.h" +#include "pull.h" +#include "pull_rotation.h" +#include "gmx_wallcycle.h" +#include "mdrun.h" +#include "nsgrid.h" +#include "shellfc.h" +#include "mtop_util.h" +#include "gmxfio.h" +#include "gmx_ga2la.h" +#include "gmx_sort.h" +#include "macros.h" +#include "nbnxn_search.h" +#include "bondf.h" +#include "gmx_omp_nthreads.h" + +#include "gromacs/utility/gmxmpi.h" + +#define DDRANK(dd, rank) (rank) +#define DDMASTERRANK(dd) (dd->masterrank) + +typedef struct gmx_domdec_master +{ + /* The cell boundaries */ + real **cell_x; + /* The global charge group division */ + int *ncg; /* Number of home charge groups for each node */ + int *index; /* Index of nnodes+1 into cg */ + int *cg; /* Global charge group index */ + int *nat; /* Number of home atoms for each node. */ + int *ibuf; /* Buffer for communication */ + rvec *vbuf; /* Buffer for state scattering and gathering */ +} gmx_domdec_master_t; + +typedef struct +{ + /* The numbers of charge groups to send and receive for each cell + * that requires communication, the last entry contains the total + * number of atoms that needs to be communicated. + */ + int nsend[DD_MAXIZONE+2]; + int nrecv[DD_MAXIZONE+2]; + /* The charge groups to send */ + int *index; + int nalloc; + /* The atom range for non-in-place communication */ + int cell2at0[DD_MAXIZONE]; + int cell2at1[DD_MAXIZONE]; +} gmx_domdec_ind_t; + +typedef struct +{ + int np; /* Number of grid pulses in this dimension */ + int np_dlb; /* For dlb, for use with edlbAUTO */ + gmx_domdec_ind_t *ind; /* The indices to communicate, size np */ + int np_nalloc; + gmx_bool bInPlace; /* Can we communicate in place? */ +} gmx_domdec_comm_dim_t; + +typedef struct +{ + gmx_bool *bCellMin; /* Temp. var.: is this cell size at the limit */ + real *cell_f; /* State var.: cell boundaries, box relative */ + real *old_cell_f; /* Temp. 
var.: old cell size */ + real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */ + real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */ + real *bound_min; /* Temp. var.: lower limit for cell boundary */ + real *bound_max; /* Temp. var.: upper limit for cell boundary */ + gmx_bool bLimited; /* State var.: is DLB limited in this dim and row */ + real *buf_ncd; /* Temp. var. */ +} gmx_domdec_root_t; + +#define DD_NLOAD_MAX 9 + +/* Here floats are accurate enough, since these variables + * only influence the load balancing, not the actual MD results. + */ +typedef struct +{ + int nload; + float *load; + float sum; + float max; + float sum_m; + float cvol_min; + float mdf; + float pme; + int flags; +} gmx_domdec_load_t; + +typedef struct +{ + int nsc; + int ind_gl; + int ind; +} gmx_cgsort_t; + +typedef struct +{ + gmx_cgsort_t *sort; + gmx_cgsort_t *sort2; + int sort_nalloc; + gmx_cgsort_t *sort_new; + int sort_new_nalloc; + int *ibuf; + int ibuf_nalloc; +} gmx_domdec_sort_t; + +typedef struct +{ + rvec *v; + int nalloc; +} vec_rvec_t; + +/* This enum determines the order of the coordinates. + * ddnatHOME and ddnatZONE should be first and second, + * the others can be ordered as wanted. + */ +enum { + ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR +}; + +enum { + edlbAUTO, edlbNO, edlbYES, edlbNR +}; +const char *edlb_names[edlbNR] = { "auto", "no", "yes" }; + +typedef struct +{ + int dim; /* The dimension */ + gmx_bool dim_match; /* Tells if DD and PME dims match */ + int nslab; /* The number of PME slabs in this dimension */ + real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB */ + int *pp_min; /* The minimum pp node location, size nslab */ + int *pp_max; /* The maximum pp node location, size nslab */ + int maxshift; /* The maximum shift for coordinate redistribution in PME */ +} gmx_ddpme_t; + +typedef struct +{ + real min0; /* The minimum bottom of this zone */ + real max1; /* The maximum top of this zone */ + real min1; /* The minimum top of this zone */ + real mch0; /* The maximum bottom communication height for this zone */ + real mch1; /* The maximum top communication height for this zone */ + real p1_0; /* The bottom value of the first cell in this zone */ + real p1_1; /* The top value of the first cell in this zone */ +} gmx_ddzone_t; + +typedef struct +{ + gmx_domdec_ind_t ind; + int *ibuf; + int ibuf_nalloc; + vec_rvec_t vbuf; + int nsend; + int nat; + int nsend_zone; +} dd_comm_setup_work_t; + +typedef struct gmx_domdec_comm +{ + /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing), + * unless stated otherwise. + */ + + /* The number of decomposition dimensions for PME, 0: no PME */ + int npmedecompdim; + /* The number of nodes doing PME (PP/PME or only PME) */ + int npmenodes; + int npmenodes_x; + int npmenodes_y; + /* The communication setup including the PME only nodes */ + gmx_bool bCartesianPP_PME; + ivec ntot; + int cartpmedim; + int *pmenodes; /* size npmenodes */ + int *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP + * but with bCartesianPP_PME */ + gmx_ddpme_t ddpme[2]; + + /* The DD particle-particle nodes only */ + gmx_bool bCartesianPP; + int *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */ + + /* The global charge groups */ + t_block cgs_gl; + + /* Should we sort the cgs */ + int nstSortCG; + gmx_domdec_sort_t *sort; + + /* Are there charge groups? */ + gmx_bool bCGs; + + /* Are there bonded and multi-body interactions between charge groups? 
*/ + gmx_bool bInterCGBondeds; + gmx_bool bInterCGMultiBody; + + /* Data for the optional bonded interaction atom communication range */ + gmx_bool bBondComm; + t_blocka *cglink; + char *bLocalCG; + + /* The DLB option */ + int eDLB; + /* Are we actually using DLB? */ + gmx_bool bDynLoadBal; + + /* Cell sizes for static load balancing, first index cartesian */ + real **slb_frac; + + /* The width of the communicated boundaries */ + real cutoff_mbody; + real cutoff; + /* The minimum cell size (including triclinic correction) */ + rvec cellsize_min; + /* For dlb, for use with edlbAUTO */ + rvec cellsize_min_dlb; + /* The lower limit for the DD cell size with DLB */ + real cellsize_limit; + /* Effectively no NB cut-off limit with DLB for systems without PBC? */ + gmx_bool bVacDLBNoLimit; + + /* With PME load balancing we set limits on DLB */ + gmx_bool bPMELoadBalDLBLimits; + /* DLB needs to take into account that we want to allow this maximum + * cut-off (for PME load balancing), this could limit cell boundaries. + */ + real PMELoadBal_max_cutoff; + + /* tric_dir is only stored here because dd_get_ns_ranges needs it */ + ivec tric_dir; + /* box0 and box_size are required with dims without pbc and -gcom */ + rvec box0; + rvec box_size; + + /* The cell boundaries */ + rvec cell_x0; + rvec cell_x1; + + /* The old location of the cell boundaries, to check cg displacements */ + rvec old_cell_x0; + rvec old_cell_x1; + + /* The communication setup and charge group boundaries for the zones */ + gmx_domdec_zones_t zones; + + /* The zone limits for DD dimensions 1 and 2 (not 0), determined from + * cell boundaries of neighboring cells for dynamic load balancing. + */ + gmx_ddzone_t zone_d1[2]; + gmx_ddzone_t zone_d2[2][2]; + + /* The coordinate/force communication setup and indices */ + gmx_domdec_comm_dim_t cd[DIM]; + /* The maximum number of cells to communicate with in one dimension */ + int maxpulse; + + /* Which cg distribution is stored on the master node */ + int master_cg_ddp_count; + + /* The number of cg's received from the direct neighbors */ + int zone_ncg1[DD_MAXZONE]; + + /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */ + int nat[ddnatNR]; + + /* Array for signalling if atoms have moved to another domain */ + int *moved; + int moved_nalloc; + + /* Communication buffer for general use */ + int *buf_int; + int nalloc_int; + + /* Communication buffer for general use */ + vec_rvec_t vbuf; + + /* Temporary storage for thread parallel communication setup */ + int nth; + dd_comm_setup_work_t *dth; + + /* Communication buffers only used with multiple grid pulses */ + int *buf_int2; + int nalloc_int2; + vec_rvec_t vbuf2; + + /* Communication buffers for local redistribution */ + int **cggl_flag; + int cggl_flag_nalloc[DIM*2]; + rvec **cgcm_state; + int cgcm_state_nalloc[DIM*2]; + + /* Cell sizes for dynamic load balancing */ + gmx_domdec_root_t **root; + real *cell_f_row; + real cell_f0[DIM]; + real cell_f1[DIM]; + real cell_f_max0[DIM]; + real cell_f_min1[DIM]; + + /* Stuff for load communication */ + gmx_bool bRecordLoad; + gmx_domdec_load_t *load; +#ifdef GMX_MPI + MPI_Comm *mpi_comm_load; +#endif + + /* Maximum DLB scaling per load balancing step in percent */ + int dlb_scale_lim; + + /* Cycle counters */ + float cycl[ddCyclNr]; + int cycl_n[ddCyclNr]; + float cycl_max[ddCyclNr]; + /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */ + int eFlop; + double flop; + int flop_n; + /* How often did we have load measurements */ + int n_load_have; + /* How often 
have we collected the load measurements */ + int n_load_collect; + + /* Statistics */ + double sum_nat[ddnatNR-ddnatZONE]; + int ndecomp; + int nload; + double load_step; + double load_sum; + double load_max; + ivec load_lim; + double load_mdf; + double load_pme; + + /* The last partition step */ + gmx_large_int_t partition_step; + + /* Debugging */ + int nstDDDump; + int nstDDDumpGrid; + int DD_debug; +} gmx_domdec_comm_t; + +/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */ +#define DD_CGIBS 2 + +/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */ +#define DD_FLAG_NRCG 65535 +#define DD_FLAG_FW(d) (1<<(16+(d)*2)) +#define DD_FLAG_BW(d) (1<<(16+(d)*2+1)) + +/* Zone permutation required to obtain consecutive charge groups + * for neighbor searching. + */ +static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} }; + +/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero + * components see only j zones with that component 0. + */ + +/* The DD zone order */ +static const ivec dd_zo[DD_MAXZONE] = +{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}}; + +/* The 3D setup */ +#define dd_z3n 8 +#define dd_zp3n 4 +static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}}; + +/* The 2D setup */ +#define dd_z2n 4 +#define dd_zp2n 2 +static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}}; + +/* The 1D setup */ +#define dd_z1n 2 +#define dd_zp1n 1 +static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}}; + +/* Factors used to avoid problems due to rounding issues */ +#define DD_CELL_MARGIN 1.0001 +#define DD_CELL_MARGIN2 1.00005 +/* Factor to account for pressure scaling during nstlist steps */ +#define DD_PRES_SCALE_MARGIN 1.02 + +/* Allowed performance loss before we DLB or warn */ +#define DD_PERF_LOSS 0.05 + +#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di)) + +/* Use separate MPI send and receive commands + * when nnodes <= GMX_DD_NNODES_SENDRECV. + * This saves memory (and some copying for small nnodes). + * For high parallelization scatter and gather calls are used. + */ +#define GMX_DD_NNODES_SENDRECV 4 + + +/* + #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX]) + + static void index2xyz(ivec nc,int ind,ivec xyz) + { + xyz[XX] = ind % nc[XX]; + xyz[YY] = (ind / nc[XX]) % nc[YY]; + xyz[ZZ] = ind / (nc[YY]*nc[XX]); + } + */ + +/* This order is required to minimize the coordinate communication in PME + * which uses decomposition in the x direction. 
+ */ +#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ]) + +static void ddindex2xyz(ivec nc, int ind, ivec xyz) +{ + xyz[XX] = ind / (nc[YY]*nc[ZZ]); + xyz[YY] = (ind / nc[ZZ]) % nc[YY]; + xyz[ZZ] = ind % nc[ZZ]; +} + +static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c) +{ + int ddindex; + int ddnodeid = -1; + + ddindex = dd_index(dd->nc, c); + if (dd->comm->bCartesianPP_PME) + { + ddnodeid = dd->comm->ddindex2ddnodeid[ddindex]; + } + else if (dd->comm->bCartesianPP) + { +#ifdef GMX_MPI + MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid); +#endif + } + else + { + ddnodeid = ddindex; + } + + return ddnodeid; +} + +static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir) +{ + return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir)); +} + +int ddglatnr(gmx_domdec_t *dd, int i) +{ + int atnr; + + if (dd == NULL) + { + atnr = i + 1; + } + else + { + if (i >= dd->comm->nat[ddnatNR-1]) + { + gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]); + } + atnr = dd->gatindex[i] + 1; + } + + return atnr; +} + +t_block *dd_charge_groups_global(gmx_domdec_t *dd) +{ + return &dd->comm->cgs_gl; +} + +static void vec_rvec_init(vec_rvec_t *v) +{ + v->nalloc = 0; + v->v = NULL; +} + +static void vec_rvec_check_alloc(vec_rvec_t *v, int n) +{ + if (n > v->nalloc) + { + v->nalloc = over_alloc_dd(n); + srenew(v->v, v->nalloc); + } +} + +void dd_store_state(gmx_domdec_t *dd, t_state *state) +{ + int i; + + if (state->ddp_count != dd->ddp_count) + { + gmx_incons("The state does not match the domain decomposition state"); + } + + state->ncg_gl = dd->ncg_home; + if (state->ncg_gl > state->cg_gl_nalloc) + { + state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl); + srenew(state->cg_gl, state->cg_gl_nalloc); + } + for (i = 0; i < state->ncg_gl; i++) + { + state->cg_gl[i] = dd->index_gl[i]; + } + + state->ddp_count_cg_gl = dd->ddp_count; +} + +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd) +{ + return &dd->comm->zones; +} + +void dd_get_ns_ranges(gmx_domdec_t *dd, int icg, + int *jcg0, int *jcg1, ivec shift0, ivec shift1) +{ + gmx_domdec_zones_t *zones; + int izone, d, dim; + + zones = &dd->comm->zones; + + izone = 0; + while (icg >= zones->izone[izone].cg1) + { + izone++; + } + + if (izone == 0) + { + *jcg0 = icg; + } + else if (izone < zones->nizone) + { + *jcg0 = zones->izone[izone].jcg0; + } + else + { + gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)", + icg, izone, zones->nizone); + } + + *jcg1 = zones->izone[izone].jcg1; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + shift0[dim] = zones->izone[izone].shift0[dim]; + shift1[dim] = zones->izone[izone].shift1[dim]; + if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0)) + { + /* A conservative approach, this can be optimized */ + shift0[dim] -= 1; + shift1[dim] += 1; + } + } +} + +int dd_natoms_vsite(gmx_domdec_t *dd) +{ + return dd->comm->nat[ddnatVSITE]; +} + +void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end) +{ + *at_start = dd->comm->nat[ddnatCON-1]; + *at_end = dd->comm->nat[ddnatCON]; +} + +void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[]) +{ + int nzone, nat_tot, n, d, p, i, j, at0, at1, zone; + int *index, *cgindex; + gmx_domdec_comm_t *comm; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + rvec shift = {0, 0, 0}, *buf, *rbuf; + gmx_bool bPBC, bScrew; + + comm = dd->comm; + + cgindex = dd->cgindex; + + buf = comm->vbuf.v; + + nzone = 1; + nat_tot = dd->nat_home; + for (d = 0; d < dd->ndim; d++) 
+ { + bPBC = (dd->ci[dd->dim[d]] == 0); + bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX); + if (bPBC) + { + copy_rvec(box[dd->dim[d]], shift); + } + cd = &comm->cd[d]; + for (p = 0; p < cd->np; p++) + { + ind = &cd->ind[p]; + index = ind->index; + n = 0; + if (!bPBC) + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + copy_rvec(x[j], buf[n]); + n++; + } + } + } + else if (!bScrew) + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + /* We need to shift the coordinates */ + rvec_add(x[j], shift, buf[n]); + n++; + } + } + } + else + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + /* Shift x */ + buf[n][XX] = x[j][XX] + shift[XX]; + /* Rotate y and z. + * This operation requires a special shift force + * treatment, which is performed in calc_vir. + */ + buf[n][YY] = box[YY][YY] - x[j][YY]; + buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ]; + n++; + } + } + } + + if (cd->bInPlace) + { + rbuf = x + nat_tot; + } + else + { + rbuf = comm->vbuf2.v; + } + /* Send and receive the coordinates */ + dd_sendrecv_rvec(dd, d, dddirBackward, + buf, ind->nsend[nzone+1], + rbuf, ind->nrecv[nzone+1]); + if (!cd->bInPlace) + { + j = 0; + for (zone = 0; zone < nzone; zone++) + { + for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++) + { + copy_rvec(rbuf[j], x[i]); + j++; + } + } + } + nat_tot += ind->nrecv[nzone+1]; + } + nzone += nzone; + } +} + +void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift) +{ + int nzone, nat_tot, n, d, p, i, j, at0, at1, zone; + int *index, *cgindex; + gmx_domdec_comm_t *comm; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + rvec *buf, *sbuf; + ivec vis; + int is; + gmx_bool bPBC, bScrew; + + comm = dd->comm; + + cgindex = dd->cgindex; + + buf = comm->vbuf.v; + + n = 0; + nzone = comm->zones.n/2; + nat_tot = dd->nat_tot; + for (d = dd->ndim-1; d >= 0; d--) + { + bPBC = (dd->ci[dd->dim[d]] == 0); + bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX); + if (fshift == NULL && !bScrew) + { + bPBC = FALSE; + } + /* Determine which shift vector we need */ + clear_ivec(vis); + vis[dd->dim[d]] = 1; + is = IVEC2IS(vis); + + cd = &comm->cd[d]; + for (p = cd->np-1; p >= 0; p--) + { + ind = &cd->ind[p]; + nat_tot -= ind->nrecv[nzone+1]; + if (cd->bInPlace) + { + sbuf = f + nat_tot; + } + else + { + sbuf = comm->vbuf2.v; + j = 0; + for (zone = 0; zone < nzone; zone++) + { + for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++) + { + copy_rvec(f[i], sbuf[j]); + j++; + } + } + } + /* Communicate the forces */ + dd_sendrecv_rvec(dd, d, dddirForward, + sbuf, ind->nrecv[nzone+1], + buf, ind->nsend[nzone+1]); + index = ind->index; + /* Add the received forces */ + n = 0; + if (!bPBC) + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + rvec_inc(f[j], buf[n]); + n++; + } + } + } + else if (!bScrew) + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + rvec_inc(f[j], buf[n]); + /* Add this force to the shift force */ + rvec_inc(fshift[is], buf[n]); + n++; + } + } + } + else + { + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + /* Rotate the force */ + f[j][XX] 
+= buf[n][XX]; + f[j][YY] -= buf[n][YY]; + f[j][ZZ] -= buf[n][ZZ]; + if (fshift) + { + /* Add this force to the shift force */ + rvec_inc(fshift[is], buf[n]); + } + n++; + } + } + } + } + nzone /= 2; + } +} + +void dd_atom_spread_real(gmx_domdec_t *dd, real v[]) +{ + int nzone, nat_tot, n, d, p, i, j, at0, at1, zone; + int *index, *cgindex; + gmx_domdec_comm_t *comm; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + real *buf, *rbuf; + + comm = dd->comm; + + cgindex = dd->cgindex; + + buf = &comm->vbuf.v[0][0]; + + nzone = 1; + nat_tot = dd->nat_home; + for (d = 0; d < dd->ndim; d++) + { + cd = &comm->cd[d]; + for (p = 0; p < cd->np; p++) + { + ind = &cd->ind[p]; + index = ind->index; + n = 0; + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + buf[n] = v[j]; + n++; + } + } + + if (cd->bInPlace) + { + rbuf = v + nat_tot; + } + else + { + rbuf = &comm->vbuf2.v[0][0]; + } + /* Send and receive the coordinates */ + dd_sendrecv_real(dd, d, dddirBackward, + buf, ind->nsend[nzone+1], + rbuf, ind->nrecv[nzone+1]); + if (!cd->bInPlace) + { + j = 0; + for (zone = 0; zone < nzone; zone++) + { + for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++) + { + v[i] = rbuf[j]; + j++; + } + } + } + nat_tot += ind->nrecv[nzone+1]; + } + nzone += nzone; + } +} + +void dd_atom_sum_real(gmx_domdec_t *dd, real v[]) +{ + int nzone, nat_tot, n, d, p, i, j, at0, at1, zone; + int *index, *cgindex; + gmx_domdec_comm_t *comm; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + real *buf, *sbuf; + + comm = dd->comm; + + cgindex = dd->cgindex; + + buf = &comm->vbuf.v[0][0]; + + n = 0; + nzone = comm->zones.n/2; + nat_tot = dd->nat_tot; + for (d = dd->ndim-1; d >= 0; d--) + { + cd = &comm->cd[d]; + for (p = cd->np-1; p >= 0; p--) + { + ind = &cd->ind[p]; + nat_tot -= ind->nrecv[nzone+1]; + if (cd->bInPlace) + { + sbuf = v + nat_tot; + } + else + { + sbuf = &comm->vbuf2.v[0][0]; + j = 0; + for (zone = 0; zone < nzone; zone++) + { + for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++) + { + sbuf[j] = v[i]; + j++; + } + } + } + /* Communicate the forces */ + dd_sendrecv_real(dd, d, dddirForward, + sbuf, ind->nrecv[nzone+1], + buf, ind->nsend[nzone+1]); + index = ind->index; + /* Add the received forces */ + n = 0; + for (i = 0; i < ind->nsend[nzone]; i++) + { + at0 = cgindex[index[i]]; + at1 = cgindex[index[i]+1]; + for (j = at0; j < at1; j++) + { + v[j] += buf[n]; + n++; + } + } + } + nzone /= 2; + } +} + +static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone) +{ + fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n", + d, i, j, + zone->min0, zone->max1, + zone->mch0, zone->mch1, + zone->p1_0, zone->p1_1); +} + + +#define DDZONECOMM_MAXZONE 5 +#define DDZONECOMM_BUFSIZE 3 + +static void dd_sendrecv_ddzone(const gmx_domdec_t *dd, + int ddimind, int direction, + gmx_ddzone_t *buf_s, int n_s, + gmx_ddzone_t *buf_r, int n_r) +{ +#define ZBS DDZONECOMM_BUFSIZE + rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS]; + rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS]; + int i; + + for (i = 0; i < n_s; i++) + { + vbuf_s[i*ZBS ][0] = buf_s[i].min0; + vbuf_s[i*ZBS ][1] = buf_s[i].max1; + vbuf_s[i*ZBS ][2] = buf_s[i].min1; + vbuf_s[i*ZBS+1][0] = buf_s[i].mch0; + vbuf_s[i*ZBS+1][1] = buf_s[i].mch1; + vbuf_s[i*ZBS+1][2] = 0; + vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0; + vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1; + vbuf_s[i*ZBS+2][2] = 0; + } + + dd_sendrecv_rvec(dd, ddimind, direction, + vbuf_s, 
n_s*ZBS, + vbuf_r, n_r*ZBS); + + for (i = 0; i < n_r; i++) + { + buf_r[i].min0 = vbuf_r[i*ZBS ][0]; + buf_r[i].max1 = vbuf_r[i*ZBS ][1]; + buf_r[i].min1 = vbuf_r[i*ZBS ][2]; + buf_r[i].mch0 = vbuf_r[i*ZBS+1][0]; + buf_r[i].mch1 = vbuf_r[i*ZBS+1][1]; + buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0]; + buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1]; + } + +#undef ZBS +} + +static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox, + rvec cell_ns_x0, rvec cell_ns_x1) +{ + int d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min; + gmx_ddzone_t *zp; + gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE]; + gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE]; + gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE]; + rvec extr_s[2], extr_r[2]; + rvec dh; + real dist_d, c = 0, det; + gmx_domdec_comm_t *comm; + gmx_bool bPBC, bUse; + + comm = dd->comm; + + for (d = 1; d < dd->ndim; d++) + { + dim = dd->dim[d]; + zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0]; + zp->min0 = cell_ns_x0[dim]; + zp->max1 = cell_ns_x1[dim]; + zp->min1 = cell_ns_x1[dim]; + zp->mch0 = cell_ns_x0[dim]; + zp->mch1 = cell_ns_x1[dim]; + zp->p1_0 = cell_ns_x0[dim]; + zp->p1_1 = cell_ns_x1[dim]; + } + + for (d = dd->ndim-2; d >= 0; d--) + { + dim = dd->dim[d]; + bPBC = (dim < ddbox->npbcdim); + + /* Use an rvec to store three reals */ + extr_s[d][0] = comm->cell_f0[d+1]; + extr_s[d][1] = comm->cell_f1[d+1]; + extr_s[d][2] = comm->cell_f1[d+1]; + + pos = 0; + /* Store the extremes in the backward sending buffer, + * so they get updated separately from the forward communication. + */ + for (d1 = d; d1 < dd->ndim-1; d1++) + { + /* We invert the order to be able to use the same loop for buf_e */ + buf_s[pos].min0 = extr_s[d1][1]; + buf_s[pos].max1 = extr_s[d1][0]; + buf_s[pos].min1 = extr_s[d1][2]; + buf_s[pos].mch0 = 0; + buf_s[pos].mch1 = 0; + /* Store the cell corner of the dimension we communicate along */ + buf_s[pos].p1_0 = comm->cell_x0[dim]; + buf_s[pos].p1_1 = 0; + pos++; + } + + buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0]; + pos++; + + if (dd->ndim == 3 && d == 0) + { + buf_s[pos] = comm->zone_d2[0][1]; + pos++; + buf_s[pos] = comm->zone_d1[0]; + pos++; + } + + /* We only need to communicate the extremes + * in the forward direction + */ + npulse = comm->cd[d].np; + if (bPBC) + { + /* Take the minimum to avoid double communication */ + npulse_min = min(npulse, dd->nc[dim]-1-npulse); + } + else + { + /* Without PBC we should really not communicate over + * the boundaries, but implementing that complicates + * the communication setup and therefore we simply + * do all communication, but ignore some data. 
+ */ + npulse_min = npulse; + } + for (p = 0; p < npulse_min; p++) + { + /* Communicate the extremes forward */ + bUse = (bPBC || dd->ci[dim] > 0); + + dd_sendrecv_rvec(dd, d, dddirForward, + extr_s+d, dd->ndim-d-1, + extr_r+d, dd->ndim-d-1); + + if (bUse) + { + for (d1 = d; d1 < dd->ndim-1; d1++) + { + extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]); + extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]); + extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]); + } + } + } + + buf_size = pos; + for (p = 0; p < npulse; p++) + { + /* Communicate all the zone information backward */ + bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1); + + dd_sendrecv_ddzone(dd, d, dddirBackward, + buf_s, buf_size, + buf_r, buf_size); + + clear_rvec(dh); + if (p > 0) + { + for (d1 = d+1; d1 < dd->ndim; d1++) + { + /* Determine the decrease of maximum required + * communication height along d1 due to the distance along d, + * this avoids a lot of useless atom communication. + */ + dist_d = comm->cell_x1[dim] - buf_r[0].p1_0; + + if (ddbox->tric_dir[dim]) + { + /* c is the off-diagonal coupling between the cell planes + * along directions d and d1. + */ + c = ddbox->v[dim][dd->dim[d1]][dim]; + } + else + { + c = 0; + } + det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d; + if (det > 0) + { + dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c); + } + else + { + /* A negative value signals out of range */ + dh[d1] = -1; + } + } + } + + /* Accumulate the extremes over all pulses */ + for (i = 0; i < buf_size; i++) + { + if (p == 0) + { + buf_e[i] = buf_r[i]; + } + else + { + if (bUse) + { + buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0); + buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1); + buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1); + } + + if (dd->ndim == 3 && d == 0 && i == buf_size - 1) + { + d1 = 1; + } + else + { + d1 = d + 1; + } + if (bUse && dh[d1] >= 0) + { + buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]); + buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]); + } + } + /* Copy the received buffer to the send buffer, + * to pass the data through with the next pulse. 
+ */ + buf_s[i] = buf_r[i]; + } + if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) || + (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1)) + { + /* Store the extremes */ + pos = 0; + + for (d1 = d; d1 < dd->ndim-1; d1++) + { + extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0); + extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1); + extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1); + pos++; + } + + if (d == 1 || (d == 0 && dd->ndim == 3)) + { + for (i = d; i < 2; i++) + { + comm->zone_d2[1-d][i] = buf_e[pos]; + pos++; + } + } + if (d == 0) + { + comm->zone_d1[1] = buf_e[pos]; + pos++; + } + } + } + } + + if (dd->ndim >= 2) + { + dim = dd->dim[1]; + for (i = 0; i < 2; i++) + { + if (debug) + { + print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]); + } + cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0); + cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1); + } + } + if (dd->ndim >= 3) + { + dim = dd->dim[2]; + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + if (debug) + { + print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]); + } + cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0); + cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1); + } + } + } + for (d = 1; d < dd->ndim; d++) + { + comm->cell_f_max0[d] = extr_s[d-1][0]; + comm->cell_f_min1[d] = extr_s[d-1][1]; + if (debug) + { + fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n", + d, comm->cell_f_max0[d], comm->cell_f_min1[d]); + } + } +} + +static void dd_collect_cg(gmx_domdec_t *dd, + t_state *state_local) +{ + gmx_domdec_master_t *ma = NULL; + int buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0; + t_block *cgs_gl; + + if (state_local->ddp_count == dd->comm->master_cg_ddp_count) + { + /* The master has the correct distribution */ + return; + } + + if (state_local->ddp_count == dd->ddp_count) + { + ncg_home = dd->ncg_home; + cg = dd->index_gl; + nat_home = dd->nat_home; + } + else if (state_local->ddp_count_cg_gl == state_local->ddp_count) + { + cgs_gl = &dd->comm->cgs_gl; + + ncg_home = state_local->ncg_gl; + cg = state_local->cg_gl; + nat_home = 0; + for (i = 0; i < ncg_home; i++) + { + nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]]; + } + } + else + { + gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown"); + } + + buf2[0] = dd->ncg_home; + buf2[1] = dd->nat_home; + if (DDMASTER(dd)) + { + ma = dd->ma; + ibuf = ma->ibuf; + } + else + { + ibuf = NULL; + } + /* Collect the charge group and atom counts on the master */ + dd_gather(dd, 2*sizeof(int), buf2, ibuf); + + if (DDMASTER(dd)) + { + ma->index[0] = 0; + for (i = 0; i < dd->nnodes; i++) + { + ma->ncg[i] = ma->ibuf[2*i]; + ma->nat[i] = ma->ibuf[2*i+1]; + ma->index[i+1] = ma->index[i] + ma->ncg[i]; + + } + /* Make byte counts and indices */ + for (i = 0; i < dd->nnodes; i++) + { + ma->ibuf[i] = ma->ncg[i]*sizeof(int); + ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int); + } + if (debug) + { + fprintf(debug, "Initial charge group distribution: "); + for (i = 0; i < dd->nnodes; i++) + { + fprintf(debug, " %d", ma->ncg[i]); + } + fprintf(debug, "\n"); + } + } + + /* Collect the charge group indices on the master */ + dd_gatherv(dd, + dd->ncg_home*sizeof(int), dd->index_gl, + DDMASTER(dd) ? ma->ibuf : NULL, + DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL, + DDMASTER(dd) ? 
ma->cg : NULL); + + dd->comm->master_cg_ddp_count = state_local->ddp_count; +} + +static void dd_collect_vec_sendrecv(gmx_domdec_t *dd, + rvec *lv, rvec *v) +{ + gmx_domdec_master_t *ma; + int n, i, c, a, nalloc = 0; + rvec *buf = NULL; + t_block *cgs_gl; + + ma = dd->ma; + + if (!DDMASTER(dd)) + { +#ifdef GMX_MPI + MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd), + dd->rank, dd->mpi_comm_all); +#endif + } + else + { + /* Copy the master coordinates to the global array */ + cgs_gl = &dd->comm->cgs_gl; + + n = DDMASTERRANK(dd); + a = 0; + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++) + { + copy_rvec(lv[a++], v[c]); + } + } + + for (n = 0; n < dd->nnodes; n++) + { + if (n != dd->rank) + { + if (ma->nat[n] > nalloc) + { + nalloc = over_alloc_dd(ma->nat[n]); + srenew(buf, nalloc); + } +#ifdef GMX_MPI + MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n), + n, dd->mpi_comm_all, MPI_STATUS_IGNORE); +#endif + a = 0; + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++) + { + copy_rvec(buf[a++], v[c]); + } + } + } + } + sfree(buf); + } +} + +static void get_commbuffer_counts(gmx_domdec_t *dd, + int **counts, int **disps) +{ + gmx_domdec_master_t *ma; + int n; + + ma = dd->ma; + + /* Make the rvec count and displacement arrays */ + *counts = ma->ibuf; + *disps = ma->ibuf + dd->nnodes; + for (n = 0; n < dd->nnodes; n++) + { + (*counts)[n] = ma->nat[n]*sizeof(rvec); + (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]); + } +} + +static void dd_collect_vec_gatherv(gmx_domdec_t *dd, + rvec *lv, rvec *v) +{ + gmx_domdec_master_t *ma; + int *rcounts = NULL, *disps = NULL; + int n, i, c, a; + rvec *buf = NULL; + t_block *cgs_gl; + + ma = dd->ma; + + if (DDMASTER(dd)) + { + get_commbuffer_counts(dd, &rcounts, &disps); + + buf = ma->vbuf; + } + + dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf); + + if (DDMASTER(dd)) + { + cgs_gl = &dd->comm->cgs_gl; + + a = 0; + for (n = 0; n < dd->nnodes; n++) + { + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++) + { + copy_rvec(buf[a++], v[c]); + } + } + } + } +} + +void dd_collect_vec(gmx_domdec_t *dd, + t_state *state_local, rvec *lv, rvec *v) +{ + gmx_domdec_master_t *ma; + int n, i, c, a, nalloc = 0; + rvec *buf = NULL; + + dd_collect_cg(dd, state_local); + + if (dd->nnodes <= GMX_DD_NNODES_SENDRECV) + { + dd_collect_vec_sendrecv(dd, lv, v); + } + else + { + dd_collect_vec_gatherv(dd, lv, v); + } +} + + +void dd_collect_state(gmx_domdec_t *dd, + t_state *state_local, t_state *state) +{ + int est, i, j, nh; + + nh = state->nhchainlength; + + if (DDMASTER(dd)) + { + for (i = 0; i < efptNR; i++) + { + state->lambda[i] = state_local->lambda[i]; + } + state->fep_state = state_local->fep_state; + state->veta = state_local->veta; + state->vol0 = state_local->vol0; + copy_mat(state_local->box, state->box); + copy_mat(state_local->boxv, state->boxv); + copy_mat(state_local->svir_prev, state->svir_prev); + copy_mat(state_local->fvir_prev, state->fvir_prev); + copy_mat(state_local->pres_prev, state->pres_prev); + + + for (i = 0; i < state_local->ngtc; i++) + { + for (j = 0; j < nh; j++) + { + state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j]; + state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j]; + } + state->therm_integral[i] = state_local->therm_integral[i]; + } 
+ for (i = 0; i < state_local->nnhpres; i++) + { + for (j = 0; j < nh; j++) + { + state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j]; + state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j]; + } + } + } + for (est = 0; est < estNR; est++) + { + if (EST_DISTR(est) && (state_local->flags & (1<<est))) + { + switch (est) + { + case estX: + dd_collect_vec(dd, state_local, state_local->x, state->x); + break; + case estV: + dd_collect_vec(dd, state_local, state_local->v, state->v); + break; + case estSDX: + dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X); + break; + case estCGP: + dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p); + break; + case estLD_RNG: + if (state->nrngi == 1) + { + if (DDMASTER(dd)) + { + for (i = 0; i < state_local->nrng; i++) + { + state->ld_rng[i] = state_local->ld_rng[i]; + } + } + } + else + { + dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]), + state_local->ld_rng, state->ld_rng); + } + break; + case estLD_RNGI: + if (state->nrngi == 1) + { + if (DDMASTER(dd)) + { + state->ld_rngi[0] = state_local->ld_rngi[0]; + } + } + else + { + dd_gather(dd, sizeof(state->ld_rngi[0]), + state_local->ld_rngi, state->ld_rngi); + } + break; + case estDISRE_INITF: + case estDISRE_RM3TAV: + case estORIRE_INITF: + case estORIRE_DTAV: + break; + default: + gmx_incons("Unknown state entry encountered in dd_collect_state"); + } + } + } +} + +static void dd_realloc_state(t_state *state, rvec **f, int nalloc) +{ + int est; + + if (debug) + { + fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc)); + } + + state->nalloc = over_alloc_dd(nalloc); + + for (est = 0; est < estNR; est++) + { + if (EST_DISTR(est) && (state->flags & (1<<est))) + { + switch (est) + { + case estX: + srenew(state->x, state->nalloc); + break; + case estV: + srenew(state->v, state->nalloc); + break; + case estSDX: + srenew(state->sd_X, state->nalloc); + break; + case estCGP: + srenew(state->cg_p, state->nalloc); + break; + case estLD_RNG: + case estLD_RNGI: + case estDISRE_INITF: + case estDISRE_RM3TAV: + case estORIRE_INITF: + case estORIRE_DTAV: + /* No reallocation required */ + break; + default: + gmx_incons("Unknown state entry encountered in dd_realloc_state"); + } + } + } + + if (f != NULL) + { + srenew(*f, state->nalloc); + } +} + +static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f, + int nalloc) +{ + if (nalloc > fr->cg_nalloc) + { + if (debug) + { + fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc)); + } + fr->cg_nalloc = over_alloc_dd(nalloc); + srenew(fr->cginfo, fr->cg_nalloc); + if (fr->cutoff_scheme == ecutsGROUP) + { + srenew(fr->cg_cm, fr->cg_nalloc); + } + } + if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc) + { + /* We don't use charge groups, we use x in state to set up + * the atom communication. 
+ */ + dd_realloc_state(state, f, nalloc); + } +} + +static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs, + rvec *v, rvec *lv) +{ + gmx_domdec_master_t *ma; + int n, i, c, a, nalloc = 0; + rvec *buf = NULL; + + if (DDMASTER(dd)) + { + ma = dd->ma; + + for (n = 0; n < dd->nnodes; n++) + { + if (n != dd->rank) + { + if (ma->nat[n] > nalloc) + { + nalloc = over_alloc_dd(ma->nat[n]); + srenew(buf, nalloc); + } + /* Use lv as a temporary buffer */ + a = 0; + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++) + { + copy_rvec(v[c], buf[a++]); + } + } + if (a != ma->nat[n]) + { + gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)", + a, ma->nat[n]); + } + +#ifdef GMX_MPI + MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, + DDRANK(dd, n), n, dd->mpi_comm_all); +#endif + } + } + sfree(buf); + n = DDMASTERRANK(dd); + a = 0; + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++) + { + copy_rvec(v[c], lv[a++]); + } + } + } + else + { +#ifdef GMX_MPI + MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd), + MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE); +#endif + } +} + +static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs, + rvec *v, rvec *lv) +{ + gmx_domdec_master_t *ma; + int *scounts = NULL, *disps = NULL; + int n, i, c, a, nalloc = 0; + rvec *buf = NULL; + + if (DDMASTER(dd)) + { + ma = dd->ma; + + get_commbuffer_counts(dd, &scounts, &disps); + + buf = ma->vbuf; + a = 0; + for (n = 0; n < dd->nnodes; n++) + { + for (i = ma->index[n]; i < ma->index[n+1]; i++) + { + for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++) + { + copy_rvec(v[c], buf[a++]); + } + } + } + } + + dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv); +} + +static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv) +{ + if (dd->nnodes <= GMX_DD_NNODES_SENDRECV) + { + dd_distribute_vec_sendrecv(dd, cgs, v, lv); + } + else + { + dd_distribute_vec_scatterv(dd, cgs, v, lv); + } +} + +static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs, + t_state *state, t_state *state_local, + rvec **f) +{ + int i, j, nh; + + nh = state->nhchainlength; + + if (DDMASTER(dd)) + { + for (i = 0; i < efptNR; i++) + { + state_local->lambda[i] = state->lambda[i]; + } + state_local->fep_state = state->fep_state; + state_local->veta = state->veta; + state_local->vol0 = state->vol0; + copy_mat(state->box, state_local->box); + copy_mat(state->box_rel, state_local->box_rel); + copy_mat(state->boxv, state_local->boxv); + copy_mat(state->svir_prev, state_local->svir_prev); + copy_mat(state->fvir_prev, state_local->fvir_prev); + for (i = 0; i < state_local->ngtc; i++) + { + for (j = 0; j < nh; j++) + { + state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j]; + state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j]; + } + state_local->therm_integral[i] = state->therm_integral[i]; + } + for (i = 0; i < state_local->nnhpres; i++) + { + for (j = 0; j < nh; j++) + { + state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j]; + state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j]; + } + } + } + dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda); + dd_bcast(dd, sizeof(int), &state_local->fep_state); + dd_bcast(dd, sizeof(real), &state_local->veta); + dd_bcast(dd, sizeof(real), &state_local->vol0); + dd_bcast(dd, sizeof(state_local->box), state_local->box); + dd_bcast(dd, 
             sizeof(state_local->box_rel), state_local->box_rel);
+    dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
+    dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
+    dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
+    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
+    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
+    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
+    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
+    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
+
+    if (dd->nat_home > state_local->nalloc)
+    {
+        dd_realloc_state(state_local, f, dd->nat_home);
+    }
+    for (i = 0; i < estNR; i++)
+    {
+        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
+        {
+            switch (i)
+            {
+                case estX:
+                    dd_distribute_vec(dd, cgs, state->x, state_local->x);
+                    break;
+                case estV:
+                    dd_distribute_vec(dd, cgs, state->v, state_local->v);
+                    break;
+                case estSDX:
+                    dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
+                    break;
+                case estCGP:
+                    dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
+                    break;
+                case estLD_RNG:
+                    if (state->nrngi == 1)
+                    {
+                        dd_bcastc(dd,
+                                  state_local->nrng*sizeof(state_local->ld_rng[0]),
+                                  state->ld_rng, state_local->ld_rng);
+                    }
+                    else
+                    {
+                        dd_scatter(dd,
+                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
+                                   state->ld_rng, state_local->ld_rng);
+                    }
+                    break;
+                case estLD_RNGI:
+                    if (state->nrngi == 1)
+                    {
+                        dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
+                                  state->ld_rngi, state_local->ld_rngi);
+                    }
+                    else
+                    {
+                        dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
+                                   state->ld_rngi, state_local->ld_rngi);
+                    }
+                    break;
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* Not implemented yet */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in dd_distribute_state");
+            }
+        }
+    }
+}
+
+static char dim2char(int dim)
+{
+    char c = '?';
+
+    switch (dim)
+    {
+        case XX: c = 'X'; break;
+        case YY: c = 'Y'; break;
+        case ZZ: c = 'Z'; break;
+        default: gmx_fatal(FARGS, "Unknown dim %d", dim);
+    }
+
+    return c;
+}
+
+static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
+                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
+{
+    rvec   grid_s[2], *grid_r = NULL, cx, r;
+    char   fname[STRLEN], format[STRLEN], buf[22];
+    FILE  *out;
+    int    a, i, d, z, y, x;
+    matrix tric;
+    real   vol;
+
+    copy_rvec(dd->comm->cell_x0, grid_s[0]);
+    copy_rvec(dd->comm->cell_x1, grid_s[1]);
+
+    if (DDMASTER(dd))
+    {
+        snew(grid_r, 2*dd->nnodes);
+    }
+
+    dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
+
+    if (DDMASTER(dd))
+    {
+        for (d = 0; d < DIM; d++)
+        {
+            for (i = 0; i < DIM; i++)
+            {
+                if (d == i)
+                {
+                    tric[d][i] = 1;
+                }
+                else
+                {
+                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
+                    {
+                        tric[d][i] = box[i][d]/box[i][i];
+                    }
+                    else
+                    {
+                        tric[d][i] = 0;
+                    }
+                }
+            }
+        }
+        sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
+        sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
+        out = gmx_fio_fopen(fname, "w");
+        gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
+        a = 1;
+        for (i = 0; i < dd->nnodes; i++)
+        {
+            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
+            for (d = 0; d < DIM; d++)
+            {
+                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
+            }
+            for (z = 0; z < 2; z++)
+            {
+                for (y = 0; y < 2; y++)
+                {
+                    for (x = 0; x < 2; x++)
+                    {
+                        cx[XX] = grid_r[i*2+x][XX];
+                        cx[YY] = grid_r[i*2+y][YY];
+                        cx[ZZ] = grid_r[i*2+z][ZZ];
+                        mvmul(tric, cx, r);
+                        fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
+                                10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
+                    }
+                }
+            }
+            for (d = 0; d < DIM; d++)
+            {
+                for (x = 0; x < 4; x++)
+                {
+                    switch (d)
+                    {
+                        case 0: y = 1 + i*8 + 2*x; break;
+                        case 1: y = 1 + i*8 + 2*x - (x % 2); break;
+                        case 2: y = 1 + i*8 + x; break;
+                    }
+                    fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
+                }
+            }
+        }
+        gmx_fio_fclose(out);
+        sfree(grid_r);
+    }
+}
+
+void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
+                  gmx_mtop_t *mtop, t_commrec *cr,
+                  int natoms, rvec x[], matrix box)
+{
+    char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
+    FILE         *out;
+    int           i, ii, resnr, c;
+    char         *atomname, *resname;
+    real          b;
+    gmx_domdec_t *dd;
+
+    dd = cr->dd;
+    if (natoms == -1)
+    {
+        natoms = dd->comm->nat[ddnatVSITE];
+    }
+
+    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
+
+    sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
+    sprintf(format4, "%s%s\n", get_pdbformat4(), "%6.2f%6.2f");
+
+    out = gmx_fio_fopen(fname, "w");
+
+    fprintf(out, "TITLE     %s\n", title);
+    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
+    for (i = 0; i < natoms; i++)
+    {
+        ii = dd->gatindex[i];
+        gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
+        if (i < dd->comm->nat[ddnatZONE])
+        {
+            c = 0;
+            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
+            {
+                c++;
+            }
+            b = c;
+        }
+        else if (i < dd->comm->nat[ddnatVSITE])
+        {
+            b = dd->comm->zones.n;
+        }
+        else
+        {
+            b = dd->comm->zones.n + 1;
+        }
+        fprintf(out, strlen(atomname) < 4 ? format : format4,
+                "ATOM", (ii+1)%100000,
+                atomname, resname, ' ', resnr%10000, ' ',
+                10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
+    }
+    fprintf(out, "TER\n");
+
+    gmx_fio_fclose(out);
+}
+
+real dd_cutoff_mbody(gmx_domdec_t *dd)
+{
+    gmx_domdec_comm_t *comm;
+    int                di;
+    real               r;
+
+    comm = dd->comm;
+
+    r = -1;
+    if (comm->bInterCGBondeds)
+    {
+        if (comm->cutoff_mbody > 0)
+        {
+            r = comm->cutoff_mbody;
+        }
+        else
+        {
+            /* cutoff_mbody=0 means we do not have DLB */
+            r = comm->cellsize_min[dd->dim[0]];
+            for (di = 1; di < dd->ndim; di++)
+            {
+                r = min(r, comm->cellsize_min[dd->dim[di]]);
+            }
+            if (comm->bBondComm)
+            {
+                r = max(r, comm->cutoff_mbody);
+            }
+            else
+            {
+                r = min(r, comm->cutoff);
+            }
+        }
+    }
+
+    return r;
+}
+
+real dd_cutoff_twobody(gmx_domdec_t *dd)
+{
+    real r_mb;
+
+    r_mb = dd_cutoff_mbody(dd);
+
+    return max(dd->comm->cutoff, r_mb);
+}
+
+
+static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
+{
+    int nc, ntot;
+
+    nc   = dd->nc[dd->comm->cartpmedim];
+    ntot = dd->comm->ntot[dd->comm->cartpmedim];
+    copy_ivec(coord, coord_pme);
+    coord_pme[dd->comm->cartpmedim] =
+        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
+}
+
+static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
+{
+    /* Here we assign a PME node to communicate with this DD node
+     * by assuming that the major index of both is x.
+     * We add cr->npmenodes/2 to obtain an even distribution.
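
The integer arithmetic in low_ddindex2pmeindex is easiest to check with concrete numbers. A standalone sketch; the rank counts below are made-up test values, not anything from this patch:

    #include <stdio.h>

    /* PP rank 'ddindex' out of ndd ranks is served by PME rank
     * (ddindex*npme + npme/2)/ndd; the npme/2 term centres the rounding. */
    static int pp2pme(int ndd, int npme, int ddindex)
    {
        return (ddindex*npme + npme/2)/ndd;
    }

    int main(void)
    {
        int ndd = 8, npme = 3, i;

        for (i = 0; i < ndd; i++)
        {
            printf("PP %d -> PME %d\n", i, pp2pme(ndd, npme, i));
        }
        return 0;
    }

For ndd = 8 and npme = 3 this prints the assignment 0,0,0,1,1,2,2,2: the PP ranks are spread over the PME ranks as evenly as integer division allows.
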
+ */ + return (ddindex*npme + npme/2)/ndd; +} + +static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex) +{ + return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex); +} + +static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex) +{ + return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex); +} + +static int *dd_pmenodes(t_commrec *cr) +{ + int *pmenodes; + int n, i, p0, p1; + + snew(pmenodes, cr->npmenodes); + n = 0; + for (i = 0; i < cr->dd->nnodes; i++) + { + p0 = cr_ddindex2pmeindex(cr, i); + p1 = cr_ddindex2pmeindex(cr, i+1); + if (i+1 == cr->dd->nnodes || p1 > p0) + { + if (debug) + { + fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n); + } + pmenodes[n] = i + 1 + n; + n++; + } + } + + return pmenodes; +} + +static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z) +{ + gmx_domdec_t *dd; + ivec coords, coords_pme, nc; + int slab; + + dd = cr->dd; + /* + if (dd->comm->bCartesian) { + gmx_ddindex2xyz(dd->nc,ddindex,coords); + dd_coords2pmecoords(dd,coords,coords_pme); + copy_ivec(dd->ntot,nc); + nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim]; + coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim]; + + slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ]; + } else { + slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes; + } + */ + coords[XX] = x; + coords[YY] = y; + coords[ZZ] = z; + slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords)); + + return slab; +} + +static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z) +{ + gmx_domdec_comm_t *comm; + ivec coords; + int ddindex, nodeid = -1; + + comm = cr->dd->comm; + + coords[XX] = x; + coords[YY] = y; + coords[ZZ] = z; + if (comm->bCartesianPP_PME) + { +#ifdef GMX_MPI + MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid); +#endif + } + else + { + ddindex = dd_index(cr->dd->nc, coords); + if (comm->bCartesianPP) + { + nodeid = comm->ddindex2simnodeid[ddindex]; + } + else + { + if (comm->pmenodes) + { + nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z); + } + else + { + nodeid = ddindex; + } + } + } + + return nodeid; +} + +static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + ivec coord, coord_pme; + int i; + int pmenode = -1; + + dd = cr->dd; + comm = dd->comm; + + /* This assumes a uniform x domain decomposition grid cell size */ + if (comm->bCartesianPP_PME) + { +#ifdef GMX_MPI + MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord); + if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim]) + { + /* This is a PP node */ + dd_cart_coord2pmecoord(dd, coord, coord_pme); + MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode); + } +#endif + } + else if (comm->bCartesianPP) + { + if (sim_nodeid < dd->nnodes) + { + pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid); + } + } + else + { + /* This assumes DD cells with identical x coordinates + * are numbered sequentially. 
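
For the non-Cartesian case handled next, the PME peer of a PP rank is found by scanning the ordered list of PME-only ranks for the first entry above it. A minimal sketch of that lookup with illustrative names; the real code walks dd->comm->pmenodes in the same way:

    /* pmenodes[] holds the global ranks of the npme PME-only nodes in
     * increasing order. Returns the PME peer of 'sim_nodeid', or -1. */
    int peer_pmenode(const int *pmenodes, int npme, int sim_nodeid)
    {
        int i = 0;

        while (i < npme && sim_nodeid > pmenodes[i])
        {
            i++;
        }
        /* A PME-only rank appears in the list itself and has no peer */
        return (i < npme && sim_nodeid < pmenodes[i]) ? pmenodes[i] : -1;
    }
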
+ */ + if (dd->comm->pmenodes == NULL) + { + if (sim_nodeid < dd->nnodes) + { + /* The DD index equals the nodeid */ + pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid); + } + } + else + { + i = 0; + while (sim_nodeid > dd->comm->pmenodes[i]) + { + i++; + } + if (sim_nodeid < dd->comm->pmenodes[i]) + { + pmenode = dd->comm->pmenodes[i]; + } + } + } + + return pmenode; +} + +void get_pme_nnodes(const gmx_domdec_t *dd, + int *npmenodes_x, int *npmenodes_y) +{ + if (dd != NULL) + { + *npmenodes_x = dd->comm->npmenodes_x; + *npmenodes_y = dd->comm->npmenodes_y; + } + else + { + *npmenodes_x = 1; + *npmenodes_y = 1; + } +} + +gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid) +{ + gmx_bool bPMEOnlyNode; + + if (DOMAINDECOMP(cr)) + { + bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1); + } + else + { + bPMEOnlyNode = FALSE; + } + + return bPMEOnlyNode; +} + +void get_pme_ddnodes(t_commrec *cr, int pmenodeid, + int *nmy_ddnodes, int **my_ddnodes, int *node_peer) +{ + gmx_domdec_t *dd; + int x, y, z; + ivec coord, coord_pme; + + dd = cr->dd; + + snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes); + + *nmy_ddnodes = 0; + for (x = 0; x < dd->nc[XX]; x++) + { + for (y = 0; y < dd->nc[YY]; y++) + { + for (z = 0; z < dd->nc[ZZ]; z++) + { + if (dd->comm->bCartesianPP_PME) + { + coord[XX] = x; + coord[YY] = y; + coord[ZZ] = z; + dd_cart_coord2pmecoord(dd, coord, coord_pme); + if (dd->ci[XX] == coord_pme[XX] && + dd->ci[YY] == coord_pme[YY] && + dd->ci[ZZ] == coord_pme[ZZ]) + { + (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z); + } + } + else + { + /* The slab corresponds to the nodeid in the PME group */ + if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid) + { + (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z); + } + } + } + } + } + + /* The last PP-only node is the peer node */ + *node_peer = (*my_ddnodes)[*nmy_ddnodes-1]; + + if (debug) + { + fprintf(debug, "Receive coordinates from PP nodes:"); + for (x = 0; x < *nmy_ddnodes; x++) + { + fprintf(debug, " %d", (*my_ddnodes)[x]); + } + fprintf(debug, "\n"); + } +} + +static gmx_bool receive_vir_ener(t_commrec *cr) +{ + gmx_domdec_comm_t *comm; + int pmenode, coords[DIM], rank; + gmx_bool bReceive; + + bReceive = TRUE; + if (cr->npmenodes < cr->dd->nnodes) + { + comm = cr->dd->comm; + if (comm->bCartesianPP_PME) + { + pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid); +#ifdef GMX_MPI + MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords); + coords[comm->cartpmedim]++; + if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim]) + { + MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank); + if (dd_simnode2pmenode(cr, rank) == pmenode) + { + /* This is not the last PP node for pmenode */ + bReceive = FALSE; + } + } +#endif + } + else + { + pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid); + if (cr->sim_nodeid+1 < cr->nnodes && + dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode) + { + /* This is not the last PP node for pmenode */ + bReceive = FALSE; + } + } + } + + return bReceive; +} + +static void set_zones_ncg_home(gmx_domdec_t *dd) +{ + gmx_domdec_zones_t *zones; + int i; + + zones = &dd->comm->zones; + + zones->cg_range[0] = 0; + for (i = 1; i < zones->n+1; i++) + { + zones->cg_range[i] = dd->ncg_home; + } ++ /* zone_ncg1[0] should always be equal to ncg_home */ ++ dd->comm->zone_ncg1[0] = dd->ncg_home; +} + +static void rebuild_cgindex(gmx_domdec_t *dd, + const int *gcgs_index, t_state *state) +{ + int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl; + + ind = state->cg_gl; + dd_cg_gl 
= dd->index_gl; + cgindex = dd->cgindex; + nat = 0; + cgindex[0] = nat; + for (i = 0; i < state->ncg_gl; i++) + { + cgindex[i] = nat; + cg_gl = ind[i]; + dd_cg_gl[i] = cg_gl; + nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl]; + } + cgindex[i] = nat; + + dd->ncg_home = state->ncg_gl; + dd->nat_home = nat; + + set_zones_ncg_home(dd); +} + +static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg) +{ + while (cg >= cginfo_mb->cg_end) + { + cginfo_mb++; + } + + return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod]; +} + +static void dd_set_cginfo(int *index_gl, int cg0, int cg1, + t_forcerec *fr, char *bLocalCG) +{ + cginfo_mb_t *cginfo_mb; + int *cginfo; + int cg; + + if (fr != NULL) + { + cginfo_mb = fr->cginfo_mb; + cginfo = fr->cginfo; + + for (cg = cg0; cg < cg1; cg++) + { + cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]); + } + } + + if (bLocalCG != NULL) + { + for (cg = cg0; cg < cg1; cg++) + { + bLocalCG[index_gl[cg]] = TRUE; + } + } +} + +static void make_dd_indices(gmx_domdec_t *dd, + const int *gcgs_index, int cg_start) +{ + int nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl; + int *zone2cg, *zone_ncg1, *index_gl, *gatindex; + gmx_ga2la_t *ga2la; + char *bLocalCG; + gmx_bool bCGs; + + bLocalCG = dd->comm->bLocalCG; + + if (dd->nat_tot > dd->gatindex_nalloc) + { + dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot); + srenew(dd->gatindex, dd->gatindex_nalloc); + } + + nzone = dd->comm->zones.n; + zone2cg = dd->comm->zones.cg_range; + zone_ncg1 = dd->comm->zone_ncg1; + index_gl = dd->index_gl; + gatindex = dd->gatindex; + bCGs = dd->comm->bCGs; + + if (zone2cg[1] != dd->ncg_home) + { + gmx_incons("dd->ncg_zone is not up to date"); + } + + /* Make the local to global and global to local atom index */ + a = dd->cgindex[cg_start]; + for (zone = 0; zone < nzone; zone++) + { + if (zone == 0) + { + cg0 = cg_start; + } + else + { + cg0 = zone2cg[zone]; + } + cg1 = zone2cg[zone+1]; + cg1_p1 = cg0 + zone_ncg1[zone]; + + for (cg = cg0; cg < cg1; cg++) + { + zone1 = zone; + if (cg >= cg1_p1) + { + /* Signal that this cg is from more than one pulse away */ + zone1 += nzone; + } + cg_gl = index_gl[cg]; + if (bCGs) + { + for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++) + { + gatindex[a] = a_gl; + ga2la_set(dd->ga2la, a_gl, a, zone1); + a++; + } + } + else + { + gatindex[a] = cg_gl; + ga2la_set(dd->ga2la, cg_gl, a, zone1); + a++; + } + } + } +} + +static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG, + const char *where) +{ + int ncg, i, ngl, nerr; + + nerr = 0; + if (bLocalCG == NULL) + { + return nerr; + } + for (i = 0; i < dd->ncg_tot; i++) + { + if (!bLocalCG[dd->index_gl[i]]) + { + fprintf(stderr, + "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home); + nerr++; + } + } + ngl = 0; + for (i = 0; i < ncg_sys; i++) + { + if (bLocalCG[i]) + { + ngl++; + } + } + if (ngl != dd->ncg_tot) + { + fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot); + nerr++; + } + + return nerr; +} + +static void check_index_consistency(gmx_domdec_t *dd, + int natoms_sys, int ncg_sys, + const char *where) +{ + int nerr, ngl, i, a, cell; + int *have; + + nerr = 0; + + if (dd->comm->DD_debug > 1) + { + snew(have, natoms_sys); + for (a = 0; a < dd->nat_tot; a++) + { + if (have[dd->gatindex[a]] > 0) + { + fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, 
dd->gatindex[a]+1, have[dd->gatindex[a]], a+1); + } + else + { + have[dd->gatindex[a]] = a + 1; + } + } + sfree(have); + } + + snew(have, dd->nat_tot); + + ngl = 0; + for (i = 0; i < natoms_sys; i++) + { + if (ga2la_get(dd->ga2la, i, &a, &cell)) + { + if (a >= dd->nat_tot) + { + fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot); + nerr++; + } + else + { + have[a] = 1; + if (dd->gatindex[a] != i) + { + fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1); + nerr++; + } + } + ngl++; + } + } + if (ngl != dd->nat_tot) + { + fprintf(stderr, + "DD node %d, %s: %d global atom indices, %d local atoms\n", + dd->rank, where, ngl, dd->nat_tot); + } + for (a = 0; a < dd->nat_tot; a++) + { + if (have[a] == 0) + { + fprintf(stderr, + "DD node %d, %s: local atom %d, global %d has no global index\n", + dd->rank, where, a+1, dd->gatindex[a]+1); + } + } + sfree(have); + + nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where); + + if (nerr > 0) + { + gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies", + dd->rank, where, nerr); + } +} + +static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start) +{ + int i; + char *bLocalCG; + + if (a_start == 0) + { + /* Clear the whole list without searching */ + ga2la_clear(dd->ga2la); + } + else + { + for (i = a_start; i < dd->nat_tot; i++) + { + ga2la_del(dd->ga2la, dd->gatindex[i]); + } + } + + bLocalCG = dd->comm->bLocalCG; + if (bLocalCG) + { + for (i = cg_start; i < dd->ncg_tot; i++) + { + bLocalCG[dd->index_gl[i]] = FALSE; + } + } + + dd_clear_local_vsite_indices(dd); + + if (dd->constraints) + { + dd_clear_local_constraint_indices(dd); + } +} + +/* This function should be used for moving the domain boudaries during DLB, + * for obtaining the minimum cell size. It checks the initially set limit + * comm->cellsize_min, for bonded and initial non-bonded cut-offs, + * and, possibly, a longer cut-off limit set for PME load balancing. + */ +static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim) +{ + real cellsize_min; + + cellsize_min = comm->cellsize_min[dim]; + + if (!comm->bVacDLBNoLimit) + { + /* The cut-off might have changed, e.g. by PME load balacning, + * from the value used to set comm->cellsize_min, so check it. + */ + cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb); + + if (comm->bPMELoadBalDLBLimits) + { + /* Check for the cut-off limit set by the PME load balancing */ + cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb); + } + } + + return cellsize_min; +} + +static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff, + int dim_ind) +{ + real grid_jump_limit; + + /* The distance between the boundaries of cells at distance + * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions + * and by the fact that cells should not be shifted by more than + * half their size, such that cg's only shift by one cell + * at redecomposition. 
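
Restated compactly, the bound described above is the larger of the hard minimum cell size and the communication distance divided by the number of pulses, which is what grid_jump_limit computes next for the PBC-limited case. A sketch with illustrative names:

    /* Staggering between neighbouring cell rows may not exceed what one
     * communication pulse covers, nor undercut the minimum cell size. */
    double jump_limit(double cellsize_limit, double cutoff, int npulse)
    {
        double per_pulse = cutoff/npulse;

        return (per_pulse > cellsize_limit) ? per_pulse : cellsize_limit;
    }
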
+ */ + grid_jump_limit = comm->cellsize_limit; + if (!comm->bVacDLBNoLimit) + { + if (comm->bPMELoadBalDLBLimits) + { + cutoff = max(cutoff, comm->PMELoadBal_max_cutoff); + } + grid_jump_limit = max(grid_jump_limit, + cutoff/comm->cd[dim_ind].np); + } + + return grid_jump_limit; +} + +static gmx_bool check_grid_jump(gmx_large_int_t step, + gmx_domdec_t *dd, + real cutoff, + gmx_ddbox_t *ddbox, + gmx_bool bFatal) +{ + gmx_domdec_comm_t *comm; + int d, dim; + real limit, bfac; + gmx_bool bInvalid; + + bInvalid = FALSE; + + comm = dd->comm; + + for (d = 1; d < dd->ndim; d++) + { + dim = dd->dim[d]; + limit = grid_jump_limit(comm, cutoff, d); + bfac = ddbox->box_size[dim]; + if (ddbox->tric_dir[dim]) + { + bfac *= ddbox->skew_fac[dim]; + } + if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit || + (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit) + { + bInvalid = TRUE; + + if (bFatal) + { + char buf[22]; + + /* This error should never be triggered under normal + * circumstances, but you never know ... + */ + gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.", + gmx_step_str(step, buf), + dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + } + } + + return bInvalid; +} + +static int dd_load_count(gmx_domdec_comm_t *comm) +{ + return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]); +} + +static float dd_force_load(gmx_domdec_comm_t *comm) +{ + float load; + + if (comm->eFlop) + { + load = comm->flop; + if (comm->eFlop > 1) + { + load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05); + } + } + else + { + load = comm->cycl[ddCyclF]; + if (comm->cycl_n[ddCyclF] > 1) + { + /* Subtract the maximum of the last n cycle counts + * to get rid of possible high counts due to other soures, + * for instance system activity, that would otherwise + * affect the dynamic load balancing. + */ + load -= comm->cycl_max[ddCyclF]; + } + } + + return load; +} + +static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f) +{ + gmx_domdec_comm_t *comm; + int i; + + comm = dd->comm; + + snew(*dim_f, dd->nc[dim]+1); + (*dim_f)[0] = 0; + for (i = 1; i < dd->nc[dim]; i++) + { + if (comm->slb_frac[dim]) + { + (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1]; + } + else + { + (*dim_f)[i] = (real)i/(real)dd->nc[dim]; + } + } + (*dim_f)[dd->nc[dim]] = 1; +} + +static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind) +{ + int pmeindex, slab, nso, i; + ivec xyz; + + if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1) + { + ddpme->dim = YY; + } + else + { + ddpme->dim = dimind; + } + ddpme->dim_match = (ddpme->dim == dd->dim[dimind]); + + ddpme->nslab = (ddpme->dim == 0 ? + dd->comm->npmenodes_x : + dd->comm->npmenodes_y); + + if (ddpme->nslab <= 1) + { + return; + } + + nso = dd->comm->npmenodes/ddpme->nslab; + /* Determine for each PME slab the PP location range for dimension dim */ + snew(ddpme->pp_min, ddpme->nslab); + snew(ddpme->pp_max, ddpme->nslab); + for (slab = 0; slab < ddpme->nslab; slab++) + { + ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1; + ddpme->pp_max[slab] = 0; + } + for (i = 0; i < dd->nnodes; i++) + { + ddindex2xyz(dd->nc, i, xyz); + /* For y only use our y/z slab. + * This assumes that the PME x grid size matches the DD grid size. 
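
The boundary construction used by set_slb_pme_dim_f above turns n per-cell fractions into n+1 cumulative boundaries in [0,1], falling back to a uniform grid when no fractions were given. A standalone sketch with made-up fractions:

    #include <stdio.h>

    /* frac may be NULL (uniform grid); bound must hold n+1 values. */
    static void cell_boundaries(const double *frac, int n, double *bound)
    {
        int i;

        bound[0] = 0;
        for (i = 1; i < n; i++)
        {
            bound[i] = frac ? bound[i-1] + frac[i-1] : (double)i/n;
        }
        bound[n] = 1;
    }

    int main(void)
    {
        double frac[4] = { 0.4, 0.3, 0.2, 0.1 };
        double b[5];
        int    i;

        cell_boundaries(frac, 4, b);
        for (i = 0; i <= 4; i++)
        {
            printf("%.2f ", b[i]); /* prints: 0.00 0.40 0.70 0.90 1.00 */
        }
        printf("\n");
        return 0;
    }
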
+ */ + if (dimind == 0 || xyz[XX] == dd->ci[XX]) + { + pmeindex = ddindex2pmeindex(dd, i); + if (dimind == 0) + { + slab = pmeindex/nso; + } + else + { + slab = pmeindex % ddpme->nslab; + } + ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]); + ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]); + } + } + + set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f); +} + +int dd_pme_maxshift_x(gmx_domdec_t *dd) +{ + if (dd->comm->ddpme[0].dim == XX) + { + return dd->comm->ddpme[0].maxshift; + } + else + { + return 0; + } +} + +int dd_pme_maxshift_y(gmx_domdec_t *dd) +{ + if (dd->comm->ddpme[0].dim == YY) + { + return dd->comm->ddpme[0].maxshift; + } + else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY) + { + return dd->comm->ddpme[1].maxshift; + } + else + { + return 0; + } +} + +static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, + gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f) +{ + gmx_domdec_comm_t *comm; + int nc, ns, s; + int *xmin, *xmax; + real range, pme_boundary; + int sh; + + comm = dd->comm; + nc = dd->nc[ddpme->dim]; + ns = ddpme->nslab; + + if (!ddpme->dim_match) + { + /* PP decomposition is not along dim: the worst situation */ + sh = ns/2; + } + else if (ns <= 3 || (bUniform && ns == nc)) + { + /* The optimal situation */ + sh = 1; + } + else + { + /* We need to check for all pme nodes which nodes they + * could possibly need to communicate with. + */ + xmin = ddpme->pp_min; + xmax = ddpme->pp_max; + /* Allow for atoms to be maximally 2/3 times the cut-off + * out of their DD cell. This is a reasonable balance between + * between performance and support for most charge-group/cut-off + * combinations. + */ + range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim]; + /* Avoid extra communication when we are exactly at a boundary */ + range *= 0.999; + + sh = 1; + for (s = 0; s < ns; s++) + { + /* PME slab s spreads atoms between box frac. 
s/ns and (s+1)/ns */ + pme_boundary = (real)s/ns; + while (sh+1 < ns && + ((s-(sh+1) >= 0 && + cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) || + (s-(sh+1) < 0 && + cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary))) + { + sh++; + } + pme_boundary = (real)(s+1)/ns; + while (sh+1 < ns && + ((s+(sh+1) < ns && + cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) || + (s+(sh+1) >= ns && + cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary))) + { + sh++; + } + } + } + + ddpme->maxshift = sh; + + if (debug) + { + fprintf(debug, "PME slab communication range for dim %d is %d\n", + ddpme->dim, ddpme->maxshift); + } +} + +static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox) +{ + int d, dim; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + if (dim < ddbox->nboundeddim && + ddbox->box_size[dim]*ddbox->skew_fac[dim] < + dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN) + { + gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n", + dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim], + dd->nc[dim], dd->comm->cellsize_limit); + } + } +} + +static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox, + gmx_bool bMaster, ivec npulse) +{ + gmx_domdec_comm_t *comm; + int d, j; + rvec cellsize_min; + real *cell_x, cell_dx, cellsize; + + comm = dd->comm; + + for (d = 0; d < DIM; d++) + { + cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d]; + npulse[d] = 1; + if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL) + { + /* Uniform grid */ + cell_dx = ddbox->box_size[d]/dd->nc[d]; + if (bMaster) + { + for (j = 0; j < dd->nc[d]+1; j++) + { + dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx; + } + } + else + { + comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx; + comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx; + } + cellsize = cell_dx*ddbox->skew_fac[d]; + while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1) + { + npulse[d]++; + } + cellsize_min[d] = cellsize; + } + else + { + /* Statically load balanced grid */ + /* Also when we are not doing a master distribution we determine + * all cell borders in a loop to obtain identical values + * to the master distribution case and to determine npulse. + */ + if (bMaster) + { + cell_x = dd->ma->cell_x[d]; + } + else + { + snew(cell_x, dd->nc[d]+1); + } + cell_x[0] = ddbox->box0[d]; + for (j = 0; j < dd->nc[d]; j++) + { + cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j]; + cell_x[j+1] = cell_x[j] + cell_dx; + cellsize = cell_dx*ddbox->skew_fac[d]; + while (cellsize*npulse[d] < comm->cutoff && + npulse[d] < dd->nc[d]-1) + { + npulse[d]++; + } + cellsize_min[d] = min(cellsize_min[d], cellsize); + } + if (!bMaster) + { + comm->cell_x0[d] = cell_x[dd->ci[d]]; + comm->cell_x1[d] = cell_x[dd->ci[d]+1]; + sfree(cell_x); + } + } + /* The following limitation is to avoid that a cell would receive + * some of its own home charge groups back over the periodic boundary. + * Double charge groups cause trouble with the global indices. + */ + if (d < ddbox->npbcdim && + dd->nc[d] > 1 && npulse[d] >= dd->nc[d]) + { + gmx_fatal_collective(FARGS, NULL, dd, + "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction", + dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d], + comm->cutoff, + dd->nc[d], dd->nc[d], + dd->nnodes > dd->nc[d] ? 
"cells" : "processors"); + } + } + + if (!comm->bDynLoadBal) + { + copy_rvec(cellsize_min, comm->cellsize_min); + } + + for (d = 0; d < comm->npmedecompdim; d++) + { + set_pme_maxshift(dd, &comm->ddpme[d], + comm->slb_frac[dd->dim[d]] == NULL, ddbox, + comm->ddpme[d].slb_dim_f); + } +} + + +static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd, + int d, int dim, gmx_domdec_root_t *root, + gmx_ddbox_t *ddbox, + gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[]) +{ + gmx_domdec_comm_t *comm; + int ncd, i, j, nmin, nmin_old; + gmx_bool bLimLo, bLimHi; + real *cell_size; + real fac, halfway, cellsize_limit_f_i, region_size; + gmx_bool bPBC, bLastHi = FALSE; + int nrange[] = {range[0], range[1]}; + + region_size = root->cell_f[range[1]]-root->cell_f[range[0]]; + + comm = dd->comm; + + ncd = dd->nc[dim]; + + bPBC = (dim < ddbox->npbcdim); + + cell_size = root->buf_ncd; + + if (debug) + { + fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]); + } + + /* First we need to check if the scaling does not make cells + * smaller than the smallest allowed size. + * We need to do this iteratively, since if a cell is too small, + * it needs to be enlarged, which makes all the other cells smaller, + * which could in turn make another cell smaller than allowed. + */ + for (i = range[0]; i < range[1]; i++) + { + root->bCellMin[i] = FALSE; + } + nmin = 0; + do + { + nmin_old = nmin; + /* We need the total for normalization */ + fac = 0; + for (i = range[0]; i < range[1]; i++) + { + if (root->bCellMin[i] == FALSE) + { + fac += cell_size[i]; + } + } + fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */ + /* Determine the cell boundaries */ + for (i = range[0]; i < range[1]; i++) + { + if (root->bCellMin[i] == FALSE) + { + cell_size[i] *= fac; + if (!bPBC && (i == 0 || i == dd->nc[dim] -1)) + { + cellsize_limit_f_i = 0; + } + else + { + cellsize_limit_f_i = cellsize_limit_f; + } + if (cell_size[i] < cellsize_limit_f_i) + { + root->bCellMin[i] = TRUE; + cell_size[i] = cellsize_limit_f_i; + nmin++; + } + } + root->cell_f[i+1] = root->cell_f[i] + cell_size[i]; + } + } + while (nmin > nmin_old); + + i = range[1]-1; + cell_size[i] = root->cell_f[i+1] - root->cell_f[i]; + /* For this check we should not use DD_CELL_MARGIN, + * but a slightly smaller factor, + * since rounding could get use below the limit. + */ + if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN) + { + char buf[22]; + gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n", + gmx_step_str(step, buf), + dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim], + ncd, comm->cellsize_min[dim]); + } + + root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd); + + if (!bUniform) + { + /* Check if the boundary did not displace more than halfway + * each of the cells it bounds, as this could cause problems, + * especially when the differences between cell sizes are large. + * If changes are applied, they will not make cells smaller + * than the cut-off, as we check all the boundaries which + * might be affected by a change and if the old state was ok, + * the cells will at most be shrunk back to their old size. 
+ */ + for (i = range[0]+1; i < range[1]; i++) + { + halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]); + if (root->cell_f[i] < halfway) + { + root->cell_f[i] = halfway; + /* Check if the change also causes shifts of the next boundaries */ + for (j = i+1; j < range[1]; j++) + { + if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f) + { + root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f; + } + } + } + halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]); + if (root->cell_f[i] > halfway) + { + root->cell_f[i] = halfway; + /* Check if the change also causes shifts of the next boundaries */ + for (j = i-1; j >= range[0]+1; j--) + { + if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f) + { + root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f; + } + } + } + } + } + + /* nrange is defined as [lower, upper) range for new call to enforce_limits */ + /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b) + * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries. + * for a and b nrange is used */ + if (d > 0) + { + /* Take care of the staggering of the cell boundaries */ + if (bUniform) + { + for (i = range[0]; i < range[1]; i++) + { + root->cell_f_max0[i] = root->cell_f[i]; + root->cell_f_min1[i] = root->cell_f[i+1]; + } + } + else + { + for (i = range[0]+1; i < range[1]; i++) + { + bLimLo = (root->cell_f[i] < root->bound_min[i]); + bLimHi = (root->cell_f[i] > root->bound_max[i]); + if (bLimLo && bLimHi) + { + /* Both limits violated, try the best we can */ + /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */ + root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]); + nrange[0] = range[0]; + nrange[1] = i; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + + nrange[0] = i; + nrange[1] = range[1]; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + + return; + } + else if (bLimLo) + { + /* root->cell_f[i] = root->bound_min[i]; */ + nrange[1] = i; /* only store violation location. 
There could be a LimLo violation following with an higher index */ + bLastHi = FALSE; + } + else if (bLimHi && !bLastHi) + { + bLastHi = TRUE; + if (nrange[1] < range[1]) /* found a LimLo before */ + { + root->cell_f[nrange[1]] = root->bound_min[nrange[1]]; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + nrange[0] = nrange[1]; + } + root->cell_f[i] = root->bound_max[i]; + nrange[1] = i; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + nrange[0] = i; + nrange[1] = range[1]; + } + } + if (nrange[1] < range[1]) /* found last a LimLo */ + { + root->cell_f[nrange[1]] = root->bound_min[nrange[1]]; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + nrange[0] = nrange[1]; + nrange[1] = range[1]; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + } + else if (nrange[0] > range[0]) /* found at least one LimHi */ + { + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange); + } + } + } +} + + +static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd, + int d, int dim, gmx_domdec_root_t *root, + gmx_ddbox_t *ddbox, gmx_bool bDynamicBox, + gmx_bool bUniform, gmx_large_int_t step) +{ + gmx_domdec_comm_t *comm; + int ncd, d1, i, j, pos; + real *cell_size; + real load_aver, load_i, imbalance, change, change_max, sc; + real cellsize_limit_f, dist_min_f, dist_min_f_hard, space; + real change_limit; + real relax = 0.5; + gmx_bool bPBC; + int range[] = { 0, 0 }; + + comm = dd->comm; + + /* Convert the maximum change from the input percentage to a fraction */ + change_limit = comm->dlb_scale_lim*0.01; + + ncd = dd->nc[dim]; + + bPBC = (dim < ddbox->npbcdim); + + cell_size = root->buf_ncd; + + /* Store the original boundaries */ + for (i = 0; i < ncd+1; i++) + { + root->old_cell_f[i] = root->cell_f[i]; + } + if (bUniform) + { + for (i = 0; i < ncd; i++) + { + cell_size[i] = 1.0/ncd; + } + } + else if (dd_load_count(comm)) + { + load_aver = comm->load[d].sum_m/ncd; + change_max = 0; + for (i = 0; i < ncd; i++) + { + /* Determine the relative imbalance of cell i */ + load_i = comm->load[d].load[i*comm->load[d].nload+2]; + imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1); + /* Determine the change of the cell size using underrelaxation */ + change = -relax*imbalance; + change_max = max(change_max, max(change, -change)); + } + /* Limit the amount of scaling. + * We need to use the same rescaling for all cells in one row, + * otherwise the load balancing might not converge. + */ + sc = relax; + if (change_max > change_limit) + { + sc *= change_limit/change_max; + } + for (i = 0; i < ncd; i++) + { + /* Determine the relative imbalance of cell i */ + load_i = comm->load[d].load[i*comm->load[d].nload+2]; + imbalance = (load_i - load_aver)/(load_aver > 0 ? 
load_aver : 1); + /* Determine the change of the cell size using underrelaxation */ + change = -sc*imbalance; + cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change); + } + } + + cellsize_limit_f = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim]; + cellsize_limit_f *= DD_CELL_MARGIN; + dist_min_f_hard = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim]; + dist_min_f = dist_min_f_hard * DD_CELL_MARGIN; + if (ddbox->tric_dir[dim]) + { + cellsize_limit_f /= ddbox->skew_fac[dim]; + dist_min_f /= ddbox->skew_fac[dim]; + } + if (bDynamicBox && d > 0) + { + dist_min_f *= DD_PRES_SCALE_MARGIN; + } + if (d > 0 && !bUniform) + { + /* Make sure that the grid is not shifted too much */ + for (i = 1; i < ncd; i++) + { + if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) + { + gmx_incons("Inconsistent DD boundary staggering limits!"); + } + root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f; + space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f); + if (space > 0) + { + root->bound_min[i] += 0.5*space; + } + root->bound_max[i] = root->cell_f_min1[i] - dist_min_f; + space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f); + if (space < 0) + { + root->bound_max[i] += 0.5*space; + } + if (debug) + { + fprintf(debug, + "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n", + d, i, + root->cell_f_max0[i-1] + dist_min_f, + root->bound_min[i], root->cell_f[i], root->bound_max[i], + root->cell_f_min1[i] - dist_min_f); + } + } + } + range[1] = ncd; + root->cell_f[0] = 0; + root->cell_f[ncd] = 1; + dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range); + + + /* After the checks above, the cells should obey the cut-off + * restrictions, but it does not hurt to check. + */ + for (i = 0; i < ncd; i++) + { + if (debug) + { + fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n", + dim, i, root->cell_f[i], root->cell_f[i+1]); + } + + if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) && + root->cell_f[i+1] - root->cell_f[i] < + cellsize_limit_f/DD_CELL_MARGIN) + { + char buf[22]; + fprintf(stderr, + "\nWARNING step %s: direction %c, cell %d too small: %f\n", + gmx_step_str(step, buf), dim2char(dim), i, + (root->cell_f[i+1] - root->cell_f[i]) + *ddbox->box_size[dim]*ddbox->skew_fac[dim]); + } + } + + pos = ncd + 1; + /* Store the cell boundaries of the lower dimensions at the end */ + for (d1 = 0; d1 < d; d1++) + { + root->cell_f[pos++] = comm->cell_f0[d1]; + root->cell_f[pos++] = comm->cell_f1[d1]; + } + + if (d < comm->npmedecompdim) + { + /* The master determines the maximum shift for + * the coordinate communication between separate PME nodes. 
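
The resizing logic in set_dd_cell_sizes_dlb_root above reduces to: scale each cell by (1 - relax*imbalance), with one common damping for the whole row so the balancing converges, capped so no cell changes by more than the configured limit. A standalone sketch; the function and parameter names are illustrative:

    /* Underrelaxed rebalance: load[] are measured per-cell loads,
     * old_size[] the current relative sizes, cap the max fractional
     * change per step (e.g. 0.1 for 10%). */
    void rebalance(const double *load, const double *old_size,
                   double *new_size, int n, double relax, double cap)
    {
        double aver = 0, change_max = 0;
        int    i;

        for (i = 0; i < n; i++)
        {
            aver += load[i];
        }
        aver /= n;
        for (i = 0; i < n; i++)
        {
            double imb    = (load[i] - aver)/(aver > 0 ? aver : 1);
            double change = -relax*imb;

            if (change > change_max)
            {
                change_max = change;
            }
            if (-change > change_max)
            {
                change_max = -change;
            }
        }
        /* One common rescaling for the whole row, capped at 'cap' */
        if (change_max > cap)
        {
            relax *= cap/change_max;
        }
        for (i = 0; i < n; i++)
        {
            double imb = (load[i] - aver)/(aver > 0 ? aver : 1);

            new_size[i] = old_size[i]*(1 - relax*imb);
        }
    }
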
+ */ + set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f); + } + root->cell_f[pos++] = comm->ddpme[0].maxshift; + if (d >= 1) + { + root->cell_f[pos++] = comm->ddpme[1].maxshift; + } +} + +static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, int dimind) +{ + gmx_domdec_comm_t *comm; + int dim; + + comm = dd->comm; + + /* Set the cell dimensions */ + dim = dd->dim[dimind]; + comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim]; + comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim]; + if (dim >= ddbox->nboundeddim) + { + comm->cell_x0[dim] += ddbox->box0[dim]; + comm->cell_x1[dim] += ddbox->box0[dim]; + } +} + +static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd, + int d, int dim, real *cell_f_row, + gmx_ddbox_t *ddbox) +{ + gmx_domdec_comm_t *comm; + int d1, dim1, pos; + + comm = dd->comm; + +#ifdef GMX_MPI + /* Each node would only need to know two fractions, + * but it is probably cheaper to broadcast the whole array. + */ + MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE, + 0, comm->mpi_comm_load[d]); +#endif + /* Copy the fractions for this dimension from the buffer */ + comm->cell_f0[d] = cell_f_row[dd->ci[dim] ]; + comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1]; + /* The whole array was communicated, so set the buffer position */ + pos = dd->nc[dim] + 1; + for (d1 = 0; d1 <= d; d1++) + { + if (d1 < d) + { + /* Copy the cell fractions of the lower dimensions */ + comm->cell_f0[d1] = cell_f_row[pos++]; + comm->cell_f1[d1] = cell_f_row[pos++]; + } + relative_to_absolute_cell_bounds(dd, ddbox, d1); + } + /* Convert the communicated shift from float to int */ + comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5); + if (d >= 1) + { + comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5); + } +} + +static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, gmx_bool bDynamicBox, + gmx_bool bUniform, gmx_large_int_t step) +{ + gmx_domdec_comm_t *comm; + int d, dim, d1; + gmx_bool bRowMember, bRowRoot; + real *cell_f_row; + + comm = dd->comm; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + bRowMember = TRUE; + bRowRoot = TRUE; + for (d1 = d; d1 < dd->ndim; d1++) + { + if (dd->ci[dd->dim[d1]] > 0) + { + if (d1 > d) + { + bRowMember = FALSE; + } + bRowRoot = FALSE; + } + } + if (bRowMember) + { + if (bRowRoot) + { + set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d], + ddbox, bDynamicBox, bUniform, step); + cell_f_row = comm->root[d]->cell_f; + } + else + { + cell_f_row = comm->cell_f_row; + } + distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox); + } + } +} + +static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox) +{ + int d; + + /* This function assumes the box is static and should therefore + * not be called when the box has changed since the last + * call to dd_partition_system. 
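
distribute_dd_cell_sizes_dlb above uses the one-root-broadcasts-all pattern: the row root computes the full cell_f array and every rank picks out its own two fractions after the broadcast. A minimal MPI sketch of the same pattern; the names, sizes, and the use of MPI_DOUBLE instead of a raw byte count are assumptions of this sketch:

    #include <mpi.h>
    #include <stdio.h>

    /* Root (rank 0 of row_comm) holds ncells+1 boundaries in cell_f;
     * after the broadcast every rank reads its own pair. */
    static void share_cell_f(MPI_Comm row_comm, double *cell_f, int ncells,
                             int my_cell, double *f0, double *f1)
    {
        MPI_Bcast(cell_f, ncells + 1, MPI_DOUBLE, 0, row_comm);
        *f0 = cell_f[my_cell];
        *f1 = cell_f[my_cell + 1];
    }

    int main(int argc, char **argv)
    {
        double cell_f[5] = { 0, 0.25, 0.5, 0.75, 1.0 };
        double f0, f1;
        int    rank;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        /* Pretend the whole world is one row and rank maps to a cell */
        share_cell_f(MPI_COMM_WORLD, cell_f, 4, rank % 4, &f0, &f1);
        printf("rank %d: cell [%g, %g]\n", rank, f0, f1);
        MPI_Finalize();
        return 0;
    }

As the comment in the code itself notes, each rank only needs two fractions, but broadcasting the whole short array is simpler and usually no more expensive than individual sends.
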
+ */ + for (d = 0; d < dd->ndim; d++) + { + relative_to_absolute_cell_bounds(dd, ddbox, d); + } +} + + + +static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, gmx_bool bDynamicBox, + gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step, + gmx_wallcycle_t wcycle) +{ + gmx_domdec_comm_t *comm; + int dim; + + comm = dd->comm; + + if (bDoDLB) + { + wallcycle_start(wcycle, ewcDDCOMMBOUND); + set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step); + wallcycle_stop(wcycle, ewcDDCOMMBOUND); + } + else if (bDynamicBox) + { + set_dd_cell_sizes_dlb_nochange(dd, ddbox); + } + + /* Set the dimensions for which no DD is used */ + for (dim = 0; dim < DIM; dim++) + { + if (dd->nc[dim] == 1) + { + comm->cell_x0[dim] = 0; + comm->cell_x1[dim] = ddbox->box_size[dim]; + if (dim >= ddbox->nboundeddim) + { + comm->cell_x0[dim] += ddbox->box0[dim]; + comm->cell_x1[dim] += ddbox->box0[dim]; + } + } + } +} + +static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse) +{ + int d, np, i; + gmx_domdec_comm_dim_t *cd; + + for (d = 0; d < dd->ndim; d++) + { + cd = &dd->comm->cd[d]; + np = npulse[dd->dim[d]]; + if (np > cd->np_nalloc) + { + if (debug) + { + fprintf(debug, "(Re)allocing cd for %c to %d pulses\n", + dim2char(dd->dim[d]), np); + } + if (DDMASTER(dd) && cd->np_nalloc > 0) + { + fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np); + } + srenew(cd->ind, np); + for (i = cd->np_nalloc; i < np; i++) + { + cd->ind[i].index = NULL; + cd->ind[i].nalloc = 0; + } + cd->np_nalloc = np; + } + cd->np = np; + } +} + + +static void set_dd_cell_sizes(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, gmx_bool bDynamicBox, + gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step, + gmx_wallcycle_t wcycle) +{ + gmx_domdec_comm_t *comm; + int d; + ivec npulse; + + comm = dd->comm; + + /* Copy the old cell boundaries for the cg displacement check */ + copy_rvec(comm->cell_x0, comm->old_cell_x0); + copy_rvec(comm->cell_x1, comm->old_cell_x1); + + if (comm->bDynLoadBal) + { + if (DDMASTER(dd)) + { + check_box_size(dd, ddbox); + } + set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle); + } + else + { + set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse); + realloc_comm_ind(dd, npulse); + } + + if (debug) + { + for (d = 0; d < DIM; d++) + { + fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n", + d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]); + } + } +} + +static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd, + gmx_ddbox_t *ddbox, + rvec cell_ns_x0, rvec cell_ns_x1, + gmx_large_int_t step) +{ + gmx_domdec_comm_t *comm; + int dim_ind, dim; + + comm = dd->comm; + + for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++) + { + dim = dd->dim[dim_ind]; + + /* Without PBC we don't have restrictions on the outer cells */ + if (!(dim >= ddbox->npbcdim && + (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) && + comm->bDynLoadBal && + (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] < + comm->cellsize_min[dim]) + { + char buf[22]; + gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d", + gmx_step_str(step, buf), dim2char(dim), + comm->cell_x1[dim] - comm->cell_x0[dim], + ddbox->skew_fac[dim], + dd->comm->cellsize_min[dim], + dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + } + + if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM) + { + /* Communicate the 
boundaries and update cell_ns_x0/1 */ + dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1); + if (dd->bGridJump && dd->ndim > 1) + { + check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE); + } + } +} + +static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm) +{ + if (YY < npbcdim) + { + tcm[YY][XX] = -box[YY][XX]/box[YY][YY]; + } + else + { + tcm[YY][XX] = 0; + } + if (ZZ < npbcdim) + { + tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ]; + tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ]; + } + else + { + tcm[ZZ][XX] = 0; + tcm[ZZ][YY] = 0; + } +} + +static void check_screw_box(matrix box) +{ + /* Mathematical limitation */ + if (box[YY][XX] != 0 || box[ZZ][XX] != 0) + { + gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components"); + } + + /* Limitation due to the asymmetry of the eighth shell method */ + if (box[ZZ][YY] != 0) + { + gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported"); + } +} + +static void distribute_cg(FILE *fplog, gmx_large_int_t step, + matrix box, ivec tric_dir, t_block *cgs, rvec pos[], + gmx_domdec_t *dd) +{ + gmx_domdec_master_t *ma; + int **tmp_ind = NULL, *tmp_nalloc = NULL; + int i, icg, j, k, k0, k1, d, npbcdim; + matrix tcm; + rvec box_size, cg_cm; + ivec ind; + real nrcg, inv_ncg, pos_d; + atom_id *cgindex; + gmx_bool bUnbounded, bScrew; + + ma = dd->ma; + + if (tmp_ind == NULL) + { + snew(tmp_nalloc, dd->nnodes); + snew(tmp_ind, dd->nnodes); + for (i = 0; i < dd->nnodes; i++) + { + tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1); + snew(tmp_ind[i], tmp_nalloc[i]); + } + } + + /* Clear the count */ + for (i = 0; i < dd->nnodes; i++) + { + ma->ncg[i] = 0; + ma->nat[i] = 0; + } + + make_tric_corr_matrix(dd->npbcdim, box, tcm); + + cgindex = cgs->index; + + /* Compute the center of geometry for all charge groups */ + for (icg = 0; icg < cgs->nr; icg++) + { + k0 = cgindex[icg]; + k1 = cgindex[icg+1]; + nrcg = k1 - k0; + if (nrcg == 1) + { + copy_rvec(pos[k0], cg_cm); + } + else + { + inv_ncg = 1.0/nrcg; + + clear_rvec(cg_cm); + for (k = k0; (k < k1); k++) + { + rvec_inc(cg_cm, pos[k]); + } + for (d = 0; (d < DIM); d++) + { + cg_cm[d] *= inv_ncg; + } + } + /* Put the charge group in the box and determine the cell index */ + for (d = DIM-1; d >= 0; d--) + { + pos_d = cg_cm[d]; + if (d < dd->npbcdim) + { + bScrew = (dd->bScrewPBC && d == XX); + if (tric_dir[d] && dd->nc[d] > 1) + { + /* Use triclinic coordintates for this dimension */ + for (j = d+1; j < DIM; j++) + { + pos_d += cg_cm[j]*tcm[j][d]; + } + } + while (pos_d >= box[d][d]) + { + pos_d -= box[d][d]; + rvec_dec(cg_cm, box[d]); + if (bScrew) + { + cg_cm[YY] = box[YY][YY] - cg_cm[YY]; + cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ]; + } + for (k = k0; (k < k1); k++) + { + rvec_dec(pos[k], box[d]); + if (bScrew) + { + pos[k][YY] = box[YY][YY] - pos[k][YY]; + pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ]; + } + } + } + while (pos_d < 0) + { + pos_d += box[d][d]; + rvec_inc(cg_cm, box[d]); + if (bScrew) + { + cg_cm[YY] = box[YY][YY] - cg_cm[YY]; + cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ]; + } + for (k = k0; (k < k1); k++) + { + rvec_inc(pos[k], box[d]); + if (bScrew) + { + pos[k][YY] = box[YY][YY] - pos[k][YY]; + pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ]; + } + } + } + } + /* This could be done more efficiently */ + ind[d] = 0; + while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1]) + { + ind[d]++; + } + } + i = dd_index(dd->nc, ind); + if (ma->ncg[i] == tmp_nalloc[i]) + { + tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1); + srenew(tmp_ind[i], 
tmp_nalloc[i]); + } + tmp_ind[i][ma->ncg[i]] = icg; + ma->ncg[i]++; + ma->nat[i] += cgindex[icg+1] - cgindex[icg]; + } + + k1 = 0; + for (i = 0; i < dd->nnodes; i++) + { + ma->index[i] = k1; + for (k = 0; k < ma->ncg[i]; k++) + { + ma->cg[k1++] = tmp_ind[i][k]; + } + } + ma->index[dd->nnodes] = k1; + + for (i = 0; i < dd->nnodes; i++) + { + sfree(tmp_ind[i]); + } + sfree(tmp_ind); + sfree(tmp_nalloc); + + if (fplog) + { + char buf[22]; + fprintf(fplog, "Charge group distribution at step %s:", + gmx_step_str(step, buf)); + for (i = 0; i < dd->nnodes; i++) + { + fprintf(fplog, " %d", ma->ncg[i]); + } + fprintf(fplog, "\n"); + } +} + +static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd, + t_block *cgs, matrix box, gmx_ddbox_t *ddbox, + rvec pos[]) +{ + gmx_domdec_master_t *ma = NULL; + ivec npulse; + int i, cg_gl; + int *ibuf, buf2[2] = { 0, 0 }; + gmx_bool bMaster = DDMASTER(dd); + if (bMaster) + { + ma = dd->ma; + + if (dd->bScrewPBC) + { + check_screw_box(box); + } + + set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse); + + distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd); + for (i = 0; i < dd->nnodes; i++) + { + ma->ibuf[2*i] = ma->ncg[i]; + ma->ibuf[2*i+1] = ma->nat[i]; + } + ibuf = ma->ibuf; + } + else + { + ibuf = NULL; + } + dd_scatter(dd, 2*sizeof(int), ibuf, buf2); + + dd->ncg_home = buf2[0]; + dd->nat_home = buf2[1]; + dd->ncg_tot = dd->ncg_home; + dd->nat_tot = dd->nat_home; + if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0) + { + dd->cg_nalloc = over_alloc_dd(dd->ncg_home); + srenew(dd->index_gl, dd->cg_nalloc); + srenew(dd->cgindex, dd->cg_nalloc+1); + } + if (bMaster) + { + for (i = 0; i < dd->nnodes; i++) + { + ma->ibuf[i] = ma->ncg[i]*sizeof(int); + ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int); + } + } + + dd_scatterv(dd, + DDMASTER(dd) ? ma->ibuf : NULL, + DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL, + DDMASTER(dd) ? 
ma->cg : NULL, + dd->ncg_home*sizeof(int), dd->index_gl); + + /* Determine the home charge group sizes */ + dd->cgindex[0] = 0; + for (i = 0; i < dd->ncg_home; i++) + { + cg_gl = dd->index_gl[i]; + dd->cgindex[i+1] = + dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl]; + } + + if (debug) + { + fprintf(debug, "Home charge groups:\n"); + for (i = 0; i < dd->ncg_home; i++) + { + fprintf(debug, " %d", dd->index_gl[i]); + if (i % 10 == 9) + { + fprintf(debug, "\n"); + } + } + fprintf(debug, "\n"); + } +} + +static int compact_and_copy_vec_at(int ncg, int *move, + int *cgindex, + int nvec, int vec, + rvec *src, gmx_domdec_comm_t *comm, + gmx_bool bCompact) +{ + int m, icg, i, i0, i1, nrcg; + int home_pos; + int pos_vec[DIM*2]; + + home_pos = 0; + + for (m = 0; m < DIM*2; m++) + { + pos_vec[m] = 0; + } + + i0 = 0; + for (icg = 0; icg < ncg; icg++) + { + i1 = cgindex[icg+1]; + m = move[icg]; + if (m == -1) + { + if (bCompact) + { + /* Compact the home array in place */ + for (i = i0; i < i1; i++) + { + copy_rvec(src[i], src[home_pos++]); + } + } + } + else + { + /* Copy to the communication buffer */ + nrcg = i1 - i0; + pos_vec[m] += 1 + vec*nrcg; + for (i = i0; i < i1; i++) + { + copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]); + } + pos_vec[m] += (nvec - vec - 1)*nrcg; + } + if (!bCompact) + { + home_pos += i1 - i0; + } + i0 = i1; + } + + return home_pos; +} + +static int compact_and_copy_vec_cg(int ncg, int *move, + int *cgindex, + int nvec, rvec *src, gmx_domdec_comm_t *comm, + gmx_bool bCompact) +{ + int m, icg, i0, i1, nrcg; + int home_pos; + int pos_vec[DIM*2]; + + home_pos = 0; + + for (m = 0; m < DIM*2; m++) + { + pos_vec[m] = 0; + } + + i0 = 0; + for (icg = 0; icg < ncg; icg++) + { + i1 = cgindex[icg+1]; + m = move[icg]; + if (m == -1) + { + if (bCompact) + { + /* Compact the home array in place */ + copy_rvec(src[icg], src[home_pos++]); + } + } + else + { + nrcg = i1 - i0; + /* Copy to the communication buffer */ + copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]); + pos_vec[m] += 1 + nrcg*nvec; + } + i0 = i1; + } + if (!bCompact) + { + home_pos = ncg; + } + + return home_pos; +} + +static int compact_ind(int ncg, int *move, + int *index_gl, int *cgindex, + int *gatindex, + gmx_ga2la_t ga2la, char *bLocalCG, + int *cginfo) +{ + int cg, nat, a0, a1, a, a_gl; + int home_pos; + + home_pos = 0; + nat = 0; + for (cg = 0; cg < ncg; cg++) + { + a0 = cgindex[cg]; + a1 = cgindex[cg+1]; + if (move[cg] == -1) + { + /* Compact the home arrays in place. + * Anything that can be done here avoids access to global arrays. 
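
compact_and_copy_vec_cg above packs the charge groups that stay home to the front of the array in place, while everything else is copied into per-direction send buffers. The home-compaction half in isolation; the send-buffer half is omitted and the names are illustrative:

    /* Keep entries whose move flag is -1, packed to the front of v.
     * Returns the new home count. */
    int compact_home(double (*v)[3], const int *move, int n)
    {
        int i, home = 0;

        for (i = 0; i < n; i++)
        {
            if (move[i] == -1)
            {
                v[home][0] = v[i][0];
                v[home][1] = v[i][1];
                v[home][2] = v[i][2];
                home++;
            }
        }
        return home;
    }
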
+ */ + cgindex[home_pos] = nat; + for (a = a0; a < a1; a++) + { + a_gl = gatindex[a]; + gatindex[nat] = a_gl; + /* The cell number stays 0, so we don't need to set it */ + ga2la_change_la(ga2la, a_gl, nat); + nat++; + } + index_gl[home_pos] = index_gl[cg]; + cginfo[home_pos] = cginfo[cg]; + /* The charge group remains local, so bLocalCG does not change */ + home_pos++; + } + else + { + /* Clear the global indices */ + for (a = a0; a < a1; a++) + { + ga2la_del(ga2la, gatindex[a]); + } + if (bLocalCG) + { + bLocalCG[index_gl[cg]] = FALSE; + } + } + } + cgindex[home_pos] = nat; + + return home_pos; +} + +static void clear_and_mark_ind(int ncg, int *move, + int *index_gl, int *cgindex, int *gatindex, + gmx_ga2la_t ga2la, char *bLocalCG, + int *cell_index) +{ + int cg, a0, a1, a; + + for (cg = 0; cg < ncg; cg++) + { + if (move[cg] >= 0) + { + a0 = cgindex[cg]; + a1 = cgindex[cg+1]; + /* Clear the global indices */ + for (a = a0; a < a1; a++) + { + ga2la_del(ga2la, gatindex[a]); + } + if (bLocalCG) + { + bLocalCG[index_gl[cg]] = FALSE; + } + /* Signal that this cg has moved using the ns cell index. + * Here we set it to -1. fill_grid will change it + * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells. + */ + cell_index[cg] = -1; + } + } +} + +static void print_cg_move(FILE *fplog, + gmx_domdec_t *dd, + gmx_large_int_t step, int cg, int dim, int dir, + gmx_bool bHaveLimitdAndCMOld, real limitd, + rvec cm_old, rvec cm_new, real pos_d) +{ + gmx_domdec_comm_t *comm; + char buf[22]; + + comm = dd->comm; + + fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf)); + if (bHaveLimitdAndCMOld) + { + fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n", + ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim)); + } + else + { + fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n", + ddglatnr(dd, dd->cgindex[cg]), dim2char(dim)); + } + fprintf(fplog, "distance out of cell %f\n", + dir == 1 ? 
            pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
+    if (bHaveLimitdAndCMOld)
+    {
+        fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
+                cm_old[XX], cm_old[YY], cm_old[ZZ]);
+    }
+    fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
+            cm_new[XX], cm_new[YY], cm_new[ZZ]);
+    fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
+            dim2char(dim),
+            comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
+    fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
+            dim2char(dim),
+            comm->cell_x0[dim], comm->cell_x1[dim]);
+}
+
+static void cg_move_error(FILE *fplog,
+                          gmx_domdec_t *dd,
+                          gmx_large_int_t step, int cg, int dim, int dir,
+                          gmx_bool bHaveLimitdAndCMOld, real limitd,
+                          rvec cm_old, rvec cm_new, real pos_d)
+{
+    if (fplog)
+    {
+        print_cg_move(fplog, dd, step, cg, dim, dir,
+                      bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+    }
+    print_cg_move(stderr, dd, step, cg, dim, dir,
+                  bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+    gmx_fatal(FARGS,
+              "A charge group moved too far between two domain decomposition steps\n"
+              "This usually means that your system is not well equilibrated");
+}
+
+static void rotate_state_atom(t_state *state, int a)
+{
+    int est;
+
+    for (est = 0; est < estNR; est++)
+    {
+        if (EST_DISTR(est) && (state->flags & (1<<est)))
+        {
+            switch (est)
+            {
+                case estX:
+                    state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
+                    state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
+                    break;
+                case estV:
+                    state->v[a][YY] = -state->v[a][YY];
+                    state->v[a][ZZ] = -state->v[a][ZZ];
+                    break;
+                case estSDX:
+                    state->sd_X[a][YY] = -state->sd_X[a][YY];
+                    state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
+                    break;
+                case estCGP:
+                    state->cg_p[a][YY] = -state->cg_p[a][YY];
+                    state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
+                    break;
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* These are distances, so not affected by rotation */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in rotate_state_atom");
+            }
+        }
+    }
+}
+
+static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
+{
+    if (natoms > comm->moved_nalloc)
+    {
+        /* Contents should be preserved here */
+        comm->moved_nalloc = over_alloc_dd(natoms);
+        srenew(comm->moved, comm->moved_nalloc);
+    }
+
+    return comm->moved;
+}
+
+static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
+                         gmx_domdec_t *dd,
+                         t_state *state,
+                         ivec tric_dir, matrix tcm,
+                         rvec cell_x0, rvec cell_x1,
+                         rvec limitd, rvec limit0, rvec limit1,
+                         const int *cgindex,
+                         int cg_start, int cg_end,
+                         rvec *cg_cm,
+                         int *move)
+{
+    int      npbcdim;
+    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
+    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
+    int      flag;
+    gmx_bool bScrew;
+    ivec     dev;
+    real     inv_ncg, pos_d;
+    rvec     cm_new;
+
+    npbcdim = dd->npbcdim;
+
+    for (cg = cg_start; cg < cg_end; cg++)
+    {
+        k0   = cgindex[cg];
+        k1   = cgindex[cg+1];
+        nrcg = k1 - k0;
+        if (nrcg == 1)
+        {
+            copy_rvec(state->x[k0], cm_new);
+        }
+        else
+        {
+            inv_ncg = 1.0/nrcg;
+
+            clear_rvec(cm_new);
+            for (k = k0; (k < k1); k++)
+            {
+                rvec_inc(cm_new, state->x[k]);
+            }
+            for (d = 0; (d < DIM); d++)
+            {
+                cm_new[d] = inv_ncg*cm_new[d];
+            }
+        }
+
+        clear_ivec(dev);
+        /* Do pbc and check DD cell boundary crossings */
+        for (d = DIM-1; d >= 0; d--)
+        {
+            if (dd->nc[d] > 1)
+            {
+                bScrew = (dd->bScrewPBC && d == XX);
+                /* Determine the location of this cg in lattice coordinates */
+                pos_d = cm_new[d];
+                if (tric_dir[d])
+                {
+                    for (d2 = d+1; d2 < DIM; d2++)
+                    {
+                        pos_d += cm_new[d2]*tcm[d2][d];
+                    }
+                }
+                /* Put the charge group in the triclinic unit-cell */
+static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
+                         gmx_domdec_t *dd,
+                         t_state *state,
+                         ivec tric_dir, matrix tcm,
+                         rvec cell_x0, rvec cell_x1,
+                         rvec limitd, rvec limit0, rvec limit1,
+                         const int *cgindex,
+                         int cg_start, int cg_end,
+                         rvec *cg_cm,
+                         int *move)
+{
+    int      npbcdim;
+    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
+    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
+    int      flag;
+    gmx_bool bScrew;
+    ivec     dev;
+    real     inv_ncg, pos_d;
+    rvec     cm_new;
+
+    npbcdim = dd->npbcdim;
+
+    for (cg = cg_start; cg < cg_end; cg++)
+    {
+        k0   = cgindex[cg];
+        k1   = cgindex[cg+1];
+        nrcg = k1 - k0;
+        if (nrcg == 1)
+        {
+            copy_rvec(state->x[k0], cm_new);
+        }
+        else
+        {
+            inv_ncg = 1.0/nrcg;
+
+            clear_rvec(cm_new);
+            for (k = k0; (k < k1); k++)
+            {
+                rvec_inc(cm_new, state->x[k]);
+            }
+            for (d = 0; (d < DIM); d++)
+            {
+                cm_new[d] = inv_ncg*cm_new[d];
+            }
+        }
+
+        clear_ivec(dev);
+        /* Do pbc and check DD cell boundary crossings */
+        for (d = DIM-1; d >= 0; d--)
+        {
+            if (dd->nc[d] > 1)
+            {
+                bScrew = (dd->bScrewPBC && d == XX);
+                /* Determine the location of this cg in lattice coordinates */
+                pos_d = cm_new[d];
+                if (tric_dir[d])
+                {
+                    for (d2 = d+1; d2 < DIM; d2++)
+                    {
+                        pos_d += cm_new[d2]*tcm[d2][d];
+                    }
+                }
+                /* Put the charge group in the triclinic unit-cell */
+                if (pos_d >= cell_x1[d])
+                {
+                    if (pos_d >= limit1[d])
+                    {
+                        cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
+                                      cg_cm[cg], cm_new, pos_d);
+                    }
+                    dev[d] = 1;
+                    if (dd->ci[d] == dd->nc[d] - 1)
+                    {
+                        rvec_dec(cm_new, state->box[d]);
+                        if (bScrew)
+                        {
+                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
+                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
+                        }
+                        for (k = k0; (k < k1); k++)
+                        {
+                            rvec_dec(state->x[k], state->box[d]);
+                            if (bScrew)
+                            {
+                                rotate_state_atom(state, k);
+                            }
+                        }
+                    }
+                }
+                else if (pos_d < cell_x0[d])
+                {
+                    if (pos_d < limit0[d])
+                    {
+                        cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
+                                      cg_cm[cg], cm_new, pos_d);
+                    }
+                    dev[d] = -1;
+                    if (dd->ci[d] == 0)
+                    {
+                        rvec_inc(cm_new, state->box[d]);
+                        if (bScrew)
+                        {
+                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
+                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
+                        }
+                        for (k = k0; (k < k1); k++)
+                        {
+                            rvec_inc(state->x[k], state->box[d]);
+                            if (bScrew)
+                            {
+                                rotate_state_atom(state, k);
+                            }
+                        }
+                    }
+                }
+            }
+            else if (d < npbcdim)
+            {
+                /* Put the charge group in the rectangular unit-cell */
+                while (cm_new[d] >= state->box[d][d])
+                {
+                    rvec_dec(cm_new, state->box[d]);
+                    for (k = k0; (k < k1); k++)
+                    {
+                        rvec_dec(state->x[k], state->box[d]);
+                    }
+                }
+                while (cm_new[d] < 0)
+                {
+                    rvec_inc(cm_new, state->box[d]);
+                    for (k = k0; (k < k1); k++)
+                    {
+                        rvec_inc(state->x[k], state->box[d]);
+                    }
+                }
+            }
+        }
+
+        copy_rvec(cm_new, cg_cm[cg]);
+
+        /* Determine where this cg should go */
+        flag = 0;
+        mc   = -1;
+        for (d = 0; d < dd->ndim; d++)
+        {
+            dim = dd->dim[d];
+            if (dev[dim] == 1)
+            {
+                flag |= DD_FLAG_FW(d);
+                if (mc == -1)
+                {
+                    mc = d*2;
+                }
+            }
+            else if (dev[dim] == -1)
+            {
+                flag |= DD_FLAG_BW(d);
+                if (mc == -1)
+                {
+                    if (dd->nc[dim] > 2)
+                    {
+                        mc = d*2 + 1;
+                    }
+                    else
+                    {
+                        mc = d*2;
+                    }
+                }
+            }
+        }
+        /* Temporarily store the flag in move */
+        move[cg] = mc + flag;
+    }
+}
+
+static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
+                               gmx_domdec_t *dd, ivec tric_dir,
+                               t_state *state, rvec **f,
+                               t_forcerec *fr, t_mdatoms *md,
+                               gmx_bool bCompact,
+                               t_nrnb *nrnb,
+                               int *ncg_stay_home,
+                               int *ncg_moved)
+{
+    int                *move;
+    int                 npbcdim;
+    int                 ncg[DIM*2], nat[DIM*2];
+    int                 c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
+    int                 mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
+    int                 sbuf[2], rbuf[2];
+    int                 home_pos_cg, home_pos_at, buf_pos;
+    int                 flag;
+    gmx_bool            bV = FALSE, bSDX = FALSE, bCGP = FALSE;
+    gmx_bool            bScrew;
+    ivec                dev;
+    real                inv_ncg, pos_d;
+    matrix              tcm;
+    rvec               *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
+    atom_id            *cgindex;
+    cginfo_mb_t        *cginfo_mb;
+    gmx_domdec_comm_t  *comm;
+    int                *moved;
+    int                 nthread, thread;
+
+    if (dd->bScrewPBC)
+    {
+        check_screw_box(state->box);
+    }
+
+    comm = dd->comm;
+    if (fr->cutoff_scheme == ecutsGROUP)
+    {
+        cg_cm = fr->cg_cm;
+    }
+
+    for (i = 0; i < estNR; i++)
+    {
+        if (EST_DISTR(i))
+        {
+            switch (i)
+            {
+                case estX: /* Always present */ break;
+                case estV:   bV   = (state->flags & (1<<i)); break;
+                case estSDX: bSDX = (state->flags & (1<<i)); break;
+                case estCGP: bCGP = (state->flags & (1<<i)); break;
+                case estLD_RNG:
+                case estLD_RNGI:
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* No processing required */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
+            }
+        }
+    }
+
+    if (dd->ncg_tot > comm->nalloc_int)
+    {
+        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
+        srenew(comm->buf_int, comm->nalloc_int);
+    }
+    move = comm->buf_int;
+
+    /* Clear the count */
+    for (c = 0; c < dd->ndim*2; c++)
+    {
+        ncg[c] = 0;
+        nat[c] = 0;
+    }
+
+    npbcdim = dd->npbcdim;
+
+    for (d = 0; (d < DIM); d++)
+    {
+        limitd[d] = dd->comm->cellsize_min[d];
+        if (d >= npbcdim && dd->ci[d] == 0)
+        {
+            cell_x0[d] = -GMX_FLOAT_MAX;
+        }
+        else
+        {
+            cell_x0[d] = comm->cell_x0[d];
+        }
+        if (d >= npbcdim && dd->ci[d] ==
dd->nc[d] - 1) + { + cell_x1[d] = GMX_FLOAT_MAX; + } + else + { + cell_x1[d] = comm->cell_x1[d]; + } + if (d < npbcdim) + { + limit0[d] = comm->old_cell_x0[d] - limitd[d]; + limit1[d] = comm->old_cell_x1[d] + limitd[d]; + } + else + { + /* We check after communication if a charge group moved + * more than one cell. Set the pre-comm check limit to float_max. + */ + limit0[d] = -GMX_FLOAT_MAX; + limit1[d] = GMX_FLOAT_MAX; + } + } + + make_tric_corr_matrix(npbcdim, state->box, tcm); + + cgindex = dd->cgindex; + + nthread = gmx_omp_nthreads_get(emntDomdec); + + /* Compute the center of geometry for all home charge groups + * and put them in the box and determine where they should go. + */ +#pragma omp parallel for num_threads(nthread) schedule(static) + for (thread = 0; thread < nthread; thread++) + { + calc_cg_move(fplog, step, dd, state, tric_dir, tcm, + cell_x0, cell_x1, limitd, limit0, limit1, + cgindex, + ( thread *dd->ncg_home)/nthread, + ((thread+1)*dd->ncg_home)/nthread, + fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x, + move); + } + + for (cg = 0; cg < dd->ncg_home; cg++) + { + if (move[cg] >= 0) + { + mc = move[cg]; + flag = mc & ~DD_FLAG_NRCG; + mc = mc & DD_FLAG_NRCG; + move[cg] = mc; + + if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc]) + { + comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1); + srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS); + } + comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg]; + /* We store the cg size in the lower 16 bits + * and the place where the charge group should go + * in the next 6 bits. This saves some communication volume. + */ + nrcg = cgindex[cg+1] - cgindex[cg]; + comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag; + ncg[mc] += 1; + nat[mc] += nrcg; + } + } + + inc_nrnb(nrnb, eNR_CGCM, dd->nat_home); + inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home); + + *ncg_moved = 0; + for (i = 0; i < dd->ndim*2; i++) + { + *ncg_moved += ncg[i]; + } + + nvec = 1; + if (bV) + { + nvec++; + } + if (bSDX) + { + nvec++; + } + if (bCGP) + { + nvec++; + } + + /* Make sure the communication buffers are large enough */ + for (mc = 0; mc < dd->ndim*2; mc++) + { + nvr = ncg[mc] + nat[mc]*nvec; + if (nvr > comm->cgcm_state_nalloc[mc]) + { + comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr); + srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]); + } + } + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + /* Recalculating cg_cm might be cheaper than communicating, + * but that could give rise to rounding issues. + */ + home_pos_cg = + compact_and_copy_vec_cg(dd->ncg_home, move, cgindex, + nvec, cg_cm, comm, bCompact); + break; + case ecutsVERLET: + /* Without charge groups we send the moved atom coordinates + * over twice. This is so the code below can be used without + * many conditionals for both for with and without charge groups. 
+ */ + home_pos_cg = + compact_and_copy_vec_cg(dd->ncg_home, move, cgindex, + nvec, state->x, comm, FALSE); + if (bCompact) + { + home_pos_cg -= *ncg_moved; + } + break; + default: + gmx_incons("unimplemented"); + home_pos_cg = 0; + } + + vec = 0; + home_pos_at = + compact_and_copy_vec_at(dd->ncg_home, move, cgindex, + nvec, vec++, state->x, comm, bCompact); + if (bV) + { + compact_and_copy_vec_at(dd->ncg_home, move, cgindex, + nvec, vec++, state->v, comm, bCompact); + } + if (bSDX) + { + compact_and_copy_vec_at(dd->ncg_home, move, cgindex, + nvec, vec++, state->sd_X, comm, bCompact); + } + if (bCGP) + { + compact_and_copy_vec_at(dd->ncg_home, move, cgindex, + nvec, vec++, state->cg_p, comm, bCompact); + } + + if (bCompact) + { + compact_ind(dd->ncg_home, move, + dd->index_gl, dd->cgindex, dd->gatindex, + dd->ga2la, comm->bLocalCG, + fr->cginfo); + } + else + { + if (fr->cutoff_scheme == ecutsVERLET) + { + moved = get_moved(comm, dd->ncg_home); + + for (k = 0; k < dd->ncg_home; k++) + { + moved[k] = 0; + } + } + else + { + moved = fr->ns.grid->cell_index; + } + + clear_and_mark_ind(dd->ncg_home, move, + dd->index_gl, dd->cgindex, dd->gatindex, + dd->ga2la, comm->bLocalCG, + moved); + } + + cginfo_mb = fr->cginfo_mb; + + *ncg_stay_home = home_pos_cg; + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + ncg_recv = 0; + nat_recv = 0; + nvr = 0; + for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++) + { + cdd = d*2 + dir; + /* Communicate the cg and atom counts */ + sbuf[0] = ncg[cdd]; + sbuf[1] = nat[cdd]; + if (debug) + { + fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n", + d, dir, sbuf[0], sbuf[1]); + } + dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2); + + if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int) + { + comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS); + srenew(comm->buf_int, comm->nalloc_int); + } + + /* Communicate the charge group indices, sizes and flags */ + dd_sendrecv_int(dd, d, dir, + comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS, + comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS); + + nvs = ncg[cdd] + nat[cdd]*nvec; + i = rbuf[0] + rbuf[1] *nvec; + vec_rvec_check_alloc(&comm->vbuf, nvr+i); + + /* Communicate cgcm and state */ + dd_sendrecv_rvec(dd, d, dir, + comm->cgcm_state[cdd], nvs, + comm->vbuf.v+nvr, i); + ncg_recv += rbuf[0]; + nat_recv += rbuf[1]; + nvr += i; + } + + /* Process the received charge groups */ + buf_pos = 0; + for (cg = 0; cg < ncg_recv; cg++) + { + flag = comm->buf_int[cg*DD_CGIBS+1]; + + if (dim >= npbcdim && dd->nc[dim] > 2) + { + /* No pbc in this dim and more than one domain boundary. + * We do a separate check if a charge group didn't move too far. + */ + if (((flag & DD_FLAG_FW(d)) && + comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) || + ((flag & DD_FLAG_BW(d)) && + comm->vbuf.v[buf_pos][dim] < cell_x0[dim])) + { + cg_move_error(fplog, dd, step, cg, dim, + (flag & DD_FLAG_FW(d)) ? 1 : 0, + FALSE, 0, + comm->vbuf.v[buf_pos], + comm->vbuf.v[buf_pos], + comm->vbuf.v[buf_pos][dim]); + } + } + + mc = -1; + if (d < dd->ndim-1) + { + /* Check which direction this cg should go */ + for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++) + { + if (dd->bGridJump) + { + /* The cell boundaries for dimension d2 are not equal + * for each cell row of the lower dimension(s), + * therefore we might need to redetermine where + * this cg should go. + */ + dim2 = dd->dim[d2]; + /* If this cg crosses the box boundary in dimension d2 + * we can use the communicated flag, so we do not + * have to worry about pbc. 
+ */ + if (!((dd->ci[dim2] == dd->nc[dim2]-1 && + (flag & DD_FLAG_FW(d2))) || + (dd->ci[dim2] == 0 && + (flag & DD_FLAG_BW(d2))))) + { + /* Clear the two flags for this dimension */ + flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2)); + /* Determine the location of this cg + * in lattice coordinates + */ + pos_d = comm->vbuf.v[buf_pos][dim2]; + if (tric_dir[dim2]) + { + for (d3 = dim2+1; d3 < DIM; d3++) + { + pos_d += + comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2]; + } + } + /* Check of we are not at the box edge. + * pbc is only handled in the first step above, + * but this check could move over pbc while + * the first step did not due to different rounding. + */ + if (pos_d >= cell_x1[dim2] && + dd->ci[dim2] != dd->nc[dim2]-1) + { + flag |= DD_FLAG_FW(d2); + } + else if (pos_d < cell_x0[dim2] && + dd->ci[dim2] != 0) + { + flag |= DD_FLAG_BW(d2); + } + comm->buf_int[cg*DD_CGIBS+1] = flag; + } + } + /* Set to which neighboring cell this cg should go */ + if (flag & DD_FLAG_FW(d2)) + { + mc = d2*2; + } + else if (flag & DD_FLAG_BW(d2)) + { + if (dd->nc[dd->dim[d2]] > 2) + { + mc = d2*2+1; + } + else + { + mc = d2*2; + } + } + } + } + + nrcg = flag & DD_FLAG_NRCG; + if (mc == -1) + { + if (home_pos_cg+1 > dd->cg_nalloc) + { + dd->cg_nalloc = over_alloc_dd(home_pos_cg+1); + srenew(dd->index_gl, dd->cg_nalloc); + srenew(dd->cgindex, dd->cg_nalloc+1); + } + /* Set the global charge group index and size */ + dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS]; + dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg; + /* Copy the state from the buffer */ + dd_check_alloc_ncg(fr, state, f, home_pos_cg+1); + if (fr->cutoff_scheme == ecutsGROUP) + { + cg_cm = fr->cg_cm; + copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]); + } + buf_pos++; + + /* Set the cginfo */ + fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb, + dd->index_gl[home_pos_cg]); + if (comm->bLocalCG) + { + comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE; + } + + if (home_pos_at+nrcg > state->nalloc) + { + dd_realloc_state(state, f, home_pos_at+nrcg); + } + for (i = 0; i < nrcg; i++) + { + copy_rvec(comm->vbuf.v[buf_pos++], + state->x[home_pos_at+i]); + } + if (bV) + { + for (i = 0; i < nrcg; i++) + { + copy_rvec(comm->vbuf.v[buf_pos++], + state->v[home_pos_at+i]); + } + } + if (bSDX) + { + for (i = 0; i < nrcg; i++) + { + copy_rvec(comm->vbuf.v[buf_pos++], + state->sd_X[home_pos_at+i]); + } + } + if (bCGP) + { + for (i = 0; i < nrcg; i++) + { + copy_rvec(comm->vbuf.v[buf_pos++], + state->cg_p[home_pos_at+i]); + } + } + home_pos_cg += 1; + home_pos_at += nrcg; + } + else + { + /* Reallocate the buffers if necessary */ + if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc]) + { + comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1); + srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS); + } + nvr = ncg[mc] + nat[mc]*nvec; + if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc]) + { + comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec); + srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]); + } + /* Copy from the receive to the send buffers */ + memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS, + comm->buf_int + cg*DD_CGIBS, + DD_CGIBS*sizeof(int)); + memcpy(comm->cgcm_state[mc][nvr], + comm->vbuf.v[buf_pos], + (1+nrcg*nvec)*sizeof(rvec)); + buf_pos += 1 + nrcg*nvec; + ncg[mc] += 1; + nat[mc] += nrcg; + } + } + } + + /* With sorting (!bCompact) the indices are now only partially up to date + * and ncg_home and nat_home are not the real count, since there are + * "holes" in the arrays for the charge groups that moved to 
neighbors. + */ + if (fr->cutoff_scheme == ecutsVERLET) + { + moved = get_moved(comm, home_pos_cg); + + for (i = dd->ncg_home; i < home_pos_cg; i++) + { + moved[i] = 0; + } + } + dd->ncg_home = home_pos_cg; + dd->nat_home = home_pos_at; + + if (debug) + { + fprintf(debug, + "Finished repartitioning: cgs moved out %d, new home %d\n", + *ncg_moved, dd->ncg_home-*ncg_moved); + + } +} + +void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl) +{ + dd->comm->cycl[ddCycl] += cycles; + dd->comm->cycl_n[ddCycl]++; + if (cycles > dd->comm->cycl_max[ddCycl]) + { + dd->comm->cycl_max[ddCycl] = cycles; + } +} + +static double force_flop_count(t_nrnb *nrnb) +{ + int i; + double sum; + const char *name; + + sum = 0; + for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++) + { + /* To get closer to the real timings, we half the count + * for the normal loops and again half it for water loops. + */ + name = nrnb_str(i); + if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL) + { + sum += nrnb->n[i]*0.25*cost_nrnb(i); + } + else + { + sum += nrnb->n[i]*0.50*cost_nrnb(i); + } + } + for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++) + { + name = nrnb_str(i); + if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL) + { + sum += nrnb->n[i]*cost_nrnb(i); + } + } + for (i = eNR_BONDS; i <= eNR_WALLS; i++) + { + sum += nrnb->n[i]*cost_nrnb(i); + } + + return sum; +} + +void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb) +{ + if (dd->comm->eFlop) + { + dd->comm->flop -= force_flop_count(nrnb); + } +} +void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb) +{ + if (dd->comm->eFlop) + { + dd->comm->flop += force_flop_count(nrnb); + dd->comm->flop_n++; + } +} + +static void clear_dd_cycle_counts(gmx_domdec_t *dd) +{ + int i; + + for (i = 0; i < ddCyclNr; i++) + { + dd->comm->cycl[i] = 0; + dd->comm->cycl_n[i] = 0; + dd->comm->cycl_max[i] = 0; + } + dd->comm->flop = 0; + dd->comm->flop_n = 0; +} + +static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle) +{ + gmx_domdec_comm_t *comm; + gmx_domdec_load_t *load; + gmx_domdec_root_t *root = NULL; + int d, dim, cid, i, pos; + float cell_frac = 0, sbuf[DD_NLOAD_MAX]; + gmx_bool bSepPME; + + if (debug) + { + fprintf(debug, "get_load_distribution start\n"); + } + + wallcycle_start(wcycle, ewcDDCOMMLOAD); + + comm = dd->comm; + + bSepPME = (dd->pme_nodeid >= 0); + + for (d = dd->ndim-1; d >= 0; d--) + { + dim = dd->dim[d]; + /* Check if we participate in the communication in this dimension */ + if (d == dd->ndim-1 || + (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0)) + { + load = &comm->load[d]; + if (dd->bGridJump) + { + cell_frac = comm->cell_f1[d] - comm->cell_f0[d]; + } + pos = 0; + if (d == dd->ndim-1) + { + sbuf[pos++] = dd_force_load(comm); + sbuf[pos++] = sbuf[0]; + if (dd->bGridJump) + { + sbuf[pos++] = sbuf[0]; + sbuf[pos++] = cell_frac; + if (d > 0) + { + sbuf[pos++] = comm->cell_f_max0[d]; + sbuf[pos++] = comm->cell_f_min1[d]; + } + } + if (bSepPME) + { + sbuf[pos++] = comm->cycl[ddCyclPPduringPME]; + sbuf[pos++] = comm->cycl[ddCyclPME]; + } + } + else + { + sbuf[pos++] = comm->load[d+1].sum; + sbuf[pos++] = comm->load[d+1].max; + if (dd->bGridJump) + { + sbuf[pos++] = comm->load[d+1].sum_m; + sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac; + sbuf[pos++] = comm->load[d+1].flags; + if (d > 0) + { + sbuf[pos++] = comm->cell_f_max0[d]; + sbuf[pos++] = comm->cell_f_min1[d]; + } + } + if (bSepPME) + { + sbuf[pos++] = comm->load[d+1].mdf; + sbuf[pos++] = comm->load[d+1].pme; + } + } + load->nload = pos; + /* 
Communicate a row in DD direction d.
+             * The communicators are set up such that the root always has rank 0.
+             */
+#ifdef GMX_MPI
+            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
+                       load->load, load->nload*sizeof(float), MPI_BYTE,
+                       0, comm->mpi_comm_load[d]);
+#endif
+            if (dd->ci[dim] == dd->master_ci[dim])
+            {
+                /* We are the root, process this row */
+                if (comm->bDynLoadBal)
+                {
+                    root = comm->root[d];
+                }
+                load->sum      = 0;
+                load->max      = 0;
+                load->sum_m    = 0;
+                load->cvol_min = 1;
+                load->flags    = 0;
+                load->mdf      = 0;
+                load->pme      = 0;
+                pos            = 0;
+                for (i = 0; i < dd->nc[dim]; i++)
+                {
+                    load->sum += load->load[pos++];
+                    load->max  = max(load->max, load->load[pos]);
+                    pos++;
+                    if (dd->bGridJump)
+                    {
+                        if (root->bLimited)
+                        {
+                            /* This direction could not be load balanced properly,
+                             * therefore we need to use the maximum instead of
+                             * the average load.
+                             */
+                            load->sum_m = max(load->sum_m, load->load[pos]);
+                        }
+                        else
+                        {
+                            load->sum_m += load->load[pos];
+                        }
+                        pos++;
+                        load->cvol_min = min(load->cvol_min, load->load[pos]);
+                        pos++;
+                        if (d < dd->ndim-1)
+                        {
+                            load->flags = (int)(load->load[pos++] + 0.5);
+                        }
+                        if (d > 0)
+                        {
+                            root->cell_f_max0[i] = load->load[pos++];
+                            root->cell_f_min1[i] = load->load[pos++];
+                        }
+                    }
+                    if (bSepPME)
+                    {
+                        load->mdf = max(load->mdf, load->load[pos]);
+                        pos++;
+                        load->pme = max(load->pme, load->load[pos]);
+                        pos++;
+                    }
+                }
+                if (comm->bDynLoadBal && root->bLimited)
+                {
+                    load->sum_m *= dd->nc[dim];
+                    load->flags |= (1<<d);
+                }
+            }
+        }
+    }
+
+    if (DDMASTER(dd))
+    {
+        comm->nload     += dd_load_count(comm);
+        comm->load_step += comm->cycl[ddCyclStep];
+        comm->load_sum  += comm->load[0].sum;
+        comm->load_max  += comm->load[0].max;
+        if (comm->bDynLoadBal)
+        {
+            for (d = 0; d < dd->ndim; d++)
+            {
+                if (comm->load[0].flags & (1<<d))
+                {
+                    comm->load_lim[d]++;
+                }
+            }
+        }
+        if (bSepPME)
+        {
+            comm->load_mdf += comm->load[0].mdf;
+            comm->load_pme += comm->load[0].pme;
+        }
+    }
+
+    wallcycle_stop(wcycle, ewcDDCOMMLOAD);
+
+    if (debug)
+    {
+        fprintf(debug, "get_load_distribution finished\n");
+    }
+}
+
+static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
+{
+    /* Return the relative performance loss on the total run time
+     * due to the force calculation load imbalance.
+     */
+    if (dd->comm->nload > 0)
+    {
+        return
+            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
+            (dd->comm->load_step*dd->nnodes);
+    }
+    else
+    {
+        return 0;
+    }
+}
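
/* The statistics printed by print_dd_load_av() below follow directly from the
 * loads accumulated above: the imbalance is max/average - 1 over the PP
 * ranks, and the lost fraction is the per-step idle time (max - average)
 * relative to the total step time, as in dd_force_imb_perf_loss(). A worked
 * example with made-up numbers (not taken from any real run):
 */
#include <stdio.h>

int main(void)
{
    double nnodes    = 4;    /* hypothetical number of PP ranks   */
    double load_sum  = 360;  /* sum of the per-rank force loads   */
    double load_max  = 120;  /* accumulated per-step maximum load */
    double load_step = 200;  /* accumulated total step time       */

    double imbal = load_max*nnodes/load_sum - 1;
    double lossf = (load_max*nnodes - load_sum)/(load_step*nnodes);

    /* The average load is 90, so the busiest rank is 33 % above average
     * and (120 - 90)/200 = 15 % of the run time is spent waiting.
     */
    printf("imbalance %.2f, lost fraction %.3f\n", imbal, lossf);
    return 0;
}
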
+static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
+{
+    char               buf[STRLEN];
+    int                npp, npme, nnodes, d, limp;
+    float              imbal, pme_f_ratio, lossf, lossp = 0;
+    gmx_bool           bLim;
+    gmx_domdec_comm_t *comm;
+
+    comm = dd->comm;
+    if (DDMASTER(dd) && comm->nload > 0)
+    {
+        npp    = dd->nnodes;
+        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
+        nnodes = npp + npme;
+        imbal  = comm->load_max*npp/comm->load_sum - 1;
+        lossf  = dd_force_imb_perf_loss(dd);
+        sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
+        fprintf(fplog, "%s", buf);
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s", buf);
+        sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
+        fprintf(fplog, "%s", buf);
+        fprintf(stderr, "%s", buf);
+        bLim = FALSE;
+        if (comm->bDynLoadBal)
+        {
+            sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
+            for (d = 0; d < dd->ndim; d++)
+            {
+                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
+                sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
+                if (limp >= 50)
+                {
+                    bLim = TRUE;
+                }
+            }
+            sprintf(buf+strlen(buf), "\n");
+            fprintf(fplog, "%s", buf);
+            fprintf(stderr, "%s", buf);
+        }
+        if (npme > 0)
+        {
+            pme_f_ratio = comm->load_pme/comm->load_mdf;
+            lossp       = (comm->load_pme - comm->load_mdf)/comm->load_step;
+            if (lossp <= 0)
+            {
+                lossp *= (float)npme/(float)nnodes;
+            }
+            else
+            {
+                lossp *= (float)npp/(float)nnodes;
+            }
+            sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
+            fprintf(fplog, "%s", buf);
+            fprintf(stderr, "%s", buf);
+            sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
+            fprintf(fplog, "%s", buf);
+            fprintf(stderr, "%s", buf);
+        }
+        fprintf(fplog, "\n");
+        fprintf(stderr, "\n");
+
+        if (lossf >= DD_PERF_LOSS)
+        {
+            sprintf(buf,
+                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
+                    "      in the domain decomposition.\n", lossf*100);
+            if (!comm->bDynLoadBal)
+            {
+                sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
+            }
+            else if (bLim)
+            {
+                sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
+            }
+            fprintf(fplog, "%s\n", buf);
+            fprintf(stderr, "%s\n", buf);
+        }
+        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
+        {
+            sprintf(buf,
+                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
+                    "      had %s work to do than the PP nodes.\n"
+                    "      You might want to %s the number of PME nodes\n"
+                    "      or %s the cut-off and the grid spacing.\n",
+                    fabs(lossp*100),
+                    (lossp < 0) ? "less" : "more",
+                    (lossp < 0) ? "decrease" : "increase",
+                    (lossp < 0) ? "decrease" : "increase");
+            fprintf(fplog, "%s\n", buf);
+            fprintf(stderr, "%s\n", buf);
+        }
+    }
+}
+
+static float dd_vol_min(gmx_domdec_t *dd)
+{
+    return dd->comm->load[0].cvol_min*dd->nnodes;
+}
+
+static gmx_bool dd_load_flags(gmx_domdec_t *dd)
+{
+    return dd->comm->load[0].flags;
+}
+
+static float dd_f_imbal(gmx_domdec_t *dd)
+{
+    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
+}
+
+float dd_pme_f_ratio(gmx_domdec_t *dd)
+{
+    if (dd->comm->cycl_n[ddCyclPME] > 0)
+    {
+        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
+    }
+    else
+    {
+        return -1.0;
+    }
+}
+
+static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
+{
+    int  flags, d;
+    char buf[22];
+
+    flags = dd_load_flags(dd);
+    if (flags)
+    {
+        fprintf(fplog,
+                "DD load balancing is limited by minimum cell size in dimension");
+        for (d = 0; d < dd->ndim; d++)
+        {
+            if (flags & (1<<d))
+            {
+                fprintf(fplog, " %c", dim2char(dd->dim[d]));
+            }
+        }
+        fprintf(fplog, "\n");
+    }
+    fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
+    if (dd->comm->bDynLoadBal)
+    {
+        fprintf(fplog, " vol min/aver %5.3f%c",
+                dd_vol_min(dd), flags ? '!'
: ' '); + } + fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100); + if (dd->comm->cycl_n[ddCyclPME]) + { + fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd)); + } + fprintf(fplog, "\n\n"); +} + +static void dd_print_load_verbose(gmx_domdec_t *dd) +{ + if (dd->comm->bDynLoadBal) + { + fprintf(stderr, "vol %4.2f%c ", + dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' '); + } + fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5)); + if (dd->comm->cycl_n[ddCyclPME]) + { + fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd)); + } +} + +#ifdef GMX_MPI +static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc) +{ + MPI_Comm c_row; + int dim, i, rank; + ivec loc_c; + gmx_domdec_root_t *root; + gmx_bool bPartOfGroup = FALSE; + + dim = dd->dim[dim_ind]; + copy_ivec(loc, loc_c); + for (i = 0; i < dd->nc[dim]; i++) + { + loc_c[dim] = i; + rank = dd_index(dd->nc, loc_c); + if (rank == dd->rank) + { + /* This process is part of the group */ + bPartOfGroup = TRUE; + } + } + MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank, + &c_row); + if (bPartOfGroup) + { + dd->comm->mpi_comm_load[dim_ind] = c_row; + if (dd->comm->eDLB != edlbNO) + { + if (dd->ci[dim] == dd->master_ci[dim]) + { + /* This is the root process of this row */ + snew(dd->comm->root[dim_ind], 1); + root = dd->comm->root[dim_ind]; + snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind)); + snew(root->old_cell_f, dd->nc[dim]+1); + snew(root->bCellMin, dd->nc[dim]); + if (dim_ind > 0) + { + snew(root->cell_f_max0, dd->nc[dim]); + snew(root->cell_f_min1, dd->nc[dim]); + snew(root->bound_min, dd->nc[dim]); + snew(root->bound_max, dd->nc[dim]); + } + snew(root->buf_ncd, dd->nc[dim]); + } + else + { + /* This is not a root process, we only need to receive cell_f */ + snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind)); + } + } + if (dd->ci[dim] == dd->master_ci[dim]) + { + snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX); + } + } +} +#endif + +static void make_load_communicators(gmx_domdec_t *dd) +{ +#ifdef GMX_MPI + int dim0, dim1, i, j; + ivec loc; + + if (debug) + { + fprintf(debug, "Making load communicators\n"); + } + + snew(dd->comm->load, dd->ndim); + snew(dd->comm->mpi_comm_load, dd->ndim); + + clear_ivec(loc); + make_load_communicator(dd, 0, loc); + if (dd->ndim > 1) + { + dim0 = dd->dim[0]; + for (i = 0; i < dd->nc[dim0]; i++) + { + loc[dim0] = i; + make_load_communicator(dd, 1, loc); + } + } + if (dd->ndim > 2) + { + dim0 = dd->dim[0]; + for (i = 0; i < dd->nc[dim0]; i++) + { + loc[dim0] = i; + dim1 = dd->dim[1]; + for (j = 0; j < dd->nc[dim1]; j++) + { + loc[dim1] = j; + make_load_communicator(dd, 2, loc); + } + } + } + + if (debug) + { + fprintf(debug, "Finished making load communicators\n"); + } +#endif +} + +void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd) +{ + gmx_bool bZYX; + int d, dim, i, j, m; + ivec tmp, s; + int nzone, nzonep; + ivec dd_zp[DD_MAXIZONE]; + gmx_domdec_zones_t *zones; + gmx_domdec_ns_ranges_t *izone; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + copy_ivec(dd->ci, tmp); + tmp[dim] = (tmp[dim] + 1) % dd->nc[dim]; + dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp); + copy_ivec(dd->ci, tmp); + tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim]; + dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp); + if (debug) + { + fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n", + dd->rank, dim, + dd->neighbor[d][0], + dd->neighbor[d][1]); + } + } + + if (fplog) + { + fprintf(fplog, "\nMaking %dD domain decomposition 
grid %d x %d x %d, home cell index %d %d %d\n\n", + dd->ndim, + dd->nc[XX], dd->nc[YY], dd->nc[ZZ], + dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + switch (dd->ndim) + { + case 3: + nzone = dd_z3n; + nzonep = dd_zp3n; + for (i = 0; i < nzonep; i++) + { + copy_ivec(dd_zp3[i], dd_zp[i]); + } + break; + case 2: + nzone = dd_z2n; + nzonep = dd_zp2n; + for (i = 0; i < nzonep; i++) + { + copy_ivec(dd_zp2[i], dd_zp[i]); + } + break; + case 1: + nzone = dd_z1n; + nzonep = dd_zp1n; + for (i = 0; i < nzonep; i++) + { + copy_ivec(dd_zp1[i], dd_zp[i]); + } + break; + default: + gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition"); + nzone = 0; + nzonep = 0; + } + + zones = &dd->comm->zones; + + for (i = 0; i < nzone; i++) + { + m = 0; + clear_ivec(zones->shift[i]); + for (d = 0; d < dd->ndim; d++) + { + zones->shift[i][dd->dim[d]] = dd_zo[i][m++]; + } + } + + zones->n = nzone; + for (i = 0; i < nzone; i++) + { + for (d = 0; d < DIM; d++) + { + s[d] = dd->ci[d] - zones->shift[i][d]; + if (s[d] < 0) + { + s[d] += dd->nc[d]; + } + else if (s[d] >= dd->nc[d]) + { + s[d] -= dd->nc[d]; + } + } + } + zones->nizone = nzonep; + for (i = 0; i < zones->nizone; i++) + { + if (dd_zp[i][0] != i) + { + gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup"); + } + izone = &zones->izone[i]; + izone->j0 = dd_zp[i][1]; + izone->j1 = dd_zp[i][2]; + for (dim = 0; dim < DIM; dim++) + { + if (dd->nc[dim] == 1) + { + /* All shifts should be allowed */ + izone->shift0[dim] = -1; + izone->shift1[dim] = 1; + } + else + { + /* + izone->shift0[d] = 0; + izone->shift1[d] = 0; + for(j=izone->j0; jj1; j++) { + if (dd->shift[j][d] > dd->shift[i][d]) + izone->shift0[d] = -1; + if (dd->shift[j][d] < dd->shift[i][d]) + izone->shift1[d] = 1; + } + */ + + int shift_diff; + + /* Assume the shift are not more than 1 cell */ + izone->shift0[dim] = 1; + izone->shift1[dim] = -1; + for (j = izone->j0; j < izone->j1; j++) + { + shift_diff = zones->shift[j][dim] - zones->shift[i][dim]; + if (shift_diff < izone->shift0[dim]) + { + izone->shift0[dim] = shift_diff; + } + if (shift_diff > izone->shift1[dim]) + { + izone->shift1[dim] = shift_diff; + } + } + } + } + } + + if (dd->comm->eDLB != edlbNO) + { + snew(dd->comm->root, dd->ndim); + } + + if (dd->comm->bRecordLoad) + { + make_load_communicators(dd); + } +} + +static void make_pp_communicator(FILE *fplog, t_commrec *cr, int reorder) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + int i, rank, *buf; + ivec periods; +#ifdef GMX_MPI + MPI_Comm comm_cart; +#endif + + dd = cr->dd; + comm = dd->comm; + +#ifdef GMX_MPI + if (comm->bCartesianPP) + { + /* Set up cartesian communication for the particle-particle part */ + if (fplog) + { + fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n", + dd->nc[XX], dd->nc[YY], dd->nc[ZZ]); + } + + for (i = 0; i < DIM; i++) + { + periods[i] = TRUE; + } + MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder, + &comm_cart); + /* We overwrite the old communicator with the new cartesian one */ + cr->mpi_comm_mygroup = comm_cart; + } + + dd->mpi_comm_all = cr->mpi_comm_mygroup; + MPI_Comm_rank(dd->mpi_comm_all, &dd->rank); + + if (comm->bCartesianPP_PME) + { + /* Since we want to use the original cartesian setup for sim, + * and not the one after split, we need to make an index. 
+ */ + snew(comm->ddindex2ddnodeid, dd->nnodes); + comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank; + gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr); + /* Get the rank of the DD master, + * above we made sure that the master node is a PP node. + */ + if (MASTER(cr)) + { + rank = dd->rank; + } + else + { + rank = 0; + } + MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all); + } + else if (comm->bCartesianPP) + { + if (cr->npmenodes == 0) + { + /* The PP communicator is also + * the communicator for this simulation + */ + cr->mpi_comm_mysim = cr->mpi_comm_mygroup; + } + cr->nodeid = dd->rank; + + MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci); + + /* We need to make an index to go from the coordinates + * to the nodeid of this simulation. + */ + snew(comm->ddindex2simnodeid, dd->nnodes); + snew(buf, dd->nnodes); + if (cr->duty & DUTY_PP) + { + buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid; + } + /* Communicate the ddindex to simulation nodeid index */ + MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM, + cr->mpi_comm_mysim); + sfree(buf); + + /* Determine the master coordinates and rank. + * The DD master should be the same node as the master of this sim. + */ + for (i = 0; i < dd->nnodes; i++) + { + if (comm->ddindex2simnodeid[i] == 0) + { + ddindex2xyz(dd->nc, i, dd->master_ci); + MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank); + } + } + if (debug) + { + fprintf(debug, "The master rank is %d\n", dd->masterrank); + } + } + else + { + /* No Cartesian communicators */ + /* We use the rank in dd->comm->all as DD index */ + ddindex2xyz(dd->nc, dd->rank, dd->ci); + /* The simulation master nodeid is 0, so the DD master rank is also 0 */ + dd->masterrank = 0; + clear_ivec(dd->master_ci); + } +#endif + + if (fplog) + { + fprintf(fplog, + "Domain decomposition nodeid %d, coordinates %d %d %d\n\n", + dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + if (debug) + { + fprintf(debug, + "Domain decomposition nodeid %d, coordinates %d %d %d\n\n", + dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } +} + +static void receive_ddindex2simnodeid(t_commrec *cr) +{ + gmx_domdec_t *dd; + + gmx_domdec_comm_t *comm; + int *buf; + + dd = cr->dd; + comm = dd->comm; + +#ifdef GMX_MPI + if (!comm->bCartesianPP_PME && comm->bCartesianPP) + { + snew(comm->ddindex2simnodeid, dd->nnodes); + snew(buf, dd->nnodes); + if (cr->duty & DUTY_PP) + { + buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid; + } +#ifdef GMX_MPI + /* Communicate the ddindex to simulation nodeid index */ + MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM, + cr->mpi_comm_mysim); +#endif + sfree(buf); + } +#endif +} + +static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd, + int ncg, int natoms) +{ + gmx_domdec_master_t *ma; + int i; + + snew(ma, 1); + + snew(ma->ncg, dd->nnodes); + snew(ma->index, dd->nnodes+1); + snew(ma->cg, ncg); + snew(ma->nat, dd->nnodes); + snew(ma->ibuf, dd->nnodes*2); + snew(ma->cell_x, DIM); + for (i = 0; i < DIM; i++) + { + snew(ma->cell_x[i], dd->nc[i]+1); + } + + if (dd->nnodes <= GMX_DD_NNODES_SENDRECV) + { + ma->vbuf = NULL; + } + else + { + snew(ma->vbuf, natoms); + } + + return ma; +} + +static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order, + int reorder) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + int i, rank; + gmx_bool bDiv[DIM]; + ivec periods; +#ifdef GMX_MPI + MPI_Comm comm_cart; +#endif + + dd = cr->dd; + comm = dd->comm; + + if (comm->bCartesianPP) + { + 
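        /* Worked example of the divisibility test below (made-up numbers, not
         * from this run): a 3 x 2 x 2 PP grid has dd->nnodes = 12; with 6
         * PME-only nodes, (6*2) % 12 == 0 in y, so the PME nodes can be
         * stacked as complete planes along y and bDiv[YY] comes out TRUE,
         * while 4 PME-only nodes give (4*2) % 12 == 8 and bDiv[YY] FALSE.
         */
        {
            int ex_nnodes = 12, ex_npme = 6, ex_nc_y = 2;
            int ex_bdiv_y = ((ex_npme*ex_nc_y) % ex_nnodes == 0);

            (void)ex_bdiv_y; /* evaluates to 1 (TRUE) for these numbers */
        }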
for (i = 1; i < DIM; i++) + { + bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0); + } + if (bDiv[YY] || bDiv[ZZ]) + { + comm->bCartesianPP_PME = TRUE; + /* If we have 2D PME decomposition, which is always in x+y, + * we stack the PME only nodes in z. + * Otherwise we choose the direction that provides the thinnest slab + * of PME only nodes as this will have the least effect + * on the PP communication. + * But for the PME communication the opposite might be better. + */ + if (bDiv[ZZ] && (comm->npmenodes_y > 1 || + !bDiv[YY] || + dd->nc[YY] > dd->nc[ZZ])) + { + comm->cartpmedim = ZZ; + } + else + { + comm->cartpmedim = YY; + } + comm->ntot[comm->cartpmedim] + += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes; + } + else if (fplog) + { + fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]); + fprintf(fplog, + "Will not use a Cartesian communicator for PP <-> PME\n\n"); + } + } + +#ifdef GMX_MPI + if (comm->bCartesianPP_PME) + { + if (fplog) + { + fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]); + } + + for (i = 0; i < DIM; i++) + { + periods[i] = TRUE; + } + MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder, + &comm_cart); + + MPI_Comm_rank(comm_cart, &rank); + if (MASTERNODE(cr) && rank != 0) + { + gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this"); + } + + /* With this assigment we loose the link to the original communicator + * which will usually be MPI_COMM_WORLD, unless have multisim. + */ + cr->mpi_comm_mysim = comm_cart; + cr->sim_nodeid = rank; + + MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci); + + if (fplog) + { + fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n", + cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]); + } + + if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim]) + { + cr->duty = DUTY_PP; + } + if (cr->npmenodes == 0 || + dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim]) + { + cr->duty = DUTY_PME; + } + + /* Split the sim communicator into PP and PME only nodes */ + MPI_Comm_split(cr->mpi_comm_mysim, + cr->duty, + dd_index(comm->ntot, dd->ci), + &cr->mpi_comm_mygroup); + } + else + { + switch (dd_node_order) + { + case ddnoPP_PME: + if (fplog) + { + fprintf(fplog, "Order of the nodes: PP first, PME last\n"); + } + break; + case ddnoINTERLEAVE: + /* Interleave the PP-only and PME-only nodes, + * as on clusters with dual-core machines this will double + * the communication bandwidth of the PME processes + * and thus speed up the PP <-> PME and inter PME communication. + */ + if (fplog) + { + fprintf(fplog, "Interleaving PP and PME nodes\n"); + } + comm->pmenodes = dd_pmenodes(cr); + break; + case ddnoCARTESIAN: + break; + default: + gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order); + } + + if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1) + { + cr->duty = DUTY_PME; + } + else + { + cr->duty = DUTY_PP; + } + + /* Split the sim communicator into PP and PME only nodes */ + MPI_Comm_split(cr->mpi_comm_mysim, + cr->duty, + cr->nodeid, + &cr->mpi_comm_mygroup); + MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid); + } +#endif + + if (fplog) + { + fprintf(fplog, "This is a %s only node\n\n", + (cr->duty & DUTY_PP) ? 
"particle-particle" : "PME-mesh"); + } +} + +void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + int CartReorder; + + dd = cr->dd; + comm = dd->comm; + + copy_ivec(dd->nc, comm->ntot); + + comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN); + comm->bCartesianPP_PME = FALSE; + + /* Reorder the nodes by default. This might change the MPI ranks. + * Real reordering is only supported on very few architectures, + * Blue Gene is one of them. + */ + CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL); + + if (cr->npmenodes > 0) + { + /* Split the communicator into a PP and PME part */ + split_communicator(fplog, cr, dd_node_order, CartReorder); + if (comm->bCartesianPP_PME) + { + /* We (possibly) reordered the nodes in split_communicator, + * so it is no longer required in make_pp_communicator. + */ + CartReorder = FALSE; + } + } + else + { + /* All nodes do PP and PME */ +#ifdef GMX_MPI + /* We do not require separate communicators */ + cr->mpi_comm_mygroup = cr->mpi_comm_mysim; +#endif + } + + if (cr->duty & DUTY_PP) + { + /* Copy or make a new PP communicator */ + make_pp_communicator(fplog, cr, CartReorder); + } + else + { + receive_ddindex2simnodeid(cr); + } + + if (!(cr->duty & DUTY_PME)) + { + /* Set up the commnuication to our PME node */ + dd->pme_nodeid = dd_simnode2pmenode(cr, cr->sim_nodeid); + dd->pme_receive_vir_ener = receive_vir_ener(cr); + if (debug) + { + fprintf(debug, "My pme_nodeid %d receive ener %d\n", + dd->pme_nodeid, dd->pme_receive_vir_ener); + } + } + else + { + dd->pme_nodeid = -1; + } + + if (DDMASTER(dd)) + { + dd->ma = init_gmx_domdec_master_t(dd, + comm->cgs_gl.nr, + comm->cgs_gl.index[comm->cgs_gl.nr]); + } +} + +static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string) +{ + real *slb_frac, tot; + int i, n; + double dbl; + + slb_frac = NULL; + if (nc > 1 && size_string != NULL) + { + if (fplog) + { + fprintf(fplog, "Using static load balancing for the %s direction\n", + dir); + } + snew(slb_frac, nc); + tot = 0; + for (i = 0; i < nc; i++) + { + dbl = 0; + sscanf(size_string, "%lf%n", &dbl, &n); + if (dbl == 0) + { + gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string); + } + slb_frac[i] = dbl; + size_string += n; + tot += slb_frac[i]; + } + /* Normalize */ + if (fplog) + { + fprintf(fplog, "Relative cell sizes:"); + } + for (i = 0; i < nc; i++) + { + slb_frac[i] /= tot; + if (fplog) + { + fprintf(fplog, " %5.3f", slb_frac[i]); + } + } + if (fplog) + { + fprintf(fplog, "\n"); + } + } + + return slb_frac; +} + +static int multi_body_bondeds_count(gmx_mtop_t *mtop) +{ + int n, nmol, ftype; + gmx_mtop_ilistloop_t iloop; + t_ilist *il; + + n = 0; + iloop = gmx_mtop_ilistloop_init(mtop); + while (gmx_mtop_ilistloop_next(iloop, &il, &nmol)) + { + for (ftype = 0; ftype < F_NRE; ftype++) + { + if ((interaction_function[ftype].flags & IF_BOND) && + NRAL(ftype) > 2) + { + n += nmol*il[ftype].nr/(1 + NRAL(ftype)); + } + } + } + + return n; +} + +static int dd_nst_env(FILE *fplog, const char *env_var, int def) +{ + char *val; + int nst; + + nst = def; + val = getenv(env_var); + if (val) + { + if (sscanf(val, "%d", &nst) <= 0) + { + nst = 1; + } + if (fplog) + { + fprintf(fplog, "Found env.var. 
%s = %s, using value %d\n", + env_var, val, nst); + } + } + + return nst; +} + +static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string) +{ + if (MASTER(cr)) + { + fprintf(stderr, "\n%s\n", warn_string); + } + if (fplog) + { + fprintf(fplog, "\n%s\n", warn_string); + } +} + +static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd, + t_inputrec *ir, FILE *fplog) +{ + if (ir->ePBC == epbcSCREW && + (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1)) + { + gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]); + } + + if (ir->ns_type == ensSIMPLE) + { + gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition"); + } + + if (ir->nstlist == 0) + { + gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0"); + } + + if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE) + { + dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary"); + } +} + +static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox) +{ + int di, d; + real r; + + r = ddbox->box_size[XX]; + for (di = 0; di < dd->ndim; di++) + { + d = dd->dim[di]; + /* Check using the initial average cell size */ + r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]); + } + + return r; +} + +static int check_dlb_support(FILE *fplog, t_commrec *cr, + const char *dlb_opt, gmx_bool bRecordLoad, + unsigned long Flags, t_inputrec *ir) +{ + gmx_domdec_t *dd; + int eDLB = -1; + char buf[STRLEN]; + + switch (dlb_opt[0]) + { + case 'a': eDLB = edlbAUTO; break; + case 'n': eDLB = edlbNO; break; + case 'y': eDLB = edlbYES; break; + default: gmx_incons("Unknown dlb_opt"); + } + + if (Flags & MD_RERUN) + { + return edlbNO; + } + + if (!EI_DYNAMICS(ir->eI)) + { + if (eDLB == edlbYES) + { + sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI)); + dd_warning(cr, fplog, buf); + } + + return edlbNO; + } + + if (!bRecordLoad) + { + dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n"); + + return edlbNO; + } + + if (Flags & MD_REPRODUCIBLE) + { + switch (eDLB) + { + case edlbNO: + break; + case edlbAUTO: + dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n"); + eDLB = edlbNO; + break; + case edlbYES: + dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n"); + break; + default: + gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB); + break; + } + } + + return eDLB; +} + +static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd) +{ + int dim; + + dd->ndim = 0; + if (getenv("GMX_DD_ORDER_ZYX") != NULL) + { + /* Decomposition order z,y,x */ + if (fplog) + { + fprintf(fplog, "Using domain decomposition order z, y, x\n"); + } + for (dim = DIM-1; dim >= 0; dim--) + { + if (dd->nc[dim] > 1) + { + dd->dim[dd->ndim++] = dim; + } + } + } + else + { + /* Decomposition order x,y,z */ + for (dim = 0; dim < DIM; dim++) + { + if (dd->nc[dim] > 1) + { + dd->dim[dd->ndim++] = dim; + } + } + } +} + +static gmx_domdec_comm_t *init_dd_comm() +{ + gmx_domdec_comm_t *comm; + int i; + + snew(comm, 1); + snew(comm->cggl_flag, DIM*2); + snew(comm->cgcm_state, DIM*2); + for (i = 0; i < DIM*2; i++) + { + comm->cggl_flag_nalloc[i] = 0; + 
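        /* The DIM*2 slots pair up per decomposition dimension and direction:
         * slot d*2 is the buffer towards the forward neighbor and slot
         * d*2 + 1 the one towards the backward neighbor, matching the
         * mc = d*2 and mc = d*2 + 1 indexing in dd_redistribute_cg() above.
         */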
comm->cgcm_state_nalloc[i] = 0; + } + + comm->nalloc_int = 0; + comm->buf_int = NULL; + + vec_rvec_init(&comm->vbuf); + + comm->n_load_have = 0; + comm->n_load_collect = 0; + + for (i = 0; i < ddnatNR-ddnatZONE; i++) + { + comm->sum_nat[i] = 0; + } + comm->ndecomp = 0; + comm->nload = 0; + comm->load_step = 0; + comm->load_sum = 0; + comm->load_max = 0; + clear_ivec(comm->load_lim); + comm->load_mdf = 0; + comm->load_pme = 0; + + return comm; +} + +gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, + unsigned long Flags, + ivec nc, + real comm_distance_min, real rconstr, + const char *dlb_opt, real dlb_scale, + const char *sizex, const char *sizey, const char *sizez, + gmx_mtop_t *mtop, t_inputrec *ir, + matrix box, rvec *x, + gmx_ddbox_t *ddbox, + int *npme_x, int *npme_y) +{ + gmx_domdec_t *dd; + gmx_domdec_comm_t *comm; + int recload; + int d, i, j; + real r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs; + gmx_bool bC; + char buf[STRLEN]; + + if (fplog) + { + fprintf(fplog, + "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes); + } + + snew(dd, 1); + + dd->comm = init_dd_comm(); + comm = dd->comm; + snew(comm->cggl_flag, DIM*2); + snew(comm->cgcm_state, DIM*2); + + dd->npbcdim = ePBC2npbcdim(ir->ePBC); + dd->bScrewPBC = (ir->ePBC == epbcSCREW); + + dd->bSendRecv2 = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0); + comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10); + comm->eFlop = dd_nst_env(fplog, "GMX_DLB_FLOP", 0); + recload = dd_nst_env(fplog, "GMX_DD_LOAD", 1); + comm->nstSortCG = dd_nst_env(fplog, "GMX_DD_SORT", 1); + comm->nstDDDump = dd_nst_env(fplog, "GMX_DD_DUMP", 0); + comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0); + comm->DD_debug = dd_nst_env(fplog, "GMX_DD_DEBUG", 0); + + dd->pme_recv_f_alloc = 0; + dd->pme_recv_f_buf = NULL; + + if (dd->bSendRecv2 && fplog) + { + fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n"); + } + if (comm->eFlop) + { + if (fplog) + { + fprintf(fplog, "Will load balance based on FLOP count\n"); + } + if (comm->eFlop > 1) + { + srand(1+cr->nodeid); + } + comm->bRecordLoad = TRUE; + } + else + { + comm->bRecordLoad = (wallcycle_have_counter() && recload > 0); + + } + + comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir); + + comm->bDynLoadBal = (comm->eDLB == edlbYES); + if (fplog) + { + fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]); + } + dd->bGridJump = comm->bDynLoadBal; + comm->bPMELoadBalDLBLimits = FALSE; + + if (comm->nstSortCG) + { + if (fplog) + { + if (comm->nstSortCG == 1) + { + fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n"); + } + else + { + fprintf(fplog, "Will sort the charge groups every %d steps\n", + comm->nstSortCG); + } + } + snew(comm->sort, 1); + } + else + { + if (fplog) + { + fprintf(fplog, "Will not sort the charge groups\n"); + } + } + + comm->bCGs = (ncg_mtop(mtop) < mtop->natoms); + + comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr); + if (comm->bInterCGBondeds) + { + comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0); + } + else + { + comm->bInterCGMultiBody = FALSE; + } + + dd->bInterCGcons = inter_charge_group_constraints(mtop); + dd->bInterCGsettles = inter_charge_group_settles(mtop); + + if (ir->rlistlong == 0) + { + /* Set the cut-off to some very large value, + * so we don't need if statements everywhere in the code. 
+ * We use sqrt, since the cut-off is squared in some places. + */ + comm->cutoff = GMX_CUTOFF_INF; + } + else + { + comm->cutoff = ir->rlistlong; + } + comm->cutoff_mbody = 0; + + comm->cellsize_limit = 0; + comm->bBondComm = FALSE; + + if (comm->bInterCGBondeds) + { + if (comm_distance_min > 0) + { + comm->cutoff_mbody = comm_distance_min; + if (Flags & MD_DDBONDCOMM) + { + comm->bBondComm = (comm->cutoff_mbody > comm->cutoff); + } + else + { + comm->cutoff = max(comm->cutoff, comm->cutoff_mbody); + } + r_bonded_limit = comm->cutoff_mbody; + } + else if (ir->bPeriodicMols) + { + /* Can not easily determine the required cut-off */ + dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n"); + comm->cutoff_mbody = comm->cutoff/2; + r_bonded_limit = comm->cutoff_mbody; + } + else + { + if (MASTER(cr)) + { + dd_bonded_cg_distance(fplog, dd, mtop, ir, x, box, + Flags & MD_DDBONDCHECK, &r_2b, &r_mb); + } + gmx_bcast(sizeof(r_2b), &r_2b, cr); + gmx_bcast(sizeof(r_mb), &r_mb, cr); + + /* We use an initial margin of 10% for the minimum cell size, + * except when we are just below the non-bonded cut-off. + */ + if (Flags & MD_DDBONDCOMM) + { + if (max(r_2b, r_mb) > comm->cutoff) + { + r_bonded = max(r_2b, r_mb); + r_bonded_limit = 1.1*r_bonded; + comm->bBondComm = TRUE; + } + else + { + r_bonded = r_mb; + r_bonded_limit = min(1.1*r_bonded, comm->cutoff); + } + /* We determine cutoff_mbody later */ + } + else + { + /* No special bonded communication, + * simply increase the DD cut-off. + */ + r_bonded_limit = 1.1*max(r_2b, r_mb); + comm->cutoff_mbody = r_bonded_limit; + comm->cutoff = max(comm->cutoff, comm->cutoff_mbody); + } + } + comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit); + if (fplog) + { + fprintf(fplog, + "Minimum cell size due to bonded interactions: %.3f nm\n", + comm->cellsize_limit); + } + } + + if (dd->bInterCGcons && rconstr <= 0) + { + /* There is a cell size limit due to the constraints (P-LINCS) */ + rconstr = constr_r_max(fplog, mtop, ir); + if (fplog) + { + fprintf(fplog, + "Estimated maximum distance required for P-LINCS: %.3f nm\n", + rconstr); + if (rconstr > comm->cellsize_limit) + { + fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n"); + } + } + } + else if (rconstr > 0 && fplog) + { + /* Here we do not check for dd->bInterCGcons, + * because one can also set a cell size limit for virtual sites only + * and at this point we don't know yet if there are intercg v-sites. 
+ */ + fprintf(fplog, + "User supplied maximum distance required for P-LINCS: %.3f nm\n", + rconstr); + } + comm->cellsize_limit = max(comm->cellsize_limit, rconstr); + + comm->cgs_gl = gmx_mtop_global_cgs(mtop); + + if (nc[XX] > 0) + { + copy_ivec(nc, dd->nc); + set_dd_dim(fplog, dd); + set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox); + + if (cr->npmenodes == -1) + { + cr->npmenodes = 0; + } + acs = average_cellsize_min(dd, ddbox); + if (acs < comm->cellsize_limit) + { + if (fplog) + { + fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit); + } + gmx_fatal_collective(FARGS, cr, NULL, + "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details", + acs, comm->cellsize_limit); + } + } + else + { + set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox); + + /* We need to choose the optimal DD grid and possibly PME nodes */ + limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox, + comm->eDLB != edlbNO, dlb_scale, + comm->cellsize_limit, comm->cutoff, + comm->bInterCGBondeds, comm->bInterCGMultiBody); + + if (dd->nc[XX] == 0) + { + bC = (dd->bInterCGcons && rconstr > r_bonded_limit); + sprintf(buf, "Change the number of nodes or mdrun option %s%s%s", + !bC ? "-rdd" : "-rcon", + comm->eDLB != edlbNO ? " or -dds" : "", + bC ? " or your LINCS settings" : ""); + + gmx_fatal_collective(FARGS, cr, NULL, + "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n" + "%s\n" + "Look in the log file for details on the domain decomposition", + cr->nnodes-cr->npmenodes, limit, buf); + } + set_dd_dim(fplog, dd); + } + + if (fplog) + { + fprintf(fplog, + "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n", + dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes); + } + + dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ]; + if (cr->nnodes - dd->nnodes != cr->npmenodes) + { + gmx_fatal_collective(FARGS, cr, NULL, + "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d", + dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes); + } + if (cr->npmenodes > dd->nnodes) + { + gmx_fatal_collective(FARGS, cr, NULL, + "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes); + } + if (cr->npmenodes > 0) + { + comm->npmenodes = cr->npmenodes; + } + else + { + comm->npmenodes = dd->nnodes; + } + + if (EEL_PME(ir->coulombtype)) + { + /* The following choices should match those + * in comm_cost_est in domdec_setup.c. + * Note that here the checks have to take into account + * that the decomposition might occur in a different order than xyz + * (for instance through the env.var. GMX_DD_ORDER_ZYX), + * in which case they will not match those in comm_cost_est, + * but since that is mainly for testing purposes that's fine. + */ + if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY && + comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 && + getenv("GMX_PMEONEDD") == NULL) + { + comm->npmedecompdim = 2; + comm->npmenodes_x = dd->nc[XX]; + comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x; + } + else + { + /* In case nc is 1 in both x and y we could still choose to + * decompose pme in y instead of x, but we use x for simplicity. 
+             */
+            comm->npmedecompdim = 1;
+            if (dd->dim[0] == YY)
+            {
+                comm->npmenodes_x = 1;
+                comm->npmenodes_y = comm->npmenodes;
+            }
+            else
+            {
+                comm->npmenodes_x = comm->npmenodes;
+                comm->npmenodes_y = 1;
+            }
+        }
+        if (fplog)
+        {
+            fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
+                    comm->npmenodes_x, comm->npmenodes_y, 1);
+        }
+    }
+    else
+    {
+        comm->npmedecompdim = 0;
+        comm->npmenodes_x   = 0;
+        comm->npmenodes_y   = 0;
+    }
+
+    /* Technically we don't need both of these,
+     * but it simplifies code not having to recalculate it.
+     */
+    *npme_x = comm->npmenodes_x;
+    *npme_y = comm->npmenodes_y;
+
+    snew(comm->slb_frac, DIM);
+    if (comm->eDLB == edlbNO)
+    {
+        comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
+        comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
+        comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
+    }
+
+    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
+    {
+        if (comm->bBondComm || comm->eDLB != edlbNO)
+        {
+            /* Set the bonded communication distance to halfway
+             * the minimum and the maximum,
+             * since the extra communication cost is nearly zero.
+             */
+            acs                = average_cellsize_min(dd, ddbox);
+            comm->cutoff_mbody = 0.5*(r_bonded + acs);
+            if (comm->eDLB != edlbNO)
+            {
+                /* Check if this does not limit the scaling */
+                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
+            }
+            if (!comm->bBondComm)
+            {
+                /* Without bBondComm do not go beyond the n.b. cut-off */
+                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
+                if (comm->cellsize_limit >= comm->cutoff)
+                {
+                    /* We don't lose a lot of efficiency
+                     * when increasing it to the n.b. cut-off.
+                     * It can even be slightly faster, because we need
+                     * fewer checks for the communication setup.
+                     */
+                    comm->cutoff_mbody = comm->cutoff;
+                }
+            }
+            /* Check if we did not end up below our original limit */
+            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
+
+            if (comm->cutoff_mbody > comm->cellsize_limit)
+            {
+                comm->cellsize_limit = comm->cutoff_mbody;
+            }
+        }
+        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
+    }
+
+    if (debug)
+    {
+        fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
+                "cellsize limit %f\n",
+                comm->bBondComm, comm->cellsize_limit);
+    }
+
+    if (MASTER(cr))
+    {
+        check_dd_restrictions(cr, dd, ir, fplog);
+    }
+
+    comm->partition_step = INT_MIN;
+    dd->ddp_count        = 0;
+
+    clear_dd_cycle_counts(dd);
+
+    return dd;
+}
+
+static void set_dlb_limits(gmx_domdec_t *dd)
+{
+    int d;
+
+    for (d = 0; d < dd->ndim; d++)
+    {
+        dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
+        dd->comm->cellsize_min[dd->dim[d]] =
+            dd->comm->cellsize_min_dlb[dd->dim[d]];
+    }
+}
+
+
+static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
+{
+    gmx_domdec_t      *dd;
+    gmx_domdec_comm_t *comm;
+    real               cellsize_min;
+    int                d, nc, i;
+    char               buf[STRLEN];
+
+    dd   = cr->dd;
+    comm = dd->comm;
+
+    if (fplog)
+    {
+        fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
+    }
+
+    cellsize_min = comm->cellsize_min[dd->dim[0]];
+    for (d = 1; d < dd->ndim; d++)
+    {
+        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
+    }
+
+    if (cellsize_min < comm->cellsize_limit*1.05)
+    {
+        dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
+
+        /* Change DLB from "auto" to "no".
*/ + comm->eDLB = edlbNO; + + return; + } + + dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n"); + comm->bDynLoadBal = TRUE; + dd->bGridJump = TRUE; + + set_dlb_limits(dd); + + /* We can set the required cell size info here, + * so we do not need to communicate this. + * The grid is completely uniform. + */ + for (d = 0; d < dd->ndim; d++) + { + if (comm->root[d]) + { + comm->load[d].sum_m = comm->load[d].sum; + + nc = dd->nc[dd->dim[d]]; + for (i = 0; i < nc; i++) + { + comm->root[d]->cell_f[i] = i/(real)nc; + if (d > 0) + { + comm->root[d]->cell_f_max0[i] = i /(real)nc; + comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc; + } + } + comm->root[d]->cell_f[nc] = 1.0; + } + } +} + +static char *init_bLocalCG(gmx_mtop_t *mtop) +{ + int ncg, cg; + char *bLocalCG; + + ncg = ncg_mtop(mtop); + snew(bLocalCG, ncg); + for (cg = 0; cg < ncg; cg++) + { + bLocalCG[cg] = FALSE; + } + + return bLocalCG; +} + +void dd_init_bondeds(FILE *fplog, + gmx_domdec_t *dd, gmx_mtop_t *mtop, + gmx_vsite_t *vsite, gmx_constr_t constr, + t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb) +{ + gmx_domdec_comm_t *comm; + gmx_bool bBondComm; + int d; + + dd_make_reverse_top(fplog, dd, mtop, vsite, constr, ir, bBCheck); + + comm = dd->comm; + + if (comm->bBondComm) + { + /* Communicate atoms beyond the cut-off for bonded interactions */ + comm = dd->comm; + + comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb); + + comm->bLocalCG = init_bLocalCG(mtop); + } + else + { + /* Only communicate atoms based on cut-off */ + comm->cglink = NULL; + comm->bLocalCG = NULL; + } +} + +static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd, + t_inputrec *ir, + gmx_bool bDynLoadBal, real dlb_scale, + gmx_ddbox_t *ddbox) +{ + gmx_domdec_comm_t *comm; + int d; + ivec np; + real limit, shrink; + char buf[64]; + + if (fplog == NULL) + { + return; + } + + comm = dd->comm; + + if (bDynLoadBal) + { + fprintf(fplog, "The maximum number of communication pulses is:"); + for (d = 0; d < dd->ndim; d++) + { + fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb); + } + fprintf(fplog, "\n"); + fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit); + fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale); + fprintf(fplog, "The allowed shrink of domain decomposition cells is:"); + for (d = 0; d < DIM; d++) + { + if (dd->nc[d] > 1) + { + if (d >= ddbox->npbcdim && dd->nc[d] == 2) + { + shrink = 0; + } + else + { + shrink = + comm->cellsize_min_dlb[d]/ + (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]); + } + fprintf(fplog, " %c %.2f", dim2char(d), shrink); + } + } + fprintf(fplog, "\n"); + } + else + { + set_dd_cell_sizes_slb(dd, ddbox, FALSE, np); + fprintf(fplog, "The initial number of communication pulses is:"); + for (d = 0; d < dd->ndim; d++) + { + fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]); + } + fprintf(fplog, "\n"); + fprintf(fplog, "The initial domain decomposition cell size is:"); + for (d = 0; d < DIM; d++) + { + if (dd->nc[d] > 1) + { + fprintf(fplog, " %c %.2f nm", + dim2char(d), dd->comm->cellsize_min[d]); + } + } + fprintf(fplog, "\n\n"); + } + + if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm) + { + fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n"); + fprintf(fplog, "%40s %-7s %6.3f nm\n", + "non-bonded interactions", "", comm->cutoff); + + if (bDynLoadBal) + { + limit = dd->comm->cellsize_limit; + } + else + { + if 
(dynamic_dd_box(ddbox, ir)) + { + fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n"); + } + limit = dd->comm->cellsize_min[XX]; + for (d = 1; d < DIM; d++) + { + limit = min(limit, dd->comm->cellsize_min[d]); + } + } + + if (comm->bInterCGBondeds) + { + fprintf(fplog, "%40s %-7s %6.3f nm\n", + "two-body bonded interactions", "(-rdd)", + max(comm->cutoff, comm->cutoff_mbody)); + fprintf(fplog, "%40s %-7s %6.3f nm\n", + "multi-body bonded interactions", "(-rdd)", + (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit)); + } + if (dd->vsite_comm) + { + fprintf(fplog, "%40s %-7s %6.3f nm\n", + "virtual site constructions", "(-rcon)", limit); + } + if (dd->constraint_comm) + { + sprintf(buf, "atoms separated by up to %d constraints", + 1+ir->nProjOrder); + fprintf(fplog, "%40s %-7s %6.3f nm\n", + buf, "(-rcon)", limit); + } + fprintf(fplog, "\n"); + } + + fflush(fplog); +} + +static void set_cell_limits_dlb(gmx_domdec_t *dd, + real dlb_scale, + const t_inputrec *ir, + const gmx_ddbox_t *ddbox) +{ + gmx_domdec_comm_t *comm; + int d, dim, npulse, npulse_d_max, npulse_d; + gmx_bool bNoCutOff; + + comm = dd->comm; + + bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0); + + /* Determine the maximum number of comm. pulses in one dimension */ + + comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody); + + /* Determine the maximum required number of grid pulses */ + if (comm->cellsize_limit >= comm->cutoff) + { + /* Only a single pulse is required */ + npulse = 1; + } + else if (!bNoCutOff && comm->cellsize_limit > 0) + { + /* We round down slightly here to avoid overhead due to the latency + * of extra communication calls when the cut-off + * would be only slightly longer than the cell size. + * Later cellsize_limit is redetermined, + * so we can not miss interactions due to this rounding. 
+ */ + npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit); + } + else + { + /* There is no cell size limit */ + npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1)); + } + + if (!bNoCutOff && npulse > 1) + { + /* See if we can do with less pulses, based on dlb_scale */ + npulse_d_max = 0; + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff + /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale)); + npulse_d_max = max(npulse_d_max, npulse_d); + } + npulse = min(npulse, npulse_d_max); + } + + /* This env var can override npulse */ + d = dd_nst_env(debug, "GMX_DD_NPULSE", 0); + if (d > 0) + { + npulse = d; + } + + comm->maxpulse = 1; + comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE); + for (d = 0; d < dd->ndim; d++) + { + comm->cd[d].np_dlb = min(npulse, dd->nc[dd->dim[d]]-1); + comm->cd[d].np_nalloc = comm->cd[d].np_dlb; + snew(comm->cd[d].ind, comm->cd[d].np_nalloc); + comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb); + if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1) + { + comm->bVacDLBNoLimit = FALSE; + } + } + + /* cellsize_limit is set for LINCS in init_domain_decomposition */ + if (!comm->bVacDLBNoLimit) + { + comm->cellsize_limit = max(comm->cellsize_limit, + comm->cutoff/comm->maxpulse); + } + comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody); + /* Set the minimum cell size for each DD dimension */ + for (d = 0; d < dd->ndim; d++) + { + if (comm->bVacDLBNoLimit || + comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff) + { + comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit; + } + else + { + comm->cellsize_min_dlb[dd->dim[d]] = + comm->cutoff/comm->cd[d].np_dlb; + } + } + if (comm->cutoff_mbody <= 0) + { + comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit); + } + if (comm->bDynLoadBal) + { + set_dlb_limits(dd); + } +} + +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC) +{ + /* If each molecule is a single charge group + * or we use domain decomposition for each periodic dimension, + * we do not need to take pbc into account for the bonded interactions. + */ + return (ePBC != epbcNONE && dd->comm->bInterCGBondeds && + !(dd->nc[XX] > 1 && + dd->nc[YY] > 1 && + (dd->nc[ZZ] > 1 || ePBC == epbcXY))); +} + +void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale, + t_inputrec *ir, t_forcerec *fr, + gmx_ddbox_t *ddbox) +{ + gmx_domdec_comm_t *comm; + int natoms_tot; + real vol_frac; + + comm = dd->comm; + + /* Initialize the thread data. + * This can not be done in init_domain_decomposition, + * as the numbers of threads is determined later. 
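The pulse-count estimate in set_cell_limits_dlb() above rounds cutoff/cellsize_limit up, but with a 0.96 rather than a 1.0 offset, so a cut-off only marginally longer than a multiple of the cell size does not cost an extra communication pulse. A minimal sketch of that formula with illustrative numbers (not the GROMACS API):

#include <stdio.h>

/* Sketch: number of neighbor-cell pulses so that npulse*cellsize
 * covers the cut-off; the 0.96 offset rounds down slightly when the
 * cut-off barely exceeds a multiple of the cell size. */
static int estimate_npulse(double cutoff, double cellsize_limit)
{
    if (cellsize_limit >= cutoff)
    {
        return 1; /* a single pulse always suffices */
    }
    return (int)(0.96 + cutoff/cellsize_limit);
}

int main(void)
{
    printf("%d\n", estimate_npulse(1.2, 1.3)); /* 1: cell larger than cut-off */
    printf("%d\n", estimate_npulse(1.2, 0.7)); /* 2 */
    printf("%d\n", estimate_npulse(1.4, 0.7)); /* 2, not 3: 0.96 avoids an extra pulse */
    return 0;
}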
+ */ + comm->nth = gmx_omp_nthreads_get(emntDomdec); + if (comm->nth > 1) + { + snew(comm->dth, comm->nth); + } + + if (EEL_PME(ir->coulombtype)) + { + init_ddpme(dd, &comm->ddpme[0], 0); + if (comm->npmedecompdim >= 2) + { + init_ddpme(dd, &comm->ddpme[1], 1); + } + } + else + { + comm->npmenodes = 0; + if (dd->pme_nodeid >= 0) + { + gmx_fatal_collective(FARGS, NULL, dd, + "Can not have separate PME nodes without PME electrostatics"); + } + } + + if (debug) + { + fprintf(debug, "The DD cut-off is %f\n", comm->cutoff); + } + if (comm->eDLB != edlbNO) + { + set_cell_limits_dlb(dd, dlb_scale, ir, ddbox); + } + + print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox); + if (comm->eDLB == edlbAUTO) + { + if (fplog) + { + fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n"); + } + print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox); + } + + if (ir->ePBC == epbcNONE) + { + vol_frac = 1 - 1/(double)dd->nnodes; + } + else + { + vol_frac = + (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes; + } + if (debug) + { + fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac); + } + natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr]; + + dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot); +} + +static gmx_bool test_dd_cutoff(t_commrec *cr, + t_state *state, t_inputrec *ir, + real cutoff_req) +{ + gmx_domdec_t *dd; + gmx_ddbox_t ddbox; + int d, dim, np; + real inv_cell_size; + int LocallyLimited; + + dd = cr->dd; + + set_ddbox(dd, FALSE, cr, ir, state->box, + TRUE, &dd->comm->cgs_gl, state->x, &ddbox); + + LocallyLimited = 0; + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + + inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim]; + if (dynamic_dd_box(&ddbox, ir)) + { + inv_cell_size *= DD_PRES_SCALE_MARGIN; + } + + np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]); + + if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim && + dd->comm->cd[d].np_dlb > 0) + { + if (np > dd->comm->cd[d].np_dlb) + { + return FALSE; + } + + /* If a current local cell size is smaller than the requested + * cut-off, we could still fix it, but this gets very complicated. + * Without fixing here, we might actually need more checks. + */ + if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req) + { + LocallyLimited = 1; + } + } + } + + if (dd->comm->eDLB != edlbNO) + { + /* If DLB is not active yet, we don't need to check the grid jumps. + * Actually we shouldn't, because then the grid jump data is not set. 
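test_dd_cutoff() above accepts a requested cut-off only if the number of pulses it would need fits within the pulses allocated for DLB. A simplified sketch of that per-dimension test, with the margin and skew factors dropped (hypothetical names, illustrative values; not the actual GROMACS call):

#include <stdio.h>

/* Sketch: a cut-off request is representable when the pulses it needs
 * do not exceed the np_dlb pulses allocated per dimension. */
static int cutoff_fits(double cutoff_req, double cell_size, int np_dlb)
{
    int np = 1 + (int)(cutoff_req/cell_size); /* pulses needed */

    return np <= np_dlb;
}

int main(void)
{
    printf("%d\n", cutoff_fits(1.8, 1.0, 2)); /* 1: needs 2 pulses, 2 allowed */
    printf("%d\n", cutoff_fits(2.2, 1.0, 2)); /* 0: needs 3 pulses */
    return 0;
}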
+ */ + if (dd->comm->bDynLoadBal && + check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE)) + { + LocallyLimited = 1; + } + + gmx_sumi(1, &LocallyLimited, cr); + + if (LocallyLimited > 0) + { + return FALSE; + } + } + + return TRUE; +} + +gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir, + real cutoff_req) +{ + gmx_bool bCutoffAllowed; + + bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req); + + if (bCutoffAllowed) + { + cr->dd->comm->cutoff = cutoff_req; + } + + return bCutoffAllowed; +} + +void change_dd_dlb_cutoff_limit(t_commrec *cr) +{ + gmx_domdec_comm_t *comm; + + comm = cr->dd->comm; + + /* Turn on the DLB limiting (might have been on already) */ + comm->bPMELoadBalDLBLimits = TRUE; + + /* Change the cut-off limit */ + comm->PMELoadBal_max_cutoff = comm->cutoff; +} + +static void merge_cg_buffers(int ncell, + gmx_domdec_comm_dim_t *cd, int pulse, + int *ncg_cell, + int *index_gl, int *recv_i, + rvec *cg_cm, rvec *recv_vr, + int *cgindex, + cginfo_mb_t *cginfo_mb, int *cginfo) +{ + gmx_domdec_ind_t *ind, *ind_p; + int p, cell, c, cg, cg0, cg1, cg_gl, nat; + int shift, shift_at; + + ind = &cd->ind[pulse]; + + /* First correct the already stored data */ + shift = ind->nrecv[ncell]; + for (cell = ncell-1; cell >= 0; cell--) + { + shift -= ind->nrecv[cell]; + if (shift > 0) + { + /* Move the cg's present from previous grid pulses */ + cg0 = ncg_cell[ncell+cell]; + cg1 = ncg_cell[ncell+cell+1]; + cgindex[cg1+shift] = cgindex[cg1]; + for (cg = cg1-1; cg >= cg0; cg--) + { + index_gl[cg+shift] = index_gl[cg]; + copy_rvec(cg_cm[cg], cg_cm[cg+shift]); + cgindex[cg+shift] = cgindex[cg]; + cginfo[cg+shift] = cginfo[cg]; + } + /* Correct the already stored send indices for the shift */ + for (p = 1; p <= pulse; p++) + { + ind_p = &cd->ind[p]; + cg0 = 0; + for (c = 0; c < cell; c++) + { + cg0 += ind_p->nsend[c]; + } + cg1 = cg0 + ind_p->nsend[cell]; + for (cg = cg0; cg < cg1; cg++) + { + ind_p->index[cg] += shift; + } + } + } + } + + /* Merge in the communicated buffers */ + shift = 0; + shift_at = 0; + cg0 = 0; + for (cell = 0; cell < ncell; cell++) + { + cg1 = ncg_cell[ncell+cell+1] + shift; + if (shift_at > 0) + { + /* Correct the old cg indices */ + for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++) + { + cgindex[cg+1] += shift_at; + } + } + for (cg = 0; cg < ind->nrecv[cell]; cg++) + { + /* Copy this charge group from the buffer */ + index_gl[cg1] = recv_i[cg0]; + copy_rvec(recv_vr[cg0], cg_cm[cg1]); + /* Add it to the cgindex */ + cg_gl = index_gl[cg1]; + cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl); + nat = GET_CGINFO_NATOMS(cginfo[cg1]); + cgindex[cg1+1] = cgindex[cg1] + nat; + cg0++; + cg1++; + shift_at += nat; + } + shift += ind->nrecv[cell]; + ncg_cell[ncell+cell+1] = cg1; + } +} + +static void make_cell2at_index(gmx_domdec_comm_dim_t *cd, + int nzone, int cg0, const int *cgindex) +{ + int cg, zone, p; + + /* Store the atom block boundaries for easy copying of communication buffers + */ + cg = cg0; + for (zone = 0; zone < nzone; zone++) + { + for (p = 0; p < cd->np; p++) + { + cd->ind[p].cell2at0[zone] = cgindex[cg]; + cg += cd->ind[p].nrecv[zone]; + cd->ind[p].cell2at1[zone] = cgindex[cg]; + } + } +} + +static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG) +{ + int i; + gmx_bool bMiss; + + bMiss = FALSE; + for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++) + { + if (!bLocalCG[link->a[i]]) + { + bMiss = TRUE; + } + } + + return bMiss; +} + +/* Domain corners for communication, a maximum of 4 i-zones see a j domain */ +typedef struct { + 
real c[DIM][4]; /* the corners for the non-bonded communication */ + real cr0; /* corner for rounding */ + real cr1[4]; /* corners for rounding */ + real bc[DIM]; /* corners for bounded communication */ + real bcr1; /* corner for rounding for bonded communication */ +} dd_corners_t; + +/* Determine the corners of the domain(s) we are communicating with */ +static void +set_dd_corners(const gmx_domdec_t *dd, + int dim0, int dim1, int dim2, + gmx_bool bDistMB, + dd_corners_t *c) +{ + const gmx_domdec_comm_t *comm; + const gmx_domdec_zones_t *zones; + int i, j; + + comm = dd->comm; + + zones = &comm->zones; + + /* Keep the compiler happy */ + c->cr0 = 0; + c->bcr1 = 0; + + /* The first dimension is equal for all cells */ + c->c[0][0] = comm->cell_x0[dim0]; + if (bDistMB) + { + c->bc[0] = c->c[0][0]; + } + if (dd->ndim >= 2) + { + dim1 = dd->dim[1]; + /* This cell row is only seen from the first row */ + c->c[1][0] = comm->cell_x0[dim1]; + /* All rows can see this row */ + c->c[1][1] = comm->cell_x0[dim1]; + if (dd->bGridJump) + { + c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0); + if (bDistMB) + { + /* For the multi-body distance we need the maximum */ + c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0); + } + } + /* Set the upper-right corner for rounding */ + c->cr0 = comm->cell_x1[dim0]; + + if (dd->ndim >= 3) + { + dim2 = dd->dim[2]; + for (j = 0; j < 4; j++) + { + c->c[2][j] = comm->cell_x0[dim2]; + } + if (dd->bGridJump) + { + /* Use the maximum of the i-cells that see a j-cell */ + for (i = 0; i < zones->nizone; i++) + { + for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++) + { + if (j >= 4) + { + c->c[2][j-4] = + max(c->c[2][j-4], + comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0); + } + } + } + if (bDistMB) + { + /* For the multi-body distance we need the maximum */ + c->bc[2] = comm->cell_x0[dim2]; + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0); + } + } + } + } + + /* Set the upper-right corner for rounding */ + /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1) + * Only cell (0,0,0) can see cell 7 (1,1,1) + */ + c->cr1[0] = comm->cell_x1[dim1]; + c->cr1[3] = comm->cell_x1[dim1]; + if (dd->bGridJump) + { + c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1); + if (bDistMB) + { + /* For the multi-body distance we need the maximum */ + c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1); + } + } + } + } +} + +/* Determine which cg's we need to send in this pulse from this zone */ +static void +get_zone_pulse_cgs(gmx_domdec_t *dd, + int zonei, int zone, + int cg0, int cg1, + const int *index_gl, + const int *cgindex, + int dim, int dim_ind, + int dim0, int dim1, int dim2, + real r_comm2, real r_bcomm2, + matrix box, + ivec tric_dist, + rvec *normal, + real skew_fac2_d, real skew_fac_01, + rvec *v_d, rvec *v_0, rvec *v_1, + const dd_corners_t *c, + rvec sf2_round, + gmx_bool bDistBonded, + gmx_bool bBondComm, + gmx_bool bDist2B, + gmx_bool bDistMB, + rvec *cg_cm, + int *cginfo, + gmx_domdec_ind_t *ind, + int **ibuf, int *ibuf_nalloc, + vec_rvec_t *vbuf, + int *nsend_ptr, + int *nat_ptr, + int *nsend_z_ptr) +{ + gmx_domdec_comm_t *comm; + gmx_bool bScrew; + gmx_bool bDistMB_pulse; + int cg, i; + real r2, rb2, r, tric_sh; + rvec rn, rb; + int dimd; + int nsend_z, nsend, nat; + + comm = dd->comm; + + bScrew = (dd->bScrewPBC && dim == XX); + + bDistMB_pulse = (bDistMB && bDistBonded); + + nsend_z = 0; + nsend = *nsend_ptr; + nat = *nat_ptr; + + for (cg = cg0; cg 
< cg1; cg++) + { + r2 = 0; + rb2 = 0; + if (tric_dist[dim_ind] == 0) + { + /* Rectangular direction, easy */ + r = cg_cm[cg][dim] - c->c[dim_ind][zone]; + if (r > 0) + { + r2 += r*r; + } + if (bDistMB_pulse) + { + r = cg_cm[cg][dim] - c->bc[dim_ind]; + if (r > 0) + { + rb2 += r*r; + } + } + /* Rounding gives at most a 16% reduction + * in communicated atoms + */ + if (dim_ind >= 1 && (zonei == 1 || zonei == 2)) + { + r = cg_cm[cg][dim0] - c->cr0; + /* This is the first dimension, so always r >= 0 */ + r2 += r*r; + if (bDistMB_pulse) + { + rb2 += r*r; + } + } + if (dim_ind == 2 && (zonei == 2 || zonei == 3)) + { + r = cg_cm[cg][dim1] - c->cr1[zone]; + if (r > 0) + { + r2 += r*r; + } + if (bDistMB_pulse) + { + r = cg_cm[cg][dim1] - c->bcr1; + if (r > 0) + { + rb2 += r*r; + } + } + } + } + else + { + /* Triclinic direction, more complicated */ + clear_rvec(rn); + clear_rvec(rb); + /* Rounding, conservative as the skew_fac multiplication + * will slightly underestimate the distance. + */ + if (dim_ind >= 1 && (zonei == 1 || zonei == 2)) + { + rn[dim0] = cg_cm[cg][dim0] - c->cr0; + for (i = dim0+1; i < DIM; i++) + { + rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0]; + } + r2 = rn[dim0]*rn[dim0]*sf2_round[dim0]; + if (bDistMB_pulse) + { + rb[dim0] = rn[dim0]; + rb2 = r2; + } + /* Take care that the cell planes along dim0 might not + * be orthogonal to those along dim1 and dim2. + */ + for (i = 1; i <= dim_ind; i++) + { + dimd = dd->dim[i]; + if (normal[dim0][dimd] > 0) + { + rn[dimd] -= rn[dim0]*normal[dim0][dimd]; + if (bDistMB_pulse) + { + rb[dimd] -= rb[dim0]*normal[dim0][dimd]; + } + } + } + } + if (dim_ind == 2 && (zonei == 2 || zonei == 3)) + { + rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone]; + tric_sh = 0; + for (i = dim1+1; i < DIM; i++) + { + tric_sh -= cg_cm[cg][i]*v_1[i][dim1]; + } + rn[dim1] += tric_sh; + if (rn[dim1] > 0) + { + r2 += rn[dim1]*rn[dim1]*sf2_round[dim1]; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. + */ + r2 -= rn[dim0]*rn[dim1]*skew_fac_01; + /* Take care that the cell planes along dim1 + * might not be orthogonal to that along dim2. + */ + if (normal[dim1][dim2] > 0) + { + rn[dim2] -= rn[dim1]*normal[dim1][dim2]; + } + } + if (bDistMB_pulse) + { + rb[dim1] += + cg_cm[cg][dim1] - c->bcr1 + tric_sh; + if (rb[dim1] > 0) + { + rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1]; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. + */ + rb2 -= rb[dim0]*rb[dim1]*skew_fac_01; + /* Take care that the cell planes along dim1 + * might not be orthogonal to that along dim2. + */ + if (normal[dim1][dim2] > 0) + { + rb[dim2] -= rb[dim1]*normal[dim1][dim2]; + } + } + } + } + /* The distance along the communication direction */ + rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone]; + tric_sh = 0; + for (i = dim+1; i < DIM; i++) + { + tric_sh -= cg_cm[cg][i]*v_d[i][dim]; + } + rn[dim] += tric_sh; + if (rn[dim] > 0) + { + r2 += rn[dim]*rn[dim]*skew_fac2_d; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. + */ + if (dim_ind == 1 && zonei == 1) + { + r2 -= rn[dim0]*rn[dim]*skew_fac_01; + } + } + if (bDistMB_pulse) + { + clear_rvec(rb); + rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh; + if (rb[dim] > 0) + { + rb2 += rb[dim]*rb[dim]*skew_fac2_d; + /* Take care of coupling of the distances + * to the planes along dim0 and dim1 through dim2. 
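The rectangular branch of get_zone_pulse_cgs() above accumulates a squared distance to the zone corner, counting only the coordinates that lie beyond each corner plane. A self-contained sketch of that test (hypothetical names; the 2-D values are made up):

#include <stdio.h>

/* Sketch of the rectangular corner test: only components beyond the
 * corner plane contribute to the squared distance, which is compared
 * against the squared communication cut-off. */
static int within_comm_range(const double x[3], const double corner[3],
                             int ndim, double r_comm2)
{
    double r2 = 0;
    int    d;

    for (d = 0; d < ndim; d++)
    {
        double r = x[d] - corner[d];

        if (r > 0)
        {
            r2 += r*r; /* this coordinate is beyond the corner plane */
        }
    }
    return r2 < r_comm2;
}

int main(void)
{
    double x[3]      = {1.1, 0.4, 0.0};
    double corner[3] = {1.0, 0.8, 0.0};

    /* only x contributes: 0.1^2 = 0.01 < 0.3^2 = 0.09, so communicate */
    printf("send: %d\n", within_comm_range(x, corner, 2, 0.3*0.3));
    return 0;
}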
+ */ + if (dim_ind == 1 && zonei == 1) + { + rb2 -= rb[dim0]*rb[dim]*skew_fac_01; + } + } + } + } + + if (r2 < r_comm2 || + (bDistBonded && + ((bDistMB && rb2 < r_bcomm2) || + (bDist2B && r2 < r_bcomm2)) && + (!bBondComm || + (GET_CGINFO_BOND_INTER(cginfo[cg]) && + missing_link(comm->cglink, index_gl[cg], + comm->bLocalCG))))) + { + /* Make an index to the local charge groups */ + if (nsend+1 > ind->nalloc) + { + ind->nalloc = over_alloc_large(nsend+1); + srenew(ind->index, ind->nalloc); + } + if (nsend+1 > *ibuf_nalloc) + { + *ibuf_nalloc = over_alloc_large(nsend+1); + srenew(*ibuf, *ibuf_nalloc); + } + ind->index[nsend] = cg; + (*ibuf)[nsend] = index_gl[cg]; + nsend_z++; + vec_rvec_check_alloc(vbuf, nsend+1); + + if (dd->ci[dim] == 0) + { + /* Correct cg_cm for pbc */ + rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]); + if (bScrew) + { + vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY]; + vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ]; + } + } + else + { + copy_rvec(cg_cm[cg], vbuf->v[nsend]); + } + nsend++; + nat += cgindex[cg+1] - cgindex[cg]; + } + } + + *nsend_ptr = nsend; + *nat_ptr = nat; + *nsend_z_ptr = nsend_z; +} + +static void setup_dd_communication(gmx_domdec_t *dd, + matrix box, gmx_ddbox_t *ddbox, + t_forcerec *fr, t_state *state, rvec **f) +{ + int dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot; + int nzone, nzone_send, zone, zonei, cg0, cg1; + int c, i, j, cg, cg_gl, nrcg; + int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i; + gmx_domdec_comm_t *comm; + gmx_domdec_zones_t *zones; + gmx_domdec_comm_dim_t *cd; + gmx_domdec_ind_t *ind; + cginfo_mb_t *cginfo_mb; + gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded; + real r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg; + dd_corners_t corners; + ivec tric_dist; + rvec *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr; + real skew_fac2_d, skew_fac_01; + rvec sf2_round; + int nsend, nat; + int th; + + if (debug) + { + fprintf(debug, "Setting up DD communication\n"); + } + + comm = dd->comm; + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + cg_cm = fr->cg_cm; + break; + case ecutsVERLET: + cg_cm = state->x; + break; + default: + gmx_incons("unimplemented"); + cg_cm = NULL; + } + + for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++) + { + dim = dd->dim[dim_ind]; + + /* Check if we need to use triclinic distances */ + tric_dist[dim_ind] = 0; + for (i = 0; i <= dim_ind; i++) + { + if (ddbox->tric_dir[dd->dim[i]]) + { + tric_dist[dim_ind] = 1; + } + } + } + + bBondComm = comm->bBondComm; + + /* Do we need to determine extra distances for multi-body bondeds? */ + bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1); + + /* Do we need to determine extra distances for only two-body bondeds? */ + bDist2B = (bBondComm && !bDistMB); + + r_comm2 = sqr(comm->cutoff); + r_bcomm2 = sqr(comm->cutoff_mbody); + + if (debug) + { + fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2)); + } + + zones = &comm->zones; + + dim0 = dd->dim[0]; + dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1); + dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1); + + set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners); + + /* Triclinic stuff */ + normal = ddbox->normal; + skew_fac_01 = 0; + if (dd->ndim >= 2) + { + v_0 = ddbox->v[dim0]; + if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1]) + { + /* Determine the coupling coefficient for the distances + * to the cell planes along dim0 and dim1 through dim2. + * This is required for correct rounding. 
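The send path above grows its index and coordinate buffers with over_alloc_large() + srenew() rather than by one element at a time. A sketch of that geometric-growth pattern in plain C (the 1.19 factor mirrors GROMACS's OVER_ALLOC_FAC, the additive constant here is made up, and error handling is omitted):

#include <stdio.h>
#include <stdlib.h>

/* Sketch: grow capacity geometrically so repeated appends amortize to
 * O(n) reallocation cost instead of O(n^2). */
static int *grow_ibuf(int *buf, int needed, int *nalloc)
{
    if (needed > *nalloc)
    {
        *nalloc = (int)(1.19*needed) + 100; /* geometric headroom */
        buf     = realloc(buf, *nalloc*sizeof(*buf));
    }
    return buf;
}

int main(void)
{
    int *buf = NULL, nalloc = 0, n;

    for (n = 1; n <= 1000; n++)
    {
        buf      = grow_ibuf(buf, n, &nalloc); /* reallocates only a few times */
        buf[n-1] = n;
    }
    printf("final capacity: %d\n", nalloc);
    free(buf);
    return 0;
}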
+ */ + skew_fac_01 = + ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1]; + if (debug) + { + fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01); + } + } + } + if (dd->ndim >= 3) + { + v_1 = ddbox->v[dim1]; + } + + zone_cg_range = zones->cg_range; + index_gl = dd->index_gl; + cgindex = dd->cgindex; + cginfo_mb = fr->cginfo_mb; + + zone_cg_range[0] = 0; + zone_cg_range[1] = dd->ncg_home; + comm->zone_ncg1[0] = dd->ncg_home; + pos_cg = dd->ncg_home; + + nat_tot = dd->nat_home; + nzone = 1; + for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++) + { + dim = dd->dim[dim_ind]; + cd = &comm->cd[dim_ind]; + + if (dim >= ddbox->npbcdim && dd->ci[dim] == 0) + { + /* No pbc in this dimension, the first node should not comm. */ + nzone_send = 0; + } + else + { + nzone_send = nzone; + } + + v_d = ddbox->v[dim]; + skew_fac2_d = sqr(ddbox->skew_fac[dim]); + + cd->bInPlace = TRUE; + for (p = 0; p < cd->np; p++) + { + /* Only atoms communicated in the first pulse are used + * for multi-body bonded interactions or for bBondComm. + */ + bDistBonded = ((bDistMB || bDist2B) && p == 0); + + ind = &cd->ind[p]; + nsend = 0; + nat = 0; + for (zone = 0; zone < nzone_send; zone++) + { + if (tric_dist[dim_ind] && dim_ind > 0) + { + /* Determine slightly more optimized skew_fac's + * for rounding. + * This reduces the number of communicated atoms + * by about 10% for 3D DD of rhombic dodecahedra. + */ + for (dimd = 0; dimd < dim; dimd++) + { + sf2_round[dimd] = 1; + if (ddbox->tric_dir[dimd]) + { + for (i = dd->dim[dimd]+1; i < DIM; i++) + { + /* If we are shifted in dimension i + * and the cell plane is tilted forward + * in dimension i, skip this coupling. + */ + if (!(zones->shift[nzone+zone][i] && + ddbox->v[dimd][i][dimd] >= 0)) + { + sf2_round[dimd] += + sqr(ddbox->v[dimd][i][dimd]); + } + } + sf2_round[dimd] = 1/sf2_round[dimd]; + } + } + } + + zonei = zone_perm[dim_ind][zone]; + if (p == 0) + { + /* Here we permutate the zones to obtain a convenient order + * for neighbor searching + */ + cg0 = zone_cg_range[zonei]; + cg1 = zone_cg_range[zonei+1]; + } + else + { + /* Look only at the cg's received in the previous grid pulse + */ + cg1 = zone_cg_range[nzone+zone+1]; + cg0 = cg1 - cd->ind[p-1].nrecv[zone]; + } + +#pragma omp parallel for num_threads(comm->nth) schedule(static) + for (th = 0; th < comm->nth; th++) + { + gmx_domdec_ind_t *ind_p; + int **ibuf_p, *ibuf_nalloc_p; + vec_rvec_t *vbuf_p; + int *nsend_p, *nat_p; + int *nsend_zone_p; + int cg0_th, cg1_th; + + if (th == 0) + { + /* Thread 0 writes in the comm buffers */ + ind_p = ind; + ibuf_p = &comm->buf_int; + ibuf_nalloc_p = &comm->nalloc_int; + vbuf_p = &comm->vbuf; + nsend_p = &nsend; + nat_p = &nat; + nsend_zone_p = &ind->nsend[zone]; + } + else + { + /* Other threads write into temp buffers */ + ind_p = &comm->dth[th].ind; + ibuf_p = &comm->dth[th].ibuf; + ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc; + vbuf_p = &comm->dth[th].vbuf; + nsend_p = &comm->dth[th].nsend; + nat_p = &comm->dth[th].nat; + nsend_zone_p = &comm->dth[th].nsend_zone; + + comm->dth[th].nsend = 0; + comm->dth[th].nat = 0; + comm->dth[th].nsend_zone = 0; + } + + if (comm->nth == 1) + { + cg0_th = cg0; + cg1_th = cg1; + } + else + { + cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth; + cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth; + } + + /* Get the cg's for this pulse in this zone */ + get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th, + index_gl, cgindex, + dim, dim_ind, dim0, dim1, dim2, + r_comm2, r_bcomm2, + box, tric_dist, + normal, skew_fac2_d, skew_fac_01, + v_d, v_0, v_1, 
&corners, sf2_round, + bDistBonded, bBondComm, + bDist2B, bDistMB, + cg_cm, fr->cginfo, + ind_p, + ibuf_p, ibuf_nalloc_p, + vbuf_p, + nsend_p, nat_p, + nsend_zone_p); + } + + /* Append data of threads>=1 to the communication buffers */ + for (th = 1; th < comm->nth; th++) + { + dd_comm_setup_work_t *dth; + int i, ns1; + + dth = &comm->dth[th]; + + ns1 = nsend + dth->nsend_zone; + if (ns1 > ind->nalloc) + { + ind->nalloc = over_alloc_dd(ns1); + srenew(ind->index, ind->nalloc); + } + if (ns1 > comm->nalloc_int) + { + comm->nalloc_int = over_alloc_dd(ns1); + srenew(comm->buf_int, comm->nalloc_int); + } + if (ns1 > comm->vbuf.nalloc) + { + comm->vbuf.nalloc = over_alloc_dd(ns1); + srenew(comm->vbuf.v, comm->vbuf.nalloc); + } + + for (i = 0; i < dth->nsend_zone; i++) + { + ind->index[nsend] = dth->ind.index[i]; + comm->buf_int[nsend] = dth->ibuf[i]; + copy_rvec(dth->vbuf.v[i], + comm->vbuf.v[nsend]); + nsend++; + } + nat += dth->nat; + ind->nsend[zone] += dth->nsend_zone; + } + } + /* Clear the counts in case we do not have pbc */ + for (zone = nzone_send; zone < nzone; zone++) + { + ind->nsend[zone] = 0; + } + ind->nsend[nzone] = nsend; + ind->nsend[nzone+1] = nat; + /* Communicate the number of cg's and atoms to receive */ + dd_sendrecv_int(dd, dim_ind, dddirBackward, + ind->nsend, nzone+2, + ind->nrecv, nzone+2); + + /* The rvec buffer is also required for atom buffers of size nsend + * in dd_move_x and dd_move_f. + */ + vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]); + + if (p > 0) + { + /* We can receive in place if only the last zone is not empty */ + for (zone = 0; zone < nzone-1; zone++) + { + if (ind->nrecv[zone] > 0) + { + cd->bInPlace = FALSE; + } + } + if (!cd->bInPlace) + { + /* The int buffer is only required here for the cg indices */ + if (ind->nrecv[nzone] > comm->nalloc_int2) + { + comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]); + srenew(comm->buf_int2, comm->nalloc_int2); + } + /* The rvec buffer is also required for atom buffers + * of size nrecv in dd_move_x and dd_move_f. + */ + i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]); + vec_rvec_check_alloc(&comm->vbuf2, i); + } + } + + /* Make space for the global cg indices */ + if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc + || dd->cg_nalloc == 0) + { + dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]); + srenew(index_gl, dd->cg_nalloc); + srenew(cgindex, dd->cg_nalloc+1); + } + /* Communicate the global cg indices */ + if (cd->bInPlace) + { + recv_i = index_gl + pos_cg; + } + else + { + recv_i = comm->buf_int2; + } + dd_sendrecv_int(dd, dim_ind, dddirBackward, + comm->buf_int, nsend, + recv_i, ind->nrecv[nzone]); + + /* Make space for cg_cm */ + dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]); + if (fr->cutoff_scheme == ecutsGROUP) + { + cg_cm = fr->cg_cm; + } + else + { + cg_cm = state->x; + } + /* Communicate cg_cm */ + if (cd->bInPlace) + { + recv_vr = cg_cm + pos_cg; + } + else + { + recv_vr = comm->vbuf2.v; + } + dd_sendrecv_rvec(dd, dim_ind, dddirBackward, + comm->vbuf.v, nsend, + recv_vr, ind->nrecv[nzone]); + + /* Make the charge group index */ + if (cd->bInPlace) + { + zone = (p == 0 ? 0 : nzone - 1); + while (zone < nzone) + { + for (cg = 0; cg < ind->nrecv[zone]; cg++) + { + cg_gl = index_gl[pos_cg]; + fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl); + nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]); + cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg; + if (bBondComm) + { + /* Update the charge group presence, + * so we can use it in the next pass of the loop. 
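The OpenMP section above splits the charge-group range [cg0,cg1) over comm->nth threads with pure integer arithmetic. A standalone sketch showing that the resulting ranges are contiguous, non-overlapping, and differ in size by at most one:

#include <stdio.h>

/* Sketch of the thread-range split: thread th of nth gets the
 * half-open range [cg0_th, cg1_th). */
static void thread_range(int cg0, int cg1, int th, int nth,
                         int *cg0_th, int *cg1_th)
{
    *cg0_th = cg0 + ((cg1 - cg0)*th    )/nth;
    *cg1_th = cg0 + ((cg1 - cg0)*(th+1))/nth;
}

int main(void)
{
    int th, b0, b1;

    /* 10 charge groups over 3 threads: sizes 3, 3, 4; no gaps, no overlap */
    for (th = 0; th < 3; th++)
    {
        thread_range(0, 10, th, 3, &b0, &b1);
        printf("thread %d: [%d,%d)\n", th, b0, b1);
    }
    return 0;
}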
+ */ + comm->bLocalCG[cg_gl] = TRUE; + } + pos_cg++; + } + if (p == 0) + { + comm->zone_ncg1[nzone+zone] = ind->nrecv[zone]; + } + zone++; + zone_cg_range[nzone+zone] = pos_cg; + } + } + else + { + /* This part of the code is never executed with bBondComm. */ + merge_cg_buffers(nzone, cd, p, zone_cg_range, + index_gl, recv_i, cg_cm, recv_vr, + cgindex, fr->cginfo_mb, fr->cginfo); + pos_cg += ind->nrecv[nzone]; + } + nat_tot += ind->nrecv[nzone+1]; + } + if (!cd->bInPlace) + { + /* Store the atom block for easy copying of communication buffers */ + make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex); + } + nzone += nzone; + } + dd->index_gl = index_gl; + dd->cgindex = cgindex; + + dd->ncg_tot = zone_cg_range[zones->n]; + dd->nat_tot = nat_tot; + comm->nat[ddnatHOME] = dd->nat_home; + for (i = ddnatZONE; i < ddnatNR; i++) + { + comm->nat[i] = dd->nat_tot; + } + + if (!bBondComm) + { + /* We don't need to update cginfo, since that was alrady done above. + * So we pass NULL for the forcerec. + */ + dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot, + NULL, comm->bLocalCG); + } + + if (debug) + { + fprintf(debug, "Finished setting up DD communication, zones:"); + for (c = 0; c < zones->n; c++) + { + fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]); + } + fprintf(debug, "\n"); + } +} + +static void set_cg_boundaries(gmx_domdec_zones_t *zones) +{ + int c; + + for (c = 0; c < zones->nizone; c++) + { + zones->izone[c].cg1 = zones->cg_range[c+1]; + zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0]; + zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1]; + } +} + +static void set_zones_size(gmx_domdec_t *dd, + matrix box, const gmx_ddbox_t *ddbox, + int zone_start, int zone_end) +{ + gmx_domdec_comm_t *comm; + gmx_domdec_zones_t *zones; + gmx_bool bDistMB; + int z, zi, zj0, zj1, d, dim; + real rcs, rcmbs; + int i, j; + real size_j, add_tric; + real vol; + + comm = dd->comm; + + zones = &comm->zones; + + /* Do we need to determine extra distances for multi-body bondeds? */ + bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1); + + for (z = zone_start; z < zone_end; z++) + { + /* Copy cell limits to zone limits. + * Valid for non-DD dims and non-shifted dims. + */ + copy_rvec(comm->cell_x0, zones->size[z].x0); + copy_rvec(comm->cell_x1, zones->size[z].x1); + } + + for (d = 0; d < dd->ndim; d++) + { + dim = dd->dim[d]; + + for (z = 0; z < zones->n; z++) + { + /* With a staggered grid we have different sizes + * for non-shifted dimensions. 
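The receive loop above extends cgindex[] so that cgindex[i+1] = cgindex[i] + natoms(i), i.e. a prefix sum over group sizes, making the atoms of group i the half-open range [cgindex[i], cgindex[i+1]). A tiny self-contained illustration with made-up group sizes:

#include <stdio.h>

int main(void)
{
    int natoms[4]  = {3, 1, 4, 2}; /* illustrative charge-group sizes */
    int cgindex[5] = {0};
    int i;

    for (i = 0; i < 4; i++)
    {
        cgindex[i+1] = cgindex[i] + natoms[i]; /* running atom offset */
    }
    for (i = 0; i < 4; i++)
    {
        printf("group %d: atoms [%d,%d)\n", i, cgindex[i], cgindex[i+1]);
    }
    /* the closing entry is the total atom count, here cgindex[4] == 10 */
    return 0;
}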
+ */
+            if (dd->bGridJump && zones->shift[z][dim] == 0)
+            {
+                if (d == 1)
+                {
+                    zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
+                    zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
+                }
+                else if (d == 2)
+                {
+                    zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
+                    zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
+                }
+            }
+        }
+
+        rcs   = comm->cutoff;
+        rcmbs = comm->cutoff_mbody;
+        if (ddbox->tric_dir[dim])
+        {
+            rcs   /= ddbox->skew_fac[dim];
+            rcmbs /= ddbox->skew_fac[dim];
+        }
+
+        /* Set the lower limit for the shifted zone dimensions */
+        for (z = zone_start; z < zone_end; z++)
+        {
+            if (zones->shift[z][dim] > 0)
+            {
+                dim = dd->dim[d];
+                if (!dd->bGridJump || d == 0)
+                {
+                    zones->size[z].x0[dim] = comm->cell_x1[dim];
+                    zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
+                }
+                else
+                {
+                    /* Here we take the lower limit of the zone from
+                     * the lowest domain of the zone below.
+                     */
+                    if (z < 4)
+                    {
+                        zones->size[z].x0[dim] =
+                            comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
+                    }
+                    else
+                    {
+                        if (d == 1)
+                        {
+                            zones->size[z].x0[dim] =
+                                zones->size[zone_perm[2][z-4]].x0[dim];
+                        }
+                        else
+                        {
+                            zones->size[z].x0[dim] =
+                                comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
+                        }
+                    }
+                    /* A temporary limit, updated below */
+                    zones->size[z].x1[dim] = zones->size[z].x0[dim];
+
+                    if (bDistMB)
+                    {
+                        for (zi = 0; zi < zones->nizone; zi++)
+                        {
+                            if (zones->shift[zi][dim] == 0)
+                            {
+                                /* This takes the whole zone into account.
+                                 * With multiple pulses this will lead
+                                 * to a larger zone than strictly necessary.
+                                 */
+                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
+                                                             zones->size[zi].x1[dim]+rcmbs);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        /* Loop over the i-zones to set the upper limit of each
+         * j-zone they see.
+         */
+        for (zi = 0; zi < zones->nizone; zi++)
+        {
+            if (zones->shift[zi][dim] == 0)
+            {
+                for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
+                {
+                    if (zones->shift[z][dim] > 0)
+                    {
+                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
+                                                     zones->size[zi].x1[dim]+rcs);
+                    }
+                }
+            }
+        }
+    }
+
+    for (z = zone_start; z < zone_end; z++)
+    {
+        /* Initialization only required to keep the compiler happy */
+        rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
+        int  nc, c;
+
+        /* To determine the bounding box for a zone we need to find
+         * the extremes of 4, 2 or 1 corners.
+         */
+        nc = 1 << (ddbox->npbcdim - 1);
+
+        for (c = 0; c < nc; c++)
+        {
+            /* Set up a zone corner at x=0, ignoring triclinic couplings */
+            corner[XX] = 0;
+            if ((c & 1) == 0)
+            {
+                corner[YY] = zones->size[z].x0[YY];
+            }
+            else
+            {
+                corner[YY] = zones->size[z].x1[YY];
+            }
+            if ((c & 2) == 0)
+            {
+                corner[ZZ] = zones->size[z].x0[ZZ];
+            }
+            else
+            {
+                corner[ZZ] = zones->size[z].x1[ZZ];
+            }
+            if (dd->ndim == 1 && box[ZZ][YY] != 0)
+            {
+                /* With 1D domain decomposition the cg's are not in
+                 * the triclinic box, but triclinic x-y and rectangular y-z.
+                 * Shift y back, so it will later end up at 0.
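The bounding-box loop above enumerates the 1 << (npbcdim - 1) zone corners with bit tests: c & 1 picks the low or high y limit and c & 2 the z limit. A standalone sketch with illustrative limits:

#include <stdio.h>

int main(void)
{
    double y[2] = {0.0, 1.5}; /* zone y limits, x0[YY] and x1[YY] */
    double z[2] = {0.2, 1.0}; /* zone z limits, x0[ZZ] and x1[ZZ] */
    int    npbcdim = 3;
    int    nc      = 1 << (npbcdim - 1); /* 4 corners with full pbc */
    int    c;

    for (c = 0; c < nc; c++)
    {
        /* bit 0 selects the y limit, bit 1 the z limit */
        printf("corner %d: y=%.1f z=%.1f\n",
               c, y[(c & 1) != 0], z[(c & 2) != 0]);
    }
    return 0;
}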
+ */ + corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ]; + } + /* Apply the triclinic couplings */ + for (i = YY; i < ddbox->npbcdim; i++) + { + for (j = XX; j < i; j++) + { + corner[j] += corner[i]*box[i][j]/box[i][i]; + } + } + if (c == 0) + { + copy_rvec(corner, corner_min); + copy_rvec(corner, corner_max); + } + else + { + for (i = 0; i < DIM; i++) + { + corner_min[i] = min(corner_min[i], corner[i]); + corner_max[i] = max(corner_max[i], corner[i]); + } + } + } + /* Copy the extreme cornes without offset along x */ + for (i = 0; i < DIM; i++) + { + zones->size[z].bb_x0[i] = corner_min[i]; + zones->size[z].bb_x1[i] = corner_max[i]; + } + /* Add the offset along x */ + zones->size[z].bb_x0[XX] += zones->size[z].x0[XX]; + zones->size[z].bb_x1[XX] += zones->size[z].x1[XX]; + } + + if (zone_start == 0) + { + vol = 1; + for (dim = 0; dim < DIM; dim++) + { + vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim]; + } + zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol; + } + + if (debug) + { + for (z = zone_start; z < zone_end; z++) + { + fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n", + z, + zones->size[z].x0[XX], zones->size[z].x1[XX], + zones->size[z].x0[YY], zones->size[z].x1[YY], + zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]); + fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n", + z, + zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX], + zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY], + zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]); + } + } +} + +static int comp_cgsort(const void *a, const void *b) +{ + int comp; + + gmx_cgsort_t *cga, *cgb; + cga = (gmx_cgsort_t *)a; + cgb = (gmx_cgsort_t *)b; + + comp = cga->nsc - cgb->nsc; + if (comp == 0) + { + comp = cga->ind_gl - cgb->ind_gl; + } + + return comp; +} + +static void order_int_cg(int n, const gmx_cgsort_t *sort, + int *a, int *buf) +{ + int i; + + /* Order the data */ + for (i = 0; i < n; i++) + { + buf[i] = a[sort[i].ind]; + } + + /* Copy back to the original array */ + for (i = 0; i < n; i++) + { + a[i] = buf[i]; + } +} + +static void order_vec_cg(int n, const gmx_cgsort_t *sort, + rvec *v, rvec *buf) +{ + int i; + + /* Order the data */ + for (i = 0; i < n; i++) + { + copy_rvec(v[sort[i].ind], buf[i]); + } + + /* Copy back to the original array */ + for (i = 0; i < n; i++) + { + copy_rvec(buf[i], v[i]); + } +} + +static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort, + rvec *v, rvec *buf) +{ + int a, atot, cg, cg0, cg1, i; + + if (cgindex == NULL) + { + /* Avoid the useless loop of the atoms within a cg */ + order_vec_cg(ncg, sort, v, buf); + + return; + } + + /* Order the data */ + a = 0; + for (cg = 0; cg < ncg; cg++) + { + cg0 = cgindex[sort[cg].ind]; + cg1 = cgindex[sort[cg].ind+1]; + for (i = cg0; i < cg1; i++) + { + copy_rvec(v[i], buf[a]); + a++; + } + } + atot = a; + + /* Copy back to the original array */ + for (a = 0; a < atot; a++) + { + copy_rvec(buf[a], v[a]); + } +} + +static void ordered_sort(int nsort2, gmx_cgsort_t *sort2, + int nsort_new, gmx_cgsort_t *sort_new, + gmx_cgsort_t *sort1) +{ + int i1, i2, i_new; + + /* The new indices are not very ordered, so we qsort them */ + qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort); + + /* sort2 is already ordered, so now we can merge the two arrays */ + i1 = 0; + i2 = 0; + i_new = 0; + while (i2 < nsort2 || i_new < nsort_new) + { + if (i2 == nsort2) + { + sort1[i1++] = sort_new[i_new++]; + } + else if (i_new == nsort_new) + { + sort1[i1++] = sort2[i2++]; 
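comp_cgsort() above orders on the ns grid cell and breaks ties on the global index, which makes the ordering total and deterministic, a property the exact restarts mentioned further down rely on. A self-contained sketch with a stand-in struct (cg_key_t is not the GROMACS type):

#include <stdio.h>
#include <stdlib.h>

/* Sketch of the two-key comparator: grid cell first, global index as
 * tie-break. */
typedef struct { int nsc, ind_gl; } cg_key_t;

static int comp_key(const void *a, const void *b)
{
    const cg_key_t *ka = (const cg_key_t *)a;
    const cg_key_t *kb = (const cg_key_t *)b;
    int             comp = ka->nsc - kb->nsc;

    return (comp != 0) ? comp : ka->ind_gl - kb->ind_gl;
}

int main(void)
{
    cg_key_t k[3] = {{2, 7}, {1, 9}, {2, 3}};
    int      i;

    qsort(k, 3, sizeof(k[0]), comp_key);
    for (i = 0; i < 3; i++)
    {
        printf("(%d,%d)\n", k[i].nsc, k[i].ind_gl); /* (1,9) (2,3) (2,7) */
    }
    return 0;
}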
+ } + else if (sort2[i2].nsc < sort_new[i_new].nsc || + (sort2[i2].nsc == sort_new[i_new].nsc && + sort2[i2].ind_gl < sort_new[i_new].ind_gl)) + { + sort1[i1++] = sort2[i2++]; + } + else + { + sort1[i1++] = sort_new[i_new++]; + } + } +} + +static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old) +{ + gmx_domdec_sort_t *sort; + gmx_cgsort_t *cgsort, *sort_i; + int ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf; + int sort_last, sort_skip; + + sort = dd->comm->sort; + + a = fr->ns.grid->cell_index; + + moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells; + + if (ncg_home_old >= 0) + { + /* The charge groups that remained in the same ns grid cell + * are completely ordered. So we can sort efficiently by sorting + * the charge groups that did move into the stationary list. + */ + ncg_new = 0; + nsort2 = 0; + nsort_new = 0; + for (i = 0; i < dd->ncg_home; i++) + { + /* Check if this cg did not move to another node */ + if (a[i] < moved) + { + if (i >= ncg_home_old || a[i] != sort->sort[i].nsc) + { + /* This cg is new on this node or moved ns grid cell */ + if (nsort_new >= sort->sort_new_nalloc) + { + sort->sort_new_nalloc = over_alloc_dd(nsort_new+1); + srenew(sort->sort_new, sort->sort_new_nalloc); + } + sort_i = &(sort->sort_new[nsort_new++]); + } + else + { + /* This cg did not move */ + sort_i = &(sort->sort2[nsort2++]); + } + /* Sort on the ns grid cell indices + * and the global topology index. + * index_gl is irrelevant with cell ns, + * but we set it here anyhow to avoid a conditional. + */ + sort_i->nsc = a[i]; + sort_i->ind_gl = dd->index_gl[i]; + sort_i->ind = i; + ncg_new++; + } + } + if (debug) + { + fprintf(debug, "ordered sort cgs: stationary %d moved %d\n", + nsort2, nsort_new); + } + /* Sort efficiently */ + ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new, + sort->sort); + } + else + { + cgsort = sort->sort; + ncg_new = 0; + for (i = 0; i < dd->ncg_home; i++) + { + /* Sort on the ns grid cell indices + * and the global topology index + */ + cgsort[i].nsc = a[i]; + cgsort[i].ind_gl = dd->index_gl[i]; + cgsort[i].ind = i; + if (cgsort[i].nsc < moved) + { + ncg_new++; + } + } + if (debug) + { + fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new); + } + /* Determine the order of the charge groups using qsort */ + qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort); + } + + return ncg_new; +} + +static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr) +{ + gmx_cgsort_t *sort; + int ncg_new, i, *a, na; + + sort = dd->comm->sort->sort; + + nbnxn_get_atomorder(fr->nbv->nbs, &a, &na); + + ncg_new = 0; + for (i = 0; i < na; i++) + { + if (a[i] >= 0) + { + sort[ncg_new].ind = a[i]; + ncg_new++; + } + } + + return ncg_new; +} + +static void dd_sort_state(gmx_domdec_t *dd, int ePBC, + rvec *cgcm, t_forcerec *fr, t_state *state, + int ncg_home_old) +{ + gmx_domdec_sort_t *sort; + gmx_cgsort_t *cgsort, *sort_i; + int *cgindex; + int ncg_new, i, *ibuf, cgsize; + rvec *vbuf; + + sort = dd->comm->sort; + + if (dd->ncg_home > sort->sort_nalloc) + { + sort->sort_nalloc = over_alloc_dd(dd->ncg_home); + srenew(sort->sort, sort->sort_nalloc); + srenew(sort->sort2, sort->sort_nalloc); + } + cgsort = sort->sort; + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + ncg_new = dd_sort_order(dd, fr, ncg_home_old); + break; + case ecutsVERLET: + ncg_new = dd_sort_order_nbnxn(dd, fr); + break; + default: + gmx_incons("unimplemented"); + ncg_new = 0; + } + + /* We alloc with the old size, since cgindex is still old */ + 
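dd_sort_order() above exploits that charge groups which kept their grid cell are already in sorted order: only the (usually few) moved ones need a qsort, after which a single linear merge restores a fully sorted array. A standalone sketch of that merge over plain int keys:

#include <stdio.h>

/* Sketch: merge an already-sorted "stationary" list with a freshly
 * qsorted "moved" list in one linear pass. */
static void merge_sorted(const int *a, int na, const int *b, int nb, int *out)
{
    int ia = 0, ib = 0, io = 0;

    while (ia < na || ib < nb)
    {
        if (ib == nb || (ia < na && a[ia] <= b[ib]))
        {
            out[io++] = a[ia++]; /* take from the stationary list */
        }
        else
        {
            out[io++] = b[ib++]; /* take from the newly sorted list */
        }
    }
}

int main(void)
{
    int stationary[4] = {1, 4, 6, 9};
    int moved[3]      = {2, 5, 8};   /* already qsorted */
    int merged[7];
    int i;

    merge_sorted(stationary, 4, moved, 3, merged);
    for (i = 0; i < 7; i++)
    {
        printf("%d ", merged[i]);    /* 1 2 4 5 6 8 9 */
    }
    printf("\n");
    return 0;
}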
vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
+    vbuf = dd->comm->vbuf.v;
+
+    if (dd->comm->bCGs)
+    {
+        cgindex = dd->cgindex;
+    }
+    else
+    {
+        cgindex = NULL;
+    }
+
+    /* Remove the charge groups which are no longer at home here */
+    dd->ncg_home = ncg_new;
+    if (debug)
+    {
+        fprintf(debug, "Set the new home charge group count to %d\n",
+                dd->ncg_home);
+    }
+
+    /* Reorder the state */
+    for (i = 0; i < estNR; i++)
+    {
+        if (EST_DISTR(i) && (state->flags & (1<<i)))
+        {
+            switch (i)
+            {
+                case estX:
+                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
+                    break;
+                case estV:
+                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
+                    break;
+                case estSDX:
+                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
+                    break;
+                case estCGP:
+                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
+                    break;
+                case estLD_RNG:
+                case estLD_RNGI:
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* No ordering required */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in dd_sort_state");
+                    break;
+            }
+        }
+    }
+    if (fr->cutoff_scheme == ecutsGROUP)
+    {
+        /* Reorder cgcm */
+        order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
+    }
+
+    if (dd->ncg_home+1 > sort->ibuf_nalloc)
+    {
+        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
+        srenew(sort->ibuf, sort->ibuf_nalloc);
+    }
+    ibuf = sort->ibuf;
+    /* Reorder the global cg index */
+    order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
+    /* Reorder the cginfo */
+    order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
+    /* Rebuild the local cg index */
+    if (dd->comm->bCGs)
+    {
+        ibuf[0] = 0;
+        for (i = 0; i < dd->ncg_home; i++)
+        {
+            cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
+            ibuf[i+1] = ibuf[i] + cgsize;
+        }
+        for (i = 0; i < dd->ncg_home+1; i++)
+        {
+            dd->cgindex[i] = ibuf[i];
+        }
+    }
+    else
+    {
+        for (i = 0; i < dd->ncg_home+1; i++)
+        {
+            dd->cgindex[i] = i;
+        }
+    }
+    /* Set the home atom number */
+    dd->nat_home = dd->cgindex[dd->ncg_home];
+
+    if (fr->cutoff_scheme == ecutsVERLET)
+    {
+        /* The atoms are now exactly in grid order, update the grid order */
+        nbnxn_set_atomorder(fr->nbv->nbs);
+    }
+    else
+    {
+        /* Copy the sorted ns cell indices back to the ns grid struct */
+        for (i = 0; i < dd->ncg_home; i++)
+        {
+            fr->ns.grid->cell_index[i] = cgsort[i].nsc;
+        }
+        fr->ns.grid->nr = dd->ncg_home;
+    }
+}
+
+static void add_dd_statistics(gmx_domdec_t *dd)
+{
+    gmx_domdec_comm_t *comm;
+    int                ddnat;
+
+    comm = dd->comm;
+
+    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
+    {
+        comm->sum_nat[ddnat-ddnatZONE] +=
+            comm->nat[ddnat] - comm->nat[ddnat-1];
+    }
+    comm->ndecomp++;
+}
+
+void reset_dd_statistics_counters(gmx_domdec_t *dd)
+{
+    gmx_domdec_comm_t *comm;
+    int                ddnat;
+
+    comm = dd->comm;
+
+    /* Reset all the statistics and counters for total run counting */
+    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
+    {
+        comm->sum_nat[ddnat-ddnatZONE] = 0;
+    }
+    comm->ndecomp   = 0;
+    comm->nload     = 0;
+    comm->load_step = 0;
+    comm->load_sum  = 0;
+    comm->load_max  = 0;
+    clear_ivec(comm->load_lim);
+    comm->load_mdf = 0;
+    comm->load_pme = 0;
+}
+
+void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
+{
+    gmx_domdec_comm_t *comm;
+    int                ddnat;
+    double             av;
+
+    comm = cr->dd->comm;
+
+    gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
+
+    if (fplog == NULL)
+    {
+        return;
+    }
+
+    fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
+
+    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
+    {
+        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
+        switch (ddnat)
+        {
+            case ddnatZONE:
+                fprintf(fplog,
+                        " av. #atoms communicated per step for force:  %d x %.1f\n",
+                        2, av);
+                break;
+            case ddnatVSITE:
+                if (cr->dd->vsite_comm)
+                {
+                    fprintf(fplog,
+                            " av. #atoms communicated per step for vsites: %d x %.1f\n",
+                            (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
+                            av);
+                }
+                break;
+            case ddnatCON:
+                if (cr->dd->constraint_comm)
+                {
+                    fprintf(fplog,
+                            " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
+                            1 + ir->nLincsIter, av);
+                }
+                break;
+            default:
+                gmx_incons(" Unknown type for DD statistics");
+        }
+    }
+    fprintf(fplog, "\n");
+
+    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
+    {
+        print_dd_load_av(fplog, cr->dd);
+    }
+}
+
+void dd_partition_system(FILE            *fplog,
+                         gmx_large_int_t  step,
+                         t_commrec       *cr,
+                         gmx_bool         bMasterState,
+                         int              nstglobalcomm,
+                         t_state         *state_global,
+                         gmx_mtop_t      *top_global,
+                         t_inputrec      *ir,
+                         t_state         *state_local,
+                         rvec           **f,
+                         t_mdatoms       *mdatoms,
+                         gmx_localtop_t  *top_local,
+                         t_forcerec      *fr,
+                         gmx_vsite_t     *vsite,
+                         gmx_shellfc_t    shellfc,
+                         gmx_constr_t     constr,
+                         t_nrnb          *nrnb,
+                         gmx_wallcycle_t  wcycle,
+                         gmx_bool         bVerbose)
+{
+    gmx_domdec_t      *dd;
+    gmx_domdec_comm_t *comm;
+    gmx_ddbox_t        ddbox = {0};
+    t_block           *cgs_gl;
+    gmx_large_int_t    step_pcoupl;
+    rvec               cell_ns_x0, cell_ns_x1;
-    int                i, j, n, cg0 = 0, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
++   int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
+    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
+    gmx_bool           bRedist, bSortCG, bResortAll;
+    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
+    real               grid_density;
+    char               sbuf[22];
+
+    dd   = cr->dd;
+    comm = dd->comm;
+
+    bBoxChanged = (bMasterState || DEFORM(*ir));
+    if (ir->epc != epcNO)
+    {
+        /* With nstpcouple > 1 pressure coupling happens
+         * one step after calculating the pressure.
+         * Box scaling happens at the end of the MD step,
+         * after the DD partitioning.
+         * We therefore have to do DLB in the first partitioning
+         * after an MD step where P-coupling occurred.
+         * We need to determine the last step in which p-coupling occurred.
+         * MRS -- need to validate this for vv?
+         */
+        n = ir->nstpcouple;
+        if (n == 1)
+        {
+            step_pcoupl = step - 1;
+        }
+        else
+        {
+            step_pcoupl = ((step - 1)/n)*n + 1;
+        }
+        if (step_pcoupl >= comm->partition_step)
+        {
+            bBoxChanged = TRUE;
+        }
+    }
+
+    bNStGlobalComm = (step % nstglobalcomm == 0);
+
+    if (!comm->bDynLoadBal)
+    {
+        bDoDLB = FALSE;
+    }
+    else
+    {
+        /* Should we do dynamic load balancing this step?
+         * Since it requires (possibly expensive) global communication,
+         * we might want to do DLB less frequently.
+         */
+        if (bBoxChanged || ir->epc != epcNO)
+        {
+            bDoDLB = bBoxChanged;
+        }
+        else
+        {
+            bDoDLB = bNStGlobalComm;
+        }
+    }
+
+    /* Check if we have recorded loads on the nodes */
+    if (comm->bRecordLoad && dd_load_count(comm))
+    {
+        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
+        {
+            /* Check if we should use DLB at the second partitioning
+             * and every 100 partitionings,
+             * so the extra communication cost is negligible.
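For nstpcouple n > 1, the expression ((step - 1)/n)*n + 1 above rounds down to the most recent step at which pressure coupling was applied (steps 1, n+1, 2n+1, ... in this convention). A small sketch verifying that behaviour (long long standing in for gmx_large_int_t):

#include <stdio.h>

/* Sketch: last step at which P-coupling occurred, given the interval n. */
static long long last_pcoupl_step(long long step, int n)
{
    return (n == 1) ? step - 1 : ((step - 1)/n)*n + 1;
}

int main(void)
{
    printf("%lld\n", last_pcoupl_step(12, 5)); /* 11: coupling at 1, 6, 11 */
    printf("%lld\n", last_pcoupl_step(11, 5)); /* 11 */
    printf("%lld\n", last_pcoupl_step(10, 5)); /* 6 */
    return 0;
}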
+ */ + n = max(100, nstglobalcomm); + bCheckDLB = (comm->n_load_collect == 0 || + comm->n_load_have % n == n-1); + } + else + { + bCheckDLB = FALSE; + } + + /* Print load every nstlog, first and last step to the log file */ + bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) || + comm->n_load_collect == 0 || + (ir->nsteps >= 0 && + (step + ir->nstlist > ir->init_step + ir->nsteps))); + + /* Avoid extra communication due to verbose screen output + * when nstglobalcomm is set. + */ + if (bDoDLB || bLogLoad || bCheckDLB || + (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist))) + { + get_load_distribution(dd, wcycle); + if (DDMASTER(dd)) + { + if (bLogLoad) + { + dd_print_load(fplog, dd, step-1); + } + if (bVerbose) + { + dd_print_load_verbose(dd); + } + } + comm->n_load_collect++; + + if (bCheckDLB) + { + /* Since the timings are node dependent, the master decides */ + if (DDMASTER(dd)) + { + bTurnOnDLB = + (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS); + if (debug) + { + fprintf(debug, "step %s, imb loss %f\n", + gmx_step_str(step, sbuf), + dd_force_imb_perf_loss(dd)); + } + } + dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB); + if (bTurnOnDLB) + { + turn_on_dlb(fplog, cr, step); + bDoDLB = TRUE; + } + } + } + comm->n_load_have++; + } + + cgs_gl = &comm->cgs_gl; + + bRedist = FALSE; + if (bMasterState) + { + /* Clear the old state */ + clear_dd_indices(dd, 0, 0); ++ ncgindex_set = 0; + + set_ddbox(dd, bMasterState, cr, ir, state_global->box, + TRUE, cgs_gl, state_global->x, &ddbox); + + get_cg_distribution(fplog, step, dd, cgs_gl, + state_global->box, &ddbox, state_global->x); + + dd_distribute_state(dd, cgs_gl, + state_global, state_local, f); + + dd_make_local_cgs(dd, &top_local->cgs); + + /* Ensure that we have space for the new distribution */ + dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home); + + if (fr->cutoff_scheme == ecutsGROUP) + { + calc_cgcm(fplog, 0, dd->ncg_home, + &top_local->cgs, state_local->x, fr->cg_cm); + } + + inc_nrnb(nrnb, eNR_CGCM, dd->nat_home); + + dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG); - - cg0 = 0; + } + else if (state_local->ddp_count != dd->ddp_count) + { + if (state_local->ddp_count > dd->ddp_count) + { + gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count); + } + + if (state_local->ddp_count_cg_gl != state_local->ddp_count) + { + gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count); + } + + /* Clear the old state */ + clear_dd_indices(dd, 0, 0); + + /* Build the new indices */ + rebuild_cgindex(dd, cgs_gl->index, state_local); + make_dd_indices(dd, cgs_gl->index, 0); ++ ncgindex_set = dd->ncg_home; + + if (fr->cutoff_scheme == ecutsGROUP) + { + /* Redetermine the cg COMs */ + calc_cgcm(fplog, 0, dd->ncg_home, + &top_local->cgs, state_local->x, fr->cg_cm); + } + + inc_nrnb(nrnb, eNR_CGCM, dd->nat_home); + + dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG); + + set_ddbox(dd, bMasterState, cr, ir, state_local->box, + TRUE, &top_local->cgs, state_local->x, &ddbox); + + bRedist = comm->bDynLoadBal; + } + else + { + /* We have the full state, only redistribute the cgs */ + + /* Clear the non-home indices */ + clear_dd_indices(dd, dd->ncg_home, dd->nat_home); ++ ncgindex_set = 0; + + /* Avoid global communication for dim's without pbc and -gcom */ + if (!bNStGlobalComm) + { + copy_rvec(comm->box0, ddbox.box0 ); + 
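The test n_load_have % n == n-1 above fires on every n-th load collection, while n_load_collect == 0 additionally forces a check at the very first one. A standalone illustration with made-up counts:

#include <stdio.h>

int main(void)
{
    int n = 100;             /* check interval, as in max(100, nstglobalcomm) */
    int n_load_collect = 3;  /* nonzero: not the first collection */
    int n_load_have;

    for (n_load_have = 0; n_load_have < 300; n_load_have++)
    {
        if (n_load_collect == 0 || n_load_have % n == n - 1)
        {
            printf("check at n_load_have = %d\n", n_load_have); /* 99, 199, 299 */
        }
    }
    return 0;
}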
copy_rvec(comm->box_size, ddbox.box_size); + } + set_ddbox(dd, bMasterState, cr, ir, state_local->box, + bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox); + + bBoxChanged = TRUE; + bRedist = TRUE; + } + /* For dim's without pbc and -gcom */ + copy_rvec(ddbox.box0, comm->box0 ); + copy_rvec(ddbox.box_size, comm->box_size); + + set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB, + step, wcycle); + + if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0) + { + write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox); + } + + /* Check if we should sort the charge groups */ + if (comm->nstSortCG > 0) + { + bSortCG = (bMasterState || + (bRedist && (step % comm->nstSortCG == 0))); + } + else + { + bSortCG = FALSE; + } + + ncg_home_old = dd->ncg_home; + + ncg_moved = 0; + if (bRedist) + { + wallcycle_sub_start(wcycle, ewcsDD_REDIST); + + dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, + state_local, f, fr, mdatoms, - !bSortCG, nrnb, &cg0, &ncg_moved); ++ !bSortCG, nrnb, &ncgindex_set, &ncg_moved); + + wallcycle_sub_stop(wcycle, ewcsDD_REDIST); + } + + get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box, + dd, &ddbox, + &comm->cell_x0, &comm->cell_x1, + dd->ncg_home, fr->cg_cm, + cell_ns_x0, cell_ns_x1, &grid_density); + + if (bBoxChanged) + { + comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step); + } + + switch (fr->cutoff_scheme) + { + case ecutsGROUP: + copy_ivec(fr->ns.grid->n, ncells_old); + grid_first(fplog, fr->ns.grid, dd, &ddbox, fr->ePBC, + state_local->box, cell_ns_x0, cell_ns_x1, + fr->rlistlong, grid_density); + break; + case ecutsVERLET: + nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]); + break; + default: + gmx_incons("unimplemented"); + } + /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */ + copy_ivec(ddbox.tric_dir, comm->tric_dir); + + if (bSortCG) + { + wallcycle_sub_start(wcycle, ewcsDD_GRID); + + /* Sort the state on charge group position. + * This enables exact restarts from this step. + * It also improves performance by about 15% with larger numbers + * of atoms per node. + */ + + /* Fill the ns grid with the home cell, + * so we can sort with the indices. + */ + set_zones_ncg_home(dd); + + switch (fr->cutoff_scheme) + { + case ecutsVERLET: + set_zones_size(dd, state_local->box, &ddbox, 0, 1); + + nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box, + 0, + comm->zones.size[0].bb_x0, + comm->zones.size[0].bb_x1, + 0, dd->ncg_home, + comm->zones.dens_zone0, + fr->cginfo, + state_local->x, + ncg_moved, bRedist ? comm->moved : NULL, + fr->nbv->grp[eintLocal].kernel_type, + fr->nbv->grp[eintLocal].nbat); + + nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]); + break; + case ecutsGROUP: + fill_grid(fplog, &comm->zones, fr->ns.grid, dd->ncg_home, + 0, dd->ncg_home, fr->cg_cm); + + copy_ivec(fr->ns.grid->n, ncells_new); + break; + default: + gmx_incons("unimplemented"); + } + + bResortAll = bMasterState; + + /* Check if we can user the old order and ns grid cell indices + * of the charge groups to sort the charge groups efficiently. + */ + if (ncells_new[XX] != ncells_old[XX] || + ncells_new[YY] != ncells_old[YY] || + ncells_new[ZZ] != ncells_old[ZZ]) + { + bResortAll = TRUE; + } + + if (debug) + { + fprintf(debug, "Step %s, sorting the %d home charge groups\n", + gmx_step_str(step, sbuf), dd->ncg_home); + } + dd_sort_state(dd, ir->ePBC, fr->cg_cm, fr, state_local, + bResortAll ? 
-1 : ncg_home_old);
+        /* Rebuild all the indices */
-        cg0 = 0;
+        ga2la_clear(dd->ga2la);
++       ncgindex_set = 0;
+
+        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
+    }
+
+    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
+
+    /* Set up the communication and communicate the coordinates */
+    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
+
+    /* Set the indices */
-    make_dd_indices(dd, cgs_gl->index, cg0);
++   make_dd_indices(dd, cgs_gl->index, ncgindex_set);
+
+    /* Set the charge group boundaries for neighbor searching */
+    set_cg_boundaries(&comm->zones);
+
+    if (fr->cutoff_scheme == ecutsVERLET)
+    {
+        set_zones_size(dd, state_local->box, &ddbox,
+                       bSortCG ? 1 : 0, comm->zones.n);
+    }
+
+    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
+
+    /*
+       write_dd_pdb("dd_home",step,"dump",top_global,cr,
+                    -1,state_local->x,state_local->box);
+     */
+
+    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
+
+    /* Extract a local topology from the global topology */
+    for (i = 0; i < dd->ndim; i++)
+    {
+        np[dd->dim[i]] = comm->cd[i].np;
+    }
+    dd_make_local_top(fplog, dd, &comm->zones, dd->npbcdim, state_local->box,
+                      comm->cellsize_min, np,
+                      fr,
+                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
+                      vsite, top_global, top_local);
+
+    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
+
+    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
+
+    /* Set up the special atom communication */
+    n = comm->nat[ddnatZONE];
+    for (i = ddnatZONE+1; i < ddnatNR; i++)
+    {
+        switch (i)
+        {
+            case ddnatVSITE:
+                if (vsite && vsite->n_intercg_vsite)
+                {
+                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
+                }
+                break;
+            case ddnatCON:
+                if (dd->bInterCGcons || dd->bInterCGsettles)
+                {
+                    /* Only for inter-cg constraints do we need special code */
+                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
+                                                  constr, ir->nProjOrder,
+                                                  top_local->idef.il);
+                }
+                break;
+            default:
+                gmx_incons("Unknown special atom type setup");
+        }
+        comm->nat[i] = n;
+    }
+
+    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
+
+    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
+
+    /* Make space for the extra coordinates for virtual site
+     * or constraint communication.
+     */
+    state_local->natoms = comm->nat[ddnatNR-1];
+    if (state_local->natoms > state_local->nalloc)
+    {
+        dd_realloc_state(state_local, f, state_local->natoms);
+    }
+
+    if (fr->bF_NoVirSum)
+    {
+        if (vsite && vsite->n_intercg_vsite)
+        {
+            nat_f_novirsum = comm->nat[ddnatVSITE];
+        }
+        else
+        {
+            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
+            {
+                nat_f_novirsum = dd->nat_tot;
+            }
+            else
+            {
+                nat_f_novirsum = dd->nat_home;
+            }
+        }
+    }
+    else
+    {
+        nat_f_novirsum = 0;
+    }
+
+    /* Set the number of atoms required for the force calculation.
+     * Forces need to be constrained when using a twin-range setup
+     * or with energy minimization. For simple simulations we could
+     * avoid some allocation, zeroing and copying, but this is
+     * probably not worth the complications and checking.
+     */
+    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
+                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
+
+    /* We make all mdatoms up to nat_tot_con.
+     * We could save some work by only setting invmass
+     * between nat_tot and nat_tot_con.
+ */ + /* This call also sets the new number of home particles to dd->nat_home */ + atoms2md(top_global, ir, + comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms); + + /* Now we have the charges we can sort the FE interactions */ + dd_sort_local_top(dd, mdatoms, top_local); + + if (vsite != NULL) + { + /* Now we have updated mdatoms, we can do the last vsite bookkeeping */ + split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite); + } + + if (shellfc) + { + /* Make the local shell stuff, currently no communication is done */ + make_local_shells(cr, mdatoms, shellfc); + } + + if (ir->implicit_solvent) + { + make_local_gb(cr, fr->born, ir->gb_algorithm); + } + + init_bonded_thread_force_reduction(fr, &top_local->idef); + + if (!(cr->duty & DUTY_PME)) + { + /* Send the charges to our PME only node */ + gmx_pme_send_q(cr, mdatoms->nChargePerturbed, + mdatoms->chargeA, mdatoms->chargeB, + dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd)); + } + + if (constr) + { + set_constraints(constr, top_local, ir, mdatoms, cr); + } + + if (ir->ePull != epullNO) + { + /* Update the local pull groups */ + dd_make_local_pull_groups(dd, ir->pull, mdatoms); + } + + if (ir->bRot) + { + /* Update the local rotation groups */ + dd_make_local_rotation_groups(dd, ir->rot); + } + + + add_dd_statistics(dd); + + /* Make sure we only count the cycles for this DD partitioning */ + clear_dd_cycle_counts(dd); + + /* Because the order of the atoms might have changed since + * the last vsite construction, we need to communicate the constructing + * atom coordinates again (for spreading the forces this MD step). + */ + dd_move_x_vsites(dd, state_local->box, state_local->x); + + wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER); + + if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0) + { + dd_move_x(dd, state_local->box, state_local->x); + write_dd_pdb("dd_dump", step, "dump", top_global, cr, + -1, state_local->x, state_local->box); + } + + /* Store the partitioning step */ + comm->partition_step = step; + + /* Increase the DD partitioning counter */ + dd->ddp_count++; + /* The state currently matches this DD partitioning count, store it */ + state_local->ddp_count = dd->ddp_count; + if (bMasterState) + { + /* The DD master node knows the complete cg distribution, + * store the count so we can possibly skip the cg info communication. + */ + comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count); + } + + if (comm->DD_debug > 0) + { + /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */ + check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global), + "after partitioning"); + } +} diff --cc src/programs/mdrun/md.c index 99683e8df2,0000000000..c75b17e76c mode 100644,000000..100644 --- a/src/programs/mdrun/md.c +++ b/src/programs/mdrun/md.c @@@ -1,2236 -1,0 +1,2236 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. 
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "sysstuff.h"
+#include "vec.h"
+#include "statutil.h"
+#include "vcm.h"
+#include "mdebin.h"
+#include "nrnb.h"
+#include "calcmu.h"
+#include "index.h"
+#include "vsite.h"
+#include "update.h"
+#include "ns.h"
+#include "trnio.h"
+#include "xtcio.h"
+#include "mdrun.h"
+#include "md_support.h"
+#include "md_logging.h"
+#include "confio.h"
+#include "network.h"
+#include "pull.h"
+#include "xvgr.h"
+#include "physics.h"
+#include "names.h"
+#include "xmdrun.h"
+#include "ionize.h"
+#include "disre.h"
+#include "orires.h"
+#include "pme.h"
+#include "mdatoms.h"
+#include "repl_ex.h"
+#include "qmmm.h"
+#include "domdec.h"
+#include "domdec_network.h"
+#include "partdec.h"
+#include "topsort.h"
+#include "coulomb.h"
+#include "constr.h"
+#include "shellfc.h"
+#include "compute_io.h"
+#include "mvdata.h"
+#include "checkpoint.h"
+#include "mtop_util.h"
+#include "sighandler.h"
+#include "txtdump.h"
+#include "string2.h"
+#include "pme_loadbal.h"
+#include "bondf.h"
+#include "membed.h"
+#include "types/nlistheuristics.h"
+#include "types/iteratedconstraints.h"
+#include "nbnxn_cuda_data_mgmt.h"
+
+#include "gromacs/utility/gmxmpi.h"
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+static void reset_all_counters(FILE *fplog, t_commrec *cr,
+ gmx_large_int_t step,
+ gmx_large_int_t *step_rel, t_inputrec *ir,
+ gmx_wallcycle_t wcycle, t_nrnb *nrnb,
+ gmx_runtime_t *runtime,
+ nbnxn_cuda_ptr_t cu_nbv)
+{
+ char sbuf[STEPSTRSIZE];
+
+ /* Reset all the counters related to performance over the run */
+ md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
+ gmx_step_str(step, sbuf));
+
+ if (cu_nbv)
+ {
+ nbnxn_cuda_reset_timings(cu_nbv);
+ }
+
+ wallcycle_stop(wcycle, ewcRUN);
+ wallcycle_reset_all(wcycle);
+ if (DOMAINDECOMP(cr))
+ {
+ reset_dd_statistics_counters(cr->dd);
+ }
+ init_nrnb(nrnb);
+ ir->init_step += *step_rel;
+ ir->nsteps -= *step_rel;
+ *step_rel = 0;
+ wallcycle_start(wcycle, ewcRUN);
+ runtime_start(runtime);
+ print_date_and_time(fplog, cr->nodeid, "Restarted time", runtime);
+}
+
+double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
+ const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact,
+ int nstglobalcomm,
+ gmx_vsite_t *vsite, gmx_constr_t constr,
+ int stepout, t_inputrec *ir,
+ gmx_mtop_t *top_global,
+ t_fcdata *fcd,
+ t_state *state_global,
+ t_mdatoms *mdatoms,
+ t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ gmx_edsam_t ed, t_forcerec *fr,
+ int
repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed, + real cpt_period, real max_hours, + const char *deviceOptions, + unsigned long Flags, + gmx_runtime_t *runtime) +{ + gmx_mdoutf_t *outf; + gmx_large_int_t step, step_rel; + double run_time; + double t, t0, lam0[efptNR]; + gmx_bool bGStatEveryStep, bGStat, bCalcVir, bCalcEner; + gmx_bool bNS, bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE, + bFirstStep, bStateFromCP, bStateFromTPX, bInitStep, bLastStep, + bBornRadii, bStartingFromCpt; + gmx_bool bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE; + gmx_bool do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE, + bForceUpdate = FALSE, bCPT; + int mdof_flags; + gmx_bool bMasterState; + int force_flags, cglo_flags; + tensor force_vir, shake_vir, total_vir, tmp_vir, pres; + int i, m; + t_trxstatus *status; + rvec mu_tot; + t_vcm *vcm; + t_state *bufstate = NULL; + matrix *scale_tot, pcoupl_mu, M, ebox; + gmx_nlheur_t nlh; + t_trxframe rerun_fr; + gmx_repl_ex_t repl_ex = NULL; + int nchkpt = 1; + gmx_localtop_t *top; + t_mdebin *mdebin = NULL; + df_history_t df_history; + t_state *state = NULL; + rvec *f_global = NULL; + int n_xtc = -1; + rvec *x_xtc = NULL; + gmx_enerdata_t *enerd; + rvec *f = NULL; + gmx_global_stat_t gstat; + gmx_update_t upd = NULL; + t_graph *graph = NULL; + globsig_t gs; + gmx_rng_t mcrng = NULL; + gmx_bool bFFscan; + gmx_groups_t *groups; + gmx_ekindata_t *ekind, *ekind_save; + gmx_shellfc_t shellfc; + int count, nconverged = 0; + real timestep = 0; + double tcount = 0; + gmx_bool bIonize = FALSE; + gmx_bool bTCR = FALSE, bConverged = TRUE, bOK, bSumEkinhOld, bExchanged; + gmx_bool bAppend; + gmx_bool bResetCountersHalfMaxH = FALSE; + gmx_bool bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter; + gmx_bool bUpdateDoLR; + real mu_aver = 0, dvdl_constr; + int a0, a1, gnx = 0, ii; + atom_id *grpindex = NULL; + char *grpname; + t_coupl_rec *tcr = NULL; + rvec *xcopy = NULL, *vcopy = NULL, *cbuf = NULL; + matrix boxcopy = {{0}}, lastbox; + tensor tmpvir; + real fom, oldfom, veta_save, pcurr, scalevir, tracevir; + real vetanew = 0; + int lamnew = 0; + /* for FEP */ + int nstfep; + real rate; + double cycles; + real saved_conserved_quantity = 0; + real last_ekin = 0; + int iter_i; + t_extmass MassQ; + int **trotter_seq; + char sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE]; + int handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/ + gmx_iterate_t iterate; + gmx_large_int_t multisim_nsteps = -1; /* number of steps to do before first multisim + simulation stops. If equal to zero, don't + communicate any more between multisims.*/ + /* PME load balancing data for GPU kernels */ + pme_load_balancing_t pme_loadbal = NULL; + double cycles_pmes; + gmx_bool bPMETuneTry = FALSE, bPMETuneRunning = FALSE; + +#ifdef GMX_FAHCORE + /* Temporary addition for FAHCORE checkpointing */ + int chkpt_ret; +#endif + + /* Check for special mdrun options */ + bRerunMD = (Flags & MD_RERUN); + bIonize = (Flags & MD_IONIZE); + bFFscan = (Flags & MD_FFSCAN); + bAppend = (Flags & MD_APPENDFILES); + if (Flags & MD_RESETCOUNTERSHALFWAY) + { + if (ir->nsteps > 0) + { + /* Signal to reset the counters half the simulation steps. */ + wcycle_set_reset_counters(wcycle, ir->nsteps/2); + } + /* Signal to reset the counters halfway the simulation time. 
*/
+ bResetCountersHalfMaxH = (max_hours > 0);
+ }
+
+ /* md-vv uses averaged full step velocities for T-control
+ md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
+ md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
+ bVV = EI_VV(ir->eI);
+ if (bVV) /* to store the initial velocities while computing virial */
+ {
+ snew(cbuf, top_global->natoms);
+ }
+ /* all the iterative cases - only if there are constraints */
+ bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
+ gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to
+ false in this step. The correct value, true or false,
+ is set at each step, as it depends on the frequency of temperature
+ and pressure control.*/
+ bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
+
+ if (bRerunMD)
+ {
+ /* Since we don't know if the frames read are related in any way,
+ * rebuild the neighborlist at every step.
+ */
+ ir->nstlist = 1;
+ ir->nstcalcenergy = 1;
+ nstglobalcomm = 1;
+ }
+
+ check_ir_old_tpx_versions(cr, fplog, ir, top_global);
+
+ nstglobalcomm = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
+ bGStatEveryStep = (nstglobalcomm == 1);
+
+ if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
+ {
+ fprintf(fplog,
+ "To reduce the energy communication with nstlist = -1\n"
+ "the neighbor list validity should not be checked at every step,\n"
+ "this means that exact integration is not guaranteed.\n"
+ "The neighbor list validity is checked after:\n"
+ " - 2*std.dev.(n.list life time) steps.\n"
+ "In most cases this will result in exact integration.\n"
+ "This reduces the energy communication by a factor of 2 to 3.\n"
+ "If you want less energy communication, set nstlist > 3.\n\n");
+ }
+
+ if (bRerunMD || bFFscan)
+ {
+ ir->nstxtcout = 0;
+ }
+ groups = &top_global->groups;
+
+ /* Initial values */
+ init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda,
+ &(state_global->fep_state), lam0,
+ nrnb, top_global, &upd,
+ nfile, fnm, &outf, &mdebin,
+ force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, state_global, Flags);
+
+ clear_mat(total_vir);
+ clear_mat(pres);
+ /* Energy terms and groups */
+ snew(enerd, 1);
+ init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+ enerd);
+ if (DOMAINDECOMP(cr))
+ {
+ f = NULL;
+ }
+ else
+ {
+ snew(f, top_global->natoms);
+ }
+
+ /* lambda Monte Carlo random number generator */
+ if (ir->bExpanded)
+ {
+ mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
+ }
+ /* copy the state into df_history */
+ copy_df_history(&df_history, &state_global->dfhist);
+
+ /* Kinetic energy data */
+ snew(ekind, 1);
+ init_ekindata(fplog, top_global, &(ir->opts), ekind);
+ /* needed for iteration of constraints */
+ snew(ekind_save, 1);
+ init_ekindata(fplog, top_global, &(ir->opts), ekind_save);
+ /* Copy the cos acceleration to the groups struct */
+ ekind->cosacc.cos_accel = ir->cos_accel;
+
+ gstat = global_stat_init(ir);
+ debug_gmx();
+
+ /* Check for polarizable models and flexible constraints */
+ shellfc = init_shell_flexcon(fplog,
+ top_global, n_flexible_constraints(constr),
+ (ir->bContinuation ||
+ (DOMAINDECOMP(cr) && !MASTER(cr))) ? NULL : state_global->x);
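/* [Illustrative aside, not part of this patch] The comment earlier in this
 * function distinguishes T-control from full-step velocities (md-vv) from the
 * leap-frog average of half-step kinetic energies (md). The two estimators,
 * sketched for a 1-D particle set (all names assumed): */
static double ekin_full_step(const double *v, const double *m, int n)
{
    double e = 0;
    int    i;
    for (i = 0; i < n; i++)
    {
        e += 0.5*m[i]*v[i]*v[i];   /* from v(t): the AveVel estimate */
    }
    return e;
}

static double ekin_half_step_avg(double ekinh_old, double ekinh_new)
{
    /* leap-frog: average Ekin(t-dt/2) and Ekin(t+dt/2) to estimate Ekin(t) */
    return 0.5*(ekinh_old + ekinh_new);
}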
+ if (DEFORM(*ir))
+ {
+#ifdef GMX_THREAD_MPI
+ tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+#endif
+ set_deform_reference_box(upd,
+ deform_init_init_step_tpx,
+ deform_init_box_tpx);
+#ifdef GMX_THREAD_MPI
+ tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+#endif
+ }
+
+ {
+ double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
+ if ((io > 2000) && MASTER(cr))
+ {
+ fprintf(stderr,
+ "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
+ io);
+ }
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ top = dd_init_local_top(top_global);
+
+ snew(state, 1);
+ dd_init_local_state(cr->dd, state_global, state);
+
+ if (DDMASTER(cr->dd) && ir->nstfout)
+ {
+ snew(f_global, state_global->natoms);
+ }
+ }
+ else
+ {
+ if (PAR(cr))
+ {
+ /* Initialize the particle decomposition and split the topology */
+ top = split_system(fplog, top_global, ir, cr);
+
+ pd_cg_range(cr, &fr->cg0, &fr->hcg);
+ pd_at_range(cr, &a0, &a1);
+ }
+ else
+ {
+ top = gmx_mtop_generate_local_top(top_global, ir);
+
+ a0 = 0;
+ a1 = top_global->natoms;
+ }
+
+ forcerec_set_excl_load(fr, top, cr);
+
+ state = partdec_init_local_state(cr, state_global);
+ f_global = f;
+
+ atoms2md(top_global, ir, 0, NULL, a0, a1-a0, mdatoms);
+
+ if (vsite)
+ {
+ set_vsite_top(vsite, top, mdatoms, cr);
+ }
+
+ if (ir->ePBC != epbcNONE && !fr->bMolPBC)
+ {
+ graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE);
+ }
+
+ if (shellfc)
+ {
+ make_local_shells(cr, mdatoms, shellfc);
+ }
+
+ init_bonded_thread_force_reduction(fr, &top->idef);
+
+ if (ir->pull && PAR(cr))
+ {
+ dd_make_local_pull_groups(NULL, ir->pull, mdatoms);
+ }
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ /* Distribute the charge groups over the nodes from the master node */
+ dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
+ state_global, top_global, ir,
+ state, &f, mdatoms, top, fr,
+ vsite, shellfc, constr,
+ nrnb, wcycle, FALSE);
+
+ }
+
+ update_mdatoms(mdatoms, state->lambda[efptMASS]);
+
+ if (opt2bSet("-cpi", nfile, fnm))
+ {
+ bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr);
+ }
+ else
+ {
+ bStateFromCP = FALSE;
+ }
+
+ if (MASTER(cr))
+ {
+ if (bStateFromCP)
+ {
+ /* Update mdebin with energy history if appending to output files */
+ if (Flags & MD_APPENDFILES)
+ {
+ restore_energyhistory_from_state(mdebin, &state_global->enerhist);
+ }
+ else
+ {
+ /* We might have read an energy history from checkpoint,
+ * free the allocated memory and reset the counts.
+ */
+ done_energyhistory(&state_global->enerhist);
+ init_energyhistory(&state_global->enerhist);
+ }
+ }
+ /* Set the initial energy history in state by updating once */
+ update_energyhistory(&state_global->enerhist, mdebin);
+ }
+
+ if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG))
+ {
+ /* Set the random state if we read a checkpoint file */
+ set_stochd_state(upd, state);
+ }
+ if (state->flags & (1<<estMC_RNG))
+ {
+ set_mc_state(mcrng, state);
+ }
+
+ /* Initialize constraints */
+ if (constr)
+ {
+ if (!DOMAINDECOMP(cr))
+ {
+ set_constraints(constr, top, ir, mdatoms, cr);
+ }
+ }
+
+ /* Check whether we have to GCT stuff */
+ bTCR = ftp2bSet(efGCT, nfile, fnm);
+ if (bTCR)
+ {
+ if (MASTER(cr))
+ {
+ fprintf(stderr, "Will do General Coupling Theory!\n");
+ }
+ gnx = top_global->mols.nr;
+ snew(grpindex, gnx);
+ for (i = 0; (i < gnx); i++)
+ {
+ grpindex[i] = i;
+ }
+ }
+
+ if (repl_ex_nst > 0)
+ {
+ /* We need to be sure replica exchange can only occur
+ * when the energies are current */
+ check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy,
+ "repl_ex_nst", &repl_ex_nst);
+ /* This check needs to happen before inter-simulation
+ * signals are initialized, too */
+ }
+ if (repl_ex_nst > 0 && MASTER(cr))
+ {
+ repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
+ repl_ex_nst, repl_ex_nex, repl_ex_seed);
+ }
+
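/* [Illustrative aside, not part of this patch] check_nst_param() above makes
 * repl_ex_nst a multiple of nstcalcenergy so exchanges only happen on steps
 * with current energies. A sketch of such an interval adjustment (hypothetical
 * helper, not the GROMACS implementation): */
#include <stdio.h>

static int round_up_to_multiple(int nst, int base)
{
    if (base <= 0 || nst <= 0)
    {
        return nst;
    }
    if (nst % base != 0)
    {
        int fixed = ((nst + base - 1)/base)*base;  /* smallest multiple >= nst */
        fprintf(stderr, "adjusting interval %d -> %d (multiple of %d)\n",
                nst, fixed, base);
        return fixed;
    }
    return nst;
}

+ /* PME tuning is only supported with GPUs or PME nodes and not with rerun.
+ * With perturbed charges with soft-core we should not change the cut-off.
+ */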
+ if ((Flags & MD_TUNEPME) &&
+ EEL_PME(fr->eeltype) &&
+ ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
+ !(ir->efep != efepNO && mdatoms->nChargePerturbed > 0 && ir->fepvals->bScCoul) &&
+ !bRerunMD)
+ {
+ pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
+ cycles_pmes = 0;
+ if (cr->duty & DUTY_PME)
+ {
+ /* Start tuning right away, as we can't measure the load */
+ bPMETuneRunning = TRUE;
+ }
+ else
+ {
+ /* Separate PME nodes, we can measure the PP/PME load balance */
+ bPMETuneTry = TRUE;
+ }
+ }
+
+ if (!ir->bContinuation && !bRerunMD)
+ {
+ if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
+ {
+ /* Set the velocities of frozen particles to zero */
+ for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
+ {
+ for (m = 0; m < DIM; m++)
+ {
+ if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
+ {
+ state->v[i][m] = 0;
+ }
+ }
+ }
+ }
+
+ if (constr)
+ {
+ /* Constrain the initial coordinates and velocities */
+ do_constrain_first(fplog, constr, ir, mdatoms, state, f,
+ graph, cr, nrnb, fr, top, shake_vir);
+ }
+ if (vsite)
+ {
+ /* Construct the virtual sites for the initial configuration */
+ construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, NULL,
+ top->idef.iparams, top->idef.il,
+ fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+ }
+ }
+
+ debug_gmx();
+
+ /* set free energy calculation frequency as the minimum of nstdhdl, nstexpanded, and nstrepl_ex_nst*/
+ nstfep = ir->fepvals->nstdhdl;
+ if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded))
+ {
+ nstfep = ir->expandedvals->nstexpanded;
+ }
+ if (repl_ex_nst > 0 && nstfep > repl_ex_nst)
+ {
+ nstfep = repl_ex_nst;
+ }
+
+ /* I'm assuming we need global communication the first time! MRS */
+ cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
+ | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0)
+ | (bVV ? CGLO_PRESSURE : 0)
+ | (bVV ? CGLO_CONSTRAINT : 0)
+ | (bRerunMD ? CGLO_RERUNMD : 0)
+ | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0));
+
+ bSumEkinhOld = FALSE;
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld, cglo_flags);
+ if (ir->eI == eiVVAK)
+ {
+ /* a second call to get the half step temperature initialized as well */
+ /* we do the same call as above, but turn the pressure off -- internally to
+ compute_globals, this is recognized as a velocity verlet half-step
+ kinetic energy calculation. This minimizes excess variables, but
+ perhaps loses some logic?*/
+
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE));
+ }
+
+ /* Calculate the initial half step temperature, and save the ekinh_old */
+ if (!(Flags & MD_STARTFROMCPT))
+ {
+ for (i = 0; (i < ir->opts.ngtc); i++)
+ {
+ copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
+ }
+ }
+ if (ir->eI != eiVV)
+ {
+ enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
+ and there is no previous step */
+ }
+
+ /* if using an iterative algorithm, we need to create a working directory for the state.
*/ + if (bIterativeCase) + { + bufstate = init_bufstate(state); + } + if (bFFscan) + { + snew(xcopy, state->natoms); + snew(vcopy, state->natoms); + copy_rvecn(state->x, xcopy, 0, state->natoms); + copy_rvecn(state->v, vcopy, 0, state->natoms); + copy_mat(state->box, boxcopy); + } + + /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter + temperature control */ + trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter); + + if (MASTER(cr)) + { + if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS) + { + fprintf(fplog, + "RMS relative constraint deviation after constraining: %.2e\n", + constr_rmsd(constr, FALSE)); + } + if (EI_STATE_VELOCITY(ir->eI)) + { + fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]); + } + if (bRerunMD) + { + fprintf(stderr, "starting md rerun '%s', reading coordinates from" + " input trajectory '%s'\n\n", + *(top_global->name), opt2fn("-rerun", nfile, fnm)); + if (bVerbose) + { + fprintf(stderr, "Calculated time to finish depends on nsteps from " + "run input file,\nwhich may not correspond to the time " + "needed to process input trajectory.\n\n"); + } + } + else + { + char tbuf[20]; + fprintf(stderr, "starting mdrun '%s'\n", + *(top_global->name)); + if (ir->nsteps >= 0) + { + sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t); + } + else + { + sprintf(tbuf, "%s", "infinite"); + } + if (ir->init_step > 0) + { + fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n", + gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf, + gmx_step_str(ir->init_step, sbuf2), + ir->init_step*ir->delta_t); + } + else + { + fprintf(stderr, "%s steps, %s ps.\n", + gmx_step_str(ir->nsteps, sbuf), tbuf); + } + } + fprintf(fplog, "\n"); + } + + /* Set and write start time */ + runtime_start(runtime); + print_date_and_time(fplog, cr->nodeid, "Started mdrun", runtime); + wallcycle_start(wcycle, ewcRUN); + if (fplog) + { + fprintf(fplog, "\n"); + } + + /* safest point to do file checkpointing is here. More general point would be immediately before integrator call */ +#ifdef GMX_FAHCORE + chkpt_ret = fcCheckPointParallel( cr->nodeid, + NULL, 0); + if (chkpt_ret == 0) + { + gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 ); + } +#endif + + debug_gmx(); + /*********************************************************** + * + * Loop over MD steps + * + ************************************************************/ + + /* if rerunMD then read coordinates and velocities from input trajectory */ + if (bRerunMD) + { + if (getenv("GMX_FORCE_UPDATE")) + { + bForceUpdate = TRUE; + } + + rerun_fr.natoms = 0; + if (MASTER(cr)) + { + bNotLastFrame = read_first_frame(oenv, &status, + opt2fn("-rerun", nfile, fnm), + &rerun_fr, TRX_NEED_X | TRX_READ_V); + if (rerun_fr.natoms != top_global->natoms) + { + gmx_fatal(FARGS, + "Number of atoms in trajectory (%d) does not match the " + "run input file (%d)\n", + rerun_fr.natoms, top_global->natoms); + } + if (ir->ePBC != epbcNONE) + { + if (!rerun_fr.bBox) + { + gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time); + } + if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong)) + { + gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time); + } + } + } + + if (PAR(cr)) + { + rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame); + } + + if (ir->ePBC != epbcNONE) + { + /* Set the shift vectors. 
+ * Necessary here when we have a static box different from the tpr box.
+ */
+ calc_shifts(rerun_fr.box, fr->shift_vec);
+ }
+ }
+
+ /* loop over MD steps or if rerunMD to end of input trajectory */
+ bFirstStep = TRUE;
+ /* Skip the first Nose-Hoover integration when we get the state from tpx */
+ bStateFromTPX = !bStateFromCP;
+ bInitStep = bFirstStep && (bStateFromTPX || bVV);
+ bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
+ bLastStep = FALSE;
+ bSumEkinhOld = FALSE;
+ bExchanged = FALSE;
+
+ init_global_signals(&gs, cr, ir, repl_ex_nst);
+
+ step = ir->init_step;
+ step_rel = 0;
+
+ if (ir->nstlist == -1)
+ {
+ init_nlistheuristics(&nlh, bGStatEveryStep, step);
+ }
+
+ if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
+ {
+ /* check how many steps are left in other sims */
+ multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps);
+ }
+
+
+ /* and stop now if we should */
+ bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
+ ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
+ while (!bLastStep || (bRerunMD && bNotLastFrame))
+ {
+
+ wallcycle_start(wcycle, ewcSTEP);
+
+ if (bRerunMD)
+ {
+ if (rerun_fr.bStep)
+ {
+ step = rerun_fr.step;
+ step_rel = step - ir->init_step;
+ }
+ if (rerun_fr.bTime)
+ {
+ t = rerun_fr.time;
+ }
+ else
+ {
+ t = step;
+ }
+ }
+ else
+ {
+ bLastStep = (step_rel == ir->nsteps);
+ t = t0 + step*ir->delta_t;
+ }
+
+ if (ir->efep != efepNO || ir->bSimTemp)
+ {
+ /* find and set the current lambdas. If rerunning, we either read in a state, or a lambda value,
+ requiring different logic. */
+
+ set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
+ bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
+ bDoFEP = (do_per_step(step, nstfep) && (ir->efep != efepNO));
+ bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
+ }
+
+ if (bSimAnn)
+ {
+ update_annealing_target_temp(&(ir->opts), t);
+ }
+
+ if (bRerunMD)
+ {
+ if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
+ {
+ for (i = 0; i < state_global->natoms; i++)
+ {
+ copy_rvec(rerun_fr.x[i], state_global->x[i]);
+ }
+ if (rerun_fr.bV)
+ {
+ for (i = 0; i < state_global->natoms; i++)
+ {
+ copy_rvec(rerun_fr.v[i], state_global->v[i]);
+ }
+ }
+ else
+ {
+ for (i = 0; i < state_global->natoms; i++)
+ {
+ clear_rvec(state_global->v[i]);
+ }
+ if (bRerunWarnNoV)
+ {
+ fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n"
+ " Ekin, temperature and pressure are incorrect,\n"
+ " the virial will be incorrect when constraints are present.\n"
+ "\n");
+ bRerunWarnNoV = FALSE;
+ }
+ }
+ }
+ copy_mat(rerun_fr.box, state_global->box);
+ copy_mat(state_global->box, state->box);
+
+ if (vsite && (Flags & MD_RERUN_VSITE))
+ {
+ if (DOMAINDECOMP(cr))
+ {
+ gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
+ }
+ if (graph)
+ {
+ /* Following is necessary because the graph may get out of sync
+ * with the coordinates if we only have every N'th coordinate set
+ */
+ mk_mshift(fplog, graph, fr->ePBC, state->box, state->x);
+ shift_self(graph, state->box, state->x);
+ }
+ construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
+ top->idef.iparams, top->idef.il,
+ fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+ if (graph)
+ {
+ unshift_self(graph, state->box, state->x);
+ }
+ }
+ }
+
+ /* Stop Center of Mass motion */
+ bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
+
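/* [Illustrative aside, not part of this patch] Most per-step decisions in this
 * loop (NS, COM removal, output, FEP) are driven by do_per_step(step, nst)
 * tests. The predicate is simply "every nst steps"; a simplified sketch of its
 * semantics (the real helper lives elsewhere in GROMACS): */
static int do_per_step_sketch(long step, long nst)
{
    return (nst > 0) && (step % nst == 0);   /* act on step 0, nst, 2*nst, ... */
}
/* e.g. a neighbour-search step would be do_per_step_sketch(step, nstlist) */

+ /* Copy back starting coordinates in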
case we're doing a forcefield scan */ + if (bFFscan) + { + for (ii = 0; (ii < state->natoms); ii++) + { + copy_rvec(xcopy[ii], state->x[ii]); + copy_rvec(vcopy[ii], state->v[ii]); + } + copy_mat(boxcopy, state->box); + } + + if (bRerunMD) + { + /* for rerun MD always do Neighbour Searching */ + bNS = (bFirstStep || ir->nstlist != 0); + bNStList = bNS; + } + else + { + /* Determine whether or not to do Neighbour Searching and LR */ + bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); + + bNS = (bFirstStep || bExchanged || bNStList || bDoFEP || + (ir->nstlist == -1 && nlh.nabnsb > 0)); + + if (bNS && ir->nstlist == -1) + { + set_nlistheuristics(&nlh, bFirstStep || bExchanged || bDoFEP, step); + } + } + + /* check whether we should stop because another simulation has + stopped. */ + if (MULTISIM(cr)) + { + if ( (multisim_nsteps >= 0) && (step_rel >= multisim_nsteps) && + (multisim_nsteps != ir->nsteps) ) + { + if (bNS) + { + if (MASTER(cr)) + { + fprintf(stderr, + "Stopping simulation %d because another one has finished\n", + cr->ms->sim); + } + bLastStep = TRUE; + gs.sig[eglsCHKPT] = 1; + } + } + } + + /* < 0 means stop at next step, > 0 means stop at next NS step */ - if ( (gs.set[eglsSTOPCOND] < 0 ) || - ( (gs.set[eglsSTOPCOND] > 0 ) && ( bNS || ir->nstlist == 0)) ) ++ if ( (gs.set[eglsSTOPCOND] < 0) || ++ ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) ) + { + bLastStep = TRUE; + } + + /* Determine whether or not to update the Born radii if doing GB */ + bBornRadii = bFirstStep; + if (ir->implicit_solvent && (step % ir->nstgbradii == 0)) + { + bBornRadii = TRUE; + } + + do_log = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep; + do_verbose = bVerbose && + (step % stepout == 0 || bFirstStep || bLastStep); + + if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD)) + { + if (bRerunMD) + { + bMasterState = TRUE; + } + else + { + bMasterState = FALSE; + /* Correct the new box if it is too skewed */ + if (DYNAMIC_BOX(*ir)) + { + if (correct_box(fplog, step, state->box, graph)) + { + bMasterState = TRUE; + } + } + if (DOMAINDECOMP(cr) && bMasterState) + { + dd_collect_state(cr->dd, state, state_global); + } + } + + if (DOMAINDECOMP(cr)) + { + /* Repartition the domain decomposition */ + wallcycle_start(wcycle, ewcDOMDEC); + dd_partition_system(fplog, step, cr, + bMasterState, nstglobalcomm, + state_global, top_global, ir, + state, &f, mdatoms, top, fr, + vsite, shellfc, constr, + nrnb, wcycle, + do_verbose && !bPMETuneRunning); + wallcycle_stop(wcycle, ewcDOMDEC); + /* If using an iterative integrator, reallocate space to match the decomposition */ + } + } + + if (MASTER(cr) && do_log && !bFFscan) + { + print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */ + } + + if (ir->efep != efepNO) + { + update_mdatoms(mdatoms, state->lambda[efptMASS]); + } + + if ((bRerunMD && rerun_fr.bV) || bExchanged) + { + + /* We need the kinetic energy at minus the half step for determining + * the full step kinetic energy and possibly for T-coupling.*/ + /* This may not be quite working correctly yet . . . . 
*/
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ }
+ clear_mat(force_vir);
+
+ /* Ionize the atoms if necessary */
+ if (bIonize)
+ {
+ ionize(fplog, oenv, mdatoms, top_global, t, ir, state->x, state->v,
+ mdatoms->start, mdatoms->start+mdatoms->homenr, state->box, cr);
+ }
+
+ /* Update force field in ffscan program */
+ if (bFFscan)
+ {
+ if (update_forcefield(fplog,
+ nfile, fnm, fr,
+ mdatoms->nr, state->x, state->box))
+ {
+ gmx_finalize_par();
+
+ exit(0);
+ }
+ }
+
+ /* We write a checkpoint at this MD step when:
+ * either at an NS step when we signalled through gs,
+ * or at the last step (but not when we do not want confout),
+ * but never at the first step or with rerun.
+ */
+ bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
+ (bLastStep && (Flags & MD_CONFOUT))) &&
+ step > ir->init_step && !bRerunMD);
+ if (bCPT)
+ {
+ gs.set[eglsCHKPT] = 0;
+ }
+
+ /* Determine the energy and pressure:
+ * at nstcalcenergy steps and at energy output steps (set below).
+ */
+ if (EI_VV(ir->eI) && (!bInitStep))
+ {
+ /* for vv, the first half of the integration actually corresponds
+ to the previous step. bCalcEner is only required to be evaluated on the 'next' step,
+ but the virial needs to be calculated on both the current step and the 'next' step. Future
+ reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */
+
+ bCalcEner = do_per_step(step-1, ir->nstcalcenergy);
+ bCalcVir = bCalcEner ||
+ (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
+ }
+ else
+ {
+ bCalcEner = do_per_step(step, ir->nstcalcenergy);
+ bCalcVir = bCalcEner ||
+ (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
+ }
+
+ /* Do we need global communication ? */
+ bGStat = (bCalcVir || bCalcEner || bStopCM ||
+ do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) ||
+ (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
+
+ do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
+
+ if (do_ene || do_log)
+ {
+ bCalcVir = TRUE;
+ bCalcEner = TRUE;
+ bGStat = TRUE;
+ }
+
+ /* these CGLO_ options remain the same throughout the iteration */
+ cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
+ (bGStat ? CGLO_GSTAT : 0)
+ );
+
+ force_flags = (GMX_FORCE_STATECHANGED |
+ ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
+ GMX_FORCE_ALLFORCES |
+ GMX_FORCE_SEPLRF |
+ (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+ (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+ (bDoFEP ? GMX_FORCE_DHDL : 0)
+ );
+
+ if (fr->bTwinRange)
+ {
+ if (do_per_step(step, ir->nstcalclr))
+ {
+ force_flags |= GMX_FORCE_DO_LR;
+ }
+ }
+
+ if (shellfc)
+ {
+ /* Now is the time to relax the shells */
+ count = relax_shell_flexcon(fplog, cr, bVerbose, bFFscan ? step+1 : step,
+ ir, bNS, force_flags,
+ bStopCM, top, top_global,
+ constr, enerd, fcd,
+ state, f, force_vir, mdatoms,
+ nrnb, wcycle, graph, groups,
+ shellfc, fr, bBornRadii, t, mu_tot,
+ state->natoms, &bConverged, vsite,
+ outf->fp_field);
+ tcount += count;
+
+ if (bConverged)
+ {
+ nconverged++;
+ }
+ }
+ else
+ {
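/* [Illustrative aside, not part of this patch] force_flags above is assembled
 * as a bitmask so that one integer tells do_force() what to compute this step.
 * The pattern, with hypothetical flag values (the real GMX_FORCE_* constants
 * are defined elsewhere): */
enum {
    FLAG_VIRIAL = 1<<0,  /* accumulate the virial this step   */
    FLAG_ENERGY = 1<<1,  /* accumulate energy terms this step */
    FLAG_DHDL   = 1<<2   /* accumulate dH/dlambda this step   */
};

static int build_flags(int bCalcVir, int bCalcEner, int bDoFEP)
{
    return (bCalcVir  ? FLAG_VIRIAL : 0) |
           (bCalcEner ? FLAG_ENERGY : 0) |
           (bDoFEP    ? FLAG_DHDL   : 0);
}
/* consumers then test individual bits: if (flags & FLAG_ENERGY) { ... } */

+ /* The coordinates (x) are shifted (to get whole molecules)
+ * in do_force.
+ * This is parallelized as well, and does communication too.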
+ * Check comments in sim_util.c + */ + do_force(fplog, cr, ir, step, nrnb, wcycle, top, top_global, groups, + state->box, state->x, &state->hist, + f, force_vir, mdatoms, enerd, fcd, + state->lambda, graph, + fr, vsite, mu_tot, t, outf->fp_field, ed, bBornRadii, + (bNS ? GMX_FORCE_NS : 0) | force_flags); + } + + if (bTCR) + { + mu_aver = calc_mu_aver(cr, state->x, mdatoms->chargeA, + mu_tot, &top_global->mols, mdatoms, gnx, grpindex); + } + + if (bTCR && bFirstStep) + { + tcr = init_coupling(fplog, nfile, fnm, cr, fr, mdatoms, &(top->idef)); + fprintf(fplog, "Done init_coupling\n"); + fflush(fplog); + } + + if (bVV && !bStartingFromCpt && !bRerunMD) + /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */ + { + if (ir->eI == eiVV && bInitStep) + { + /* if using velocity verlet with full time step Ekin, + * take the first half step only to compute the + * virial for the first step. From there, + * revert back to the initial coordinates + * so that the input is actually the initial step. + */ + copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */ + } + else + { + /* this is for NHC in the Ekin(t+dt/2) version of vv */ + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1); + } + + /* If we are using twin-range interactions where the long-range component + * is only evaluated every nstcalclr>1 steps, we should do a special update + * step to combine the long-range forces on these steps. + * For nstcalclr=1 this is not done, since the forces would have been added + * directly to the short-range forces already. + */ + bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr)); + + update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, + f, bUpdateDoLR, fr->f_twin, fcd, + ekind, M, wcycle, upd, bInitStep, etrtVELOCITY1, + cr, nrnb, constr, &top->idef); + + if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep) + { + gmx_iterate_init(&iterate, TRUE); + } + /* for iterations, we save these vectors, as we will be self-consistently iterating + the calculations */ + + /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */ + + /* save the state */ + if (iterate.bIterationActive) + { + copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts)); + } + + bFirstIterate = TRUE; + while (bFirstIterate || iterate.bIterationActive) + { + if (iterate.bIterationActive) + { + copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts)); + if (bFirstIterate && bTrotter) + { + /* The first time through, we need a decent first estimate + of veta(t+dt) to compute the constraints. Do + this by computing the box volume part of the + trotter integration at this time. Nothing else + should be changed by this routine here. If + !(first time), we start with the previous value + of veta. */ + + veta_save = state->veta; + trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0); + vetanew = state->veta; + state->veta = veta_save; + } + } + + bOK = TRUE; + if (!bRerunMD || rerun_fr.bV || bForceUpdate) /* Why is rerun_fr.bV here? Unclear. 
*/
+ {
+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+ state, fr->bMolPBC, graph, f,
+ &top->idef, shake_vir, NULL,
+ cr, nrnb, wcycle, upd, constr,
+ bInitStep, TRUE, bCalcVir, vetanew);
+
+ if (!bOK && !bFFscan)
+ {
+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constraints");
+ }
+
+ }
+ else if (graph)
+ {
+ /* Need to unshift here if a do_force has been
+ called in the previous step */
+ unshift_self(graph, state->box, state->x);
+ }
+
+ /* if VV, compute the pressure and constraints */
+ /* For VV2, we strictly only need this if using pressure
+ * control, but we really would like to have accurate pressures
+ * printed out.
+ * Think about ways around this in the future?
+ * For now, keep this choice in comments.
+ */
+ /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
+ /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
+ bPres = TRUE;
+ bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+ if (bCalcEner && ir->eI == eiVVAK) /*MRS: 7/9/2010 -- this still doesn't fix it?*/
+ {
+ bSumEkinhOld = TRUE;
+ }
+ /* for vv, the first half of the integration actually corresponds to the previous step.
+ So we need information from the last step in the first half of the integration */
+ if (bGStat || do_per_step(step-1, nstglobalcomm))
+ {
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ cglo_flags
+ | CGLO_ENERGY
+ | (bTemp ? CGLO_TEMPERATURE : 0)
+ | (bPres ? CGLO_PRESSURE : 0)
+ | (bPres ? CGLO_CONSTRAINT : 0)
+ | ((iterate.bIterationActive) ? CGLO_ITERATE : 0)
+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+ | CGLO_SCALEEKIN
+ );
+ /* explanation of above:
+ a) We compute Ekin at the full time step
+ if 1) we are using the AveVel Ekin, and it's not the
+ initial step, or 2) if we are using AveEkin, but need the full
+ time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+ b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+ EkinAveVel because it's needed for the pressure */
+ }
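/* [Illustrative aside, not part of this patch] The trotter_update() calls in
 * this region apply individual factors of a symmetric Trotter splitting of the
 * Liouville operator (the ettTSEQ* sequences select which factors run when).
 * Schematically, in the style of the MTK/Tuckerman schemes: */

    e^{iL\,\Delta t} \approx
        e^{iL_{\mathrm{T}}\,\Delta t/2}\;
        e^{iL_{v}\,\Delta t/2}\;
        e^{iL_{x}\,\Delta t}\;
        e^{iL_{v}\,\Delta t/2}\;
        e^{iL_{\mathrm{T}}\,\Delta t/2}

/* where iL_x propagates positions, iL_v the velocities, and iL_T the
 * thermostat/barostat variables; each exponential factor corresponds to one
 * update call in the code below. */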
+ /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+ if (!bInitStep)
+ {
+ if (bTrotter)
+ {
+ m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
+ }
+ else
+ {
+ if (bExchanged)
+ {
+
+ /* We need the kinetic energy at minus the half step for determining
+ * the full step kinetic energy and possibly for T-coupling.*/
+ /* This may not be quite working correctly yet . . . .
+ */
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ }
+ }
+ }
+
+ if (iterate.bIterationActive &&
+ done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+ state->veta, &vetanew))
+ {
+ break;
+ }
+ bFirstIterate = FALSE;
+ }
+
+ if (bTrotter && !bInitStep)
+ {
+ copy_mat(shake_vir, state->svir_prev);
+ copy_mat(force_vir, state->fvir_prev);
+ if (IR_NVT_TROTTER(ir) && ir->eI == eiVV)
+ {
+ /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
+ enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE, FALSE);
+ enerd->term[F_EKIN] = trace(ekind->ekin);
+ }
+ }
+ /* if it's the initial step, we performed this first step just to get the constraint virial */
+ if (bInitStep && ir->eI == eiVV)
+ {
+ copy_rvecn(cbuf, state->v, 0, state->natoms);
+ }
+ }
+
+ /* MRS -- now done iterating -- compute the conserved quantity */
+ if (bVV)
+ {
+ saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ);
+ if (ir->eI == eiVV)
+ {
+ last_ekin = enerd->term[F_EKIN];
+ }
+ if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
+ {
+ saved_conserved_quantity -= enerd->term[F_DISPCORR];
+ }
+ /* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */
+ if (!bRerunMD)
+ {
+ sum_dhdl(enerd, state->lambda, ir->fepvals);
+ }
+ }
+
+ /* ######## END FIRST UPDATE STEP ############## */
+ /* ######## If doing VV, we now have v(dt) ###### */
+ if (bDoExpanded)
+ {
+ /* perform extended ensemble sampling in lambda - we don't
+ actually move to the new state before outputting
+ statistics, but if performing simulated tempering, we
+ do update the velocities and the tau_t. */
+
+ lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, &df_history, step, mcrng, state->v, mdatoms);
+ }
+ /* ################## START TRAJECTORY OUTPUT ################# */
+
+ /* Now we have the energies and forces corresponding to the
+ * coordinates at time t. We must output all of this before
+ * the update.
+ * for RerunMD t is read from input trajectory
+ */
+ mdof_flags = 0;
+ if (do_per_step(step, ir->nstxout))
+ {
+ mdof_flags |= MDOF_X;
+ }
+ if (do_per_step(step, ir->nstvout))
+ {
+ mdof_flags |= MDOF_V;
+ }
+ if (do_per_step(step, ir->nstfout))
+ {
+ mdof_flags |= MDOF_F;
+ }
+ if (do_per_step(step, ir->nstxtcout))
+ {
+ mdof_flags |= MDOF_XTC;
+ }
+ if (bCPT)
+ {
+ mdof_flags |= MDOF_CPT;
+ }
+
+#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
+ if (bLastStep)
+ {
+ /* Enforce writing positions and velocities at end of run */
+ mdof_flags |= (MDOF_X | MDOF_V);
+ }
+#endif
+#ifdef GMX_FAHCORE
+ if (MASTER(cr))
+ {
+ fcReportProgress( ir->nsteps, step );
+ }
+
+ /* sync bCPT and fc record-keeping */
+ if (bCPT && MASTER(cr))
+ {
+ fcRequestCheckPoint();
+ }
+#endif
+
+ if (mdof_flags != 0)
+ {
+ wallcycle_start(wcycle, ewcTRAJ);
+ if (bCPT)
+ {
+ if (state->flags & (1<<estLD_RNG))
+ {
+ get_stochd_state(upd, state);
+ }
+ if (state->flags & (1<<estMC_RNG))
+ {
+ get_mc_state(mcrng, state);
+ }
+ if (MASTER(cr))
+ {
+ if (bSumEkinhOld)
+ {
+ state_global->ekinstate.bUpToDate = FALSE;
+ }
+ else
+ {
+ update_ekinstate(&state_global->ekinstate, ekind);
+ state_global->ekinstate.bUpToDate = TRUE;
+ }
+ update_energyhistory(&state_global->enerhist, mdebin);
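/* [Illustrative aside, not part of this patch] When bCPT is set, the RNG and
 * kinetic-energy state are pulled into state_global before write_traj() emits
 * the checkpoint. The core pattern is an atomic snapshot: serialize to a
 * temporary file, then rename. A sketch with a hypothetical state struct: */
#include <stdio.h>

typedef struct { long step; double t; unsigned int rng[2]; } chk_t;

static int write_checkpoint_sketch(const char *fn, const chk_t *s)
{
    char  tmp[1024];
    FILE *fp;

    snprintf(tmp, sizeof(tmp), "%s.tmp", fn);
    fp = fopen(tmp, "wb");
    if (fp == NULL)
    {
        return -1;
    }
    if (fwrite(s, sizeof(*s), 1, fp) != 1)
    {
        fclose(fp);
        return -1;               /* incomplete snapshot is never installed */
    }
    fclose(fp);
    return rename(tmp, fn);      /* atomic replace on POSIX filesystems    */
}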
+ if (ir->efep != efepNO || ir->bSimTemp)
+ {
+ state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
+ structured so this isn't necessary.
+ Note this reassignment is only necessary
+ for single threads.*/
+ copy_df_history(&state_global->dfhist, &df_history);
+ }
+ }
+ }
+ write_traj(fplog, cr, outf, mdof_flags, top_global,
+ step, t, state, state_global, f, f_global, &n_xtc, &x_xtc);
+ if (bCPT)
+ {
+ nchkpt++;
+ bCPT = FALSE;
+ }
+ debug_gmx();
+ if (bLastStep && step_rel == ir->nsteps &&
+ (Flags & MD_CONFOUT) && MASTER(cr) &&
+ !bRerunMD && !bFFscan)
+ {
+ /* x and v have been collected in write_traj,
+ * because a checkpoint file will always be written
+ * at the last step.
+ */
+ fprintf(stderr, "\nWriting final coordinates.\n");
+ if (fr->bMolPBC)
+ {
+ /* Make molecules whole only for confout writing */
+ do_pbc_mtop(fplog, ir->ePBC, state->box, top_global, state_global->x);
+ }
+ write_sto_conf_mtop(ftp2fn(efSTO, nfile, fnm),
+ *top_global->name, top_global,
+ state_global->x, state_global->v,
+ ir->ePBC, state->box);
+ debug_gmx();
+ }
+ wallcycle_stop(wcycle, ewcTRAJ);
+ }
+
+ /* kludge -- virial is lost with restart for NPT control. Must restart */
+ if (bStartingFromCpt && bVV)
+ {
+ copy_mat(state->svir_prev, shake_vir);
+ copy_mat(state->fvir_prev, force_vir);
+ }
+ /* ################## END TRAJECTORY OUTPUT ################ */
+
+ /* Determine the wallclock run time up till now */
+ run_time = gmx_gettime() - (double)runtime->real;
+
+ /* Check whether everything is still allright */
+ if (((int)gmx_get_stop_condition() > handled_stop_condition)
+#ifdef GMX_THREAD_MPI
+ && MASTER(cr)
+#endif
+ )
+ {
+ /* this is just to make gs.sig compatible with the hack
+ of sending signals around by MPI_Reduce together with
+ other floats */
+ if (gmx_get_stop_condition() == gmx_stop_cond_next_ns)
+ {
+ gs.sig[eglsSTOPCOND] = 1;
+ }
+ if (gmx_get_stop_condition() == gmx_stop_cond_next)
+ {
+ gs.sig[eglsSTOPCOND] = -1;
+ }
+ /* < 0 means stop at next step, > 0 means stop at next NS step */
+ if (fplog)
+ {
+ fprintf(fplog,
+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+ gmx_get_signal_name(),
+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
+ fflush(fplog);
+ }
+ fprintf(stderr,
+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+ gmx_get_signal_name(),
+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
+ fflush(stderr);
+ handled_stop_condition = (int)gmx_get_stop_condition();
+ }
+ else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
+ (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
+ gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
+ {
+ /* Signal to terminate the run */
+ gs.sig[eglsSTOPCOND] = 1;
+ if (fplog)
+ {
+ fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+ }
+ fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+ }
+
+ if (bResetCountersHalfMaxH && MASTER(cr) &&
+ run_time > max_hours*60.0*60.0*0.495)
+ {
+ gs.sig[eglsRESETCOUNTERS] = 1;
+ }
+
+ if (ir->nstlist == -1 && !bRerunMD)
+ {
+ /* When bGStatEveryStep=FALSE, global_stat is only called
+ * when we check the atom displacements, not at NS steps.
+ * This means that also the bonded interaction count check is not
+ * performed immediately after NS. Therefore a few MD steps could
+ * be performed with missing interactions.
+ * But wrong energies are never written to file,
+ * since energies are only written after global_stat
+ * has been called.
+ */
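/* [Illustrative aside, not part of this patch] As the comment above notes, the
 * stop/checkpoint signals piggy-back on the same floating-point global sum
 * that reduces the energies, so no extra communication round is needed. The
 * encode/decode side of that trick, without MPI (conventions assumed): */
static void encode_signals(const int *sig, double *buf, int nsig)
{
    int i;
    for (i = 0; i < nsig; i++)
    {
        buf[i] = (double)sig[i];   /* append signals to the reduction buffer */
    }
}

static void decode_signals(const double *buf, int *set, int nsig)
{
    int i;
    for (i = 0; i < nsig; i++)
    {
        /* after summing over ranks, any rank signalling +/-1 shows up here */
        set[i] = (buf[i] > 0.5) ? 1 : (buf[i] < -0.5 ? -1 : 0);
    }
}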
+ if (step >= nlh.step_nscheck)
+ {
+ nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs,
+ nlh.scale_tot, state->x);
+ }
+ else
+ {
+ /* This is not necessarily true,
+ * but step_nscheck is determined quite conservatively.
+ */
+ nlh.nabnsb = 0;
+ }
+ }
+
+ /* In parallel we only have to check for checkpointing in steps
+ * where we do global communication,
+ * otherwise the other nodes don't know.
+ */
+ if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
+ cpt_period >= 0 &&
+ (cpt_period == 0 ||
+ run_time >= nchkpt*cpt_period*60.0)) &&
+ gs.set[eglsCHKPT] == 0)
+ {
+ gs.sig[eglsCHKPT] = 1;
+ }
+
+ /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */
+ if (EI_VV(ir->eI))
+ {
+ if (!bInitStep)
+ {
+ update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
+ }
+ if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+ {
+ gmx_bool bIfRandomize;
+ bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr);
+ /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+ if (constr && bIfRandomize)
+ {
+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+ state, fr->bMolPBC, graph, f,
+ &top->idef, tmp_vir, NULL,
+ cr, nrnb, wcycle, upd, constr,
+ bInitStep, TRUE, bCalcVir, vetanew);
+ }
+ }
+ }
+
+ if (bIterativeCase && do_per_step(step, ir->nstpcouple))
+ {
+ gmx_iterate_init(&iterate, TRUE);
+ /* for iterations, we save these vectors, as we will be redoing the calculations */
+ copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
+ }
+
+ bFirstIterate = TRUE;
+ while (bFirstIterate || iterate.bIterationActive)
+ {
+ /* We now restore these vectors to redo the calculation with improved extended variables */
+ if (iterate.bIterationActive)
+ {
+ copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
+ }
+
+ /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
+ so scroll down for that logic */
+
+ /* ######### START SECOND UPDATE STEP ################# */
+ /* Box is changed in update() when we do pressure coupling,
+ * but we should still use the old box for energy corrections and when
+ * writing it to the energy file, so it matches the trajectory files for
+ * the same timestep above. Make a copy in a separate array.
+ */
+ copy_mat(state->box, lastbox);
+
+ bOK = TRUE;
+ dvdl_constr = 0;
+
+ if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
+ {
+ wallcycle_start(wcycle, ewcUPDATE);
+ /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+ if (bTrotter)
+ {
+ if (iterate.bIterationActive)
+ {
+ if (bFirstIterate)
+ {
+ scalevir = 1;
+ }
+ else
+ {
+ /* we use a new value of scalevir to converge the iterations faster */
+ scalevir = tracevir/trace(shake_vir);
+ }
+ msmul(shake_vir, scalevir, shake_vir);
+ m_add(force_vir, shake_vir, total_vir);
+ clear_mat(shake_vir);
+ }
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+ /* We can only do Berendsen coupling after we have summed
+ * the kinetic energy or virial. Since this happens
+ * in global_state after update, we should only do it at
+ * step % nstlist = 1 with bGStatEveryStep=FALSE.
+ */
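/* [Illustrative aside, not part of this patch] update_tcouple() applies the
 * chosen thermostat; for Berendsen weak coupling the per-step velocity
 * scaling factor is the textbook expression (sketch, no GROMACS types): */
#include <math.h>

static double berendsen_lambda(double T_inst, double T_ref, double dt, double tau)
{
    /* lambda = sqrt(1 + (dt/tau) * (T_ref/T_inst - 1)) */
    if (T_inst <= 0)
    {
        return 1.0;   /* guard against an undefined instantaneous T */
    }
    return sqrt(1.0 + (dt/tau)*(T_ref/T_inst - 1.0));
}
/* velocities are then scaled as v[i] *= lambda on each coupling step */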
+ }
+ else
+ {
+ update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
+ update_pcouple(fplog, step, ir, state, pcoupl_mu, M, wcycle,
+ upd, bInitStep);
+ }
+
+ if (bVV)
+ {
+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+ /* velocity half-step update */
+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+ bUpdateDoLR, fr->f_twin, fcd,
+ ekind, M, wcycle, upd, FALSE, etrtVELOCITY2,
+ cr, nrnb, constr, &top->idef);
+ }
+
+ /* Above, initialize just copies ekinh into ekin,
+ * it doesn't copy position (for VV),
+ * and entire integrator for MD.
+ */
+
+ if (ir->eI == eiVVAK)
+ {
+ copy_rvecn(state->x, cbuf, 0, state->natoms);
+ }
+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+ bUpdateDoLR, fr->f_twin, fcd,
+ ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+ wallcycle_stop(wcycle, ewcUPDATE);
+
+ update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state,
+ fr->bMolPBC, graph, f,
+ &top->idef, shake_vir, force_vir,
+ cr, nrnb, wcycle, upd, constr,
+ bInitStep, FALSE, bCalcVir, state->veta);
+
+ if (ir->eI == eiVVAK)
+ {
+ /* erase F_EKIN and F_TEMP here? */
+ /* just compute the kinetic energy at the half step to perform a trotter step */
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, NULL, FALSE, lastbox,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ cglo_flags | CGLO_TEMPERATURE
+ );
+ wallcycle_start(wcycle, ewcUPDATE);
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+ /* now we know the scaling, we can compute the positions again */
+ copy_rvecn(cbuf, state->x, 0, state->natoms);
+
+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+ bUpdateDoLR, fr->f_twin, fcd,
+ ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+ wallcycle_stop(wcycle, ewcUPDATE);
+
+ /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+ /* are the small terms in the shake_vir here due
+ * to numerical errors, or are they important
+ * physically? I'm thinking they are just errors, but not completely sure.
+ * For now, will call without actually constraining, constr=NULL*/
+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+ state, fr->bMolPBC, graph, f,
+ &top->idef, tmp_vir, force_vir,
+ cr, nrnb, wcycle, upd, NULL,
+ bInitStep, FALSE, bCalcVir,
+ state->veta);
+ }
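/* [Illustrative aside, not part of this patch] update_constraints() above is
 * where SHAKE/LINCS/SETTLE enforce the bond constraints after the position
 * update. For a single bond the SHAKE correction can be written directly
 * (sketch in 1-D for brevity, all names assumed): */
static void shake_bond_1d(double *x1, double *x2, double m1, double m2,
                          double d0, double tol, int maxit)
{
    int it;
    for (it = 0; it < maxit; it++)
    {
        double d    = *x2 - *x1;
        double diff = d*d - d0*d0;              /* constraint violation      */
        double g;
        if (diff > -tol && diff < tol)
        {
            break;                              /* converged                 */
        }
        g    = diff/(2.0*d*(1.0/m1 + 1.0/m2));  /* Lagrange-multiplier step  */
        *x1 += g/m1;                            /* displace in proportion to */
        *x2 -= g/m2;                            /* the inverse masses        */
    }
}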
+ if (!bOK && !bFFscan)
+ {
+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constraints");
+ }
+
+ if (fr->bSepDVDL && fplog && do_log)
+ {
+ fprintf(fplog, sepdvdlformat, "Constraint dV/dl", 0.0, dvdl_constr);
+ }
+ if (bVV)
+ {
+ /* this factor of 2 correction is necessary
+ because half of the constraint force is removed
+ in the vv step, so we have to double it. See
+ the Redmine issue #1255. It is not yet clear
+ if the factor of 2 is exact, or just a very
+ good approximation, and this will be
+ investigated. The next step is to see if this
+ can be done adding a dhdl contribution from the
+ rattle step, but this is somewhat more
+ complicated with the current code. Will be
+ investigated, hopefully for 4.6.3. However,
+ this current solution is much better than
+ having it completely wrong.
+ */
+ enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
+ }
+ else
+ {
+ enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+ }
+ }
+ else if (graph)
+ {
+ /* Need to unshift here */
+ unshift_self(graph, state->box, state->x);
+ }
+
+ if (vsite != NULL)
+ {
+ wallcycle_start(wcycle, ewcVSITECONSTR);
+ if (graph != NULL)
+ {
+ shift_self(graph, state->box, state->x);
+ }
+ construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
+ top->idef.iparams, top->idef.il,
+ fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+
+ if (graph != NULL)
+ {
+ unshift_self(graph, state->box, state->x);
+ }
+ wallcycle_stop(wcycle, ewcVSITECONSTR);
+ }
+
+ /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */
+ /* With Leap-Frog we can skip compute_globals at
+ * non-communication steps, but we need to calculate
+ * the kinetic energy one step before communication.
+ */
+ if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+ {
+ if (ir->nstlist == -1 && bFirstIterate)
+ {
+ gs.sig[eglsNABNSB] = nlh.nabnsb;
+ }
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr,
+ bFirstIterate ? &gs : NULL,
+ (step_rel % gs.nstms == 0) &&
+ (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
+ lastbox,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ cglo_flags
+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
+ | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+ | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
+ | (iterate.bIterationActive ? CGLO_ITERATE : 0)
+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+ | CGLO_CONSTRAINT
+ );
+ if (ir->nstlist == -1 && bFirstIterate)
+ {
+ nlh.nabnsb = gs.set[eglsNABNSB];
+ gs.set[eglsNABNSB] = 0;
+ }
+ }
+ /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
+ /* ############# END CALC EKIN AND PRESSURE ################# */
+
+ /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+ the virial that should probably be addressed eventually. state->veta has better properties,
+ but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+ generate the new shake_vir, but test the veta value for convergence. This will take some thought. */
+
+ if (iterate.bIterationActive &&
+ done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+ trace(shake_vir), &tracevir))
+ {
+ break;
+ }
+ bFirstIterate = FALSE;
+ }
+
+ if (!bVV || bRerunMD)
+ {
+ /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
+ sum_dhdl(enerd, state->lambda, ir->fepvals);
+ }
+ update_box(fplog, step, ir, mdatoms, state, graph, f,
+ ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, wcycle, upd, bInitStep, FALSE);
+
+ /* ################# END UPDATE STEP 2 ################# */
+ /* #### We now have r(t+dt) and v(t+dt/2) ############# */
+
+ /* The coordinates (x) were unshifted in update */
+ if (bFFscan && (shellfc == NULL || bConverged))
+ {
+ if (print_forcefield(fplog, enerd->term, mdatoms->homenr,
+ f, NULL, xcopy,
+ &(top_global->mols), mdatoms->massT, pres))
+ {
+ gmx_finalize_par();
+
+ fprintf(stderr, "\n");
+ exit(0);
+ }
+ }
+ if (!bGStat)
+ {
+ /* We will not sum ekinh_old,
+ * so signal that we still have to do it.
+        /* bIterate is set to keep it from eliminating the old kinetic energy (ekinh) terms */
+        /* #############  END CALC EKIN AND PRESSURE ################# */
+
+        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+           the virial that should probably be addressed eventually. state->veta has better properties,
+           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+           generate the new shake_vir, but test the veta value for convergence. This will take some thought. */
+
+        if (iterate.bIterationActive &&
+            done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+                           trace(shake_vir), &tracevir))
+        {
+            break;
+        }
+        bFirstIterate = FALSE;
+    }
+
+    if (!bVV || bRerunMD)
+    {
+        /* Sum up the foreign energy and dhdl terms for md and sd.
+           Currently done every step so that dhdl is correct in the .edr */
+        sum_dhdl(enerd, state->lambda, ir->fepvals);
+    }
+    update_box(fplog, step, ir, mdatoms, state, graph, f,
+               ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, wcycle, upd, bInitStep, FALSE);
+
+    /* ################# END UPDATE STEP 2 ################# */
+    /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
+
+    /* The coordinates (x) were unshifted in update */
+    if (bFFscan && (shellfc == NULL || bConverged))
+    {
+        if (print_forcefield(fplog, enerd->term, mdatoms->homenr,
+                             f, NULL, xcopy,
+                             &(top_global->mols), mdatoms->massT, pres))
+        {
+            gmx_finalize_par();
+
+            fprintf(stderr, "\n");
+            exit(0);
+        }
+    }
+    if (!bGStat)
+    {
+        /* We will not sum ekinh_old,
+         * so signal that we still have to do it.
+         */
+        bSumEkinhOld = TRUE;
+    }
+
+    if (bTCR)
+    {
+        /* Only do GCT when the relaxation of shells (minimization) has converged,
+         * otherwise we might be coupling to bogus energies.
+         * In parallel we must always do this, because the other sims might
+         * update the FF.
+         */
+
+        /* Since this is called with the new coordinates state->x, I assume
+         * we want the new box state->box too. / EL 20040121
+         */
+        do_coupling(fplog, oenv, nfile, fnm, tcr, t, step, enerd->term, fr,
+                    ir, MASTER(cr),
+                    mdatoms, &(top->idef), mu_aver,
+                    top_global->mols.nr, cr,
+                    state->box, total_vir, pres,
+                    mu_tot, state->x, f, bConverged);
+        debug_gmx();
+    }
+
+    /* #########  BEGIN PREPARING EDR OUTPUT  ########### */
+
+    /* use the velocity determined directly at the last step, not the average over the half steps */
+    if (bTrotter && ir->eI == eiVV)
+    {
+        enerd->term[F_EKIN] = last_ekin;
+    }
+    enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+
+    if (bVV)
+    {
+        enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+    }
+    else
+    {
+        enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ);
+    }
+    /* Check for excessively large energies */
+    if (bIonize)
+    {
+#ifdef GMX_DOUBLE
+        real etot_max = 1e200;
+#else
+        real etot_max = 1e30;
+#endif
+        if (fabs(enerd->term[F_ETOT]) > etot_max)
+        {
+            fprintf(stderr, "Energy too large (%g), giving up\n",
+                    enerd->term[F_ETOT]);
+        }
+    }
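+
+    /* The two thresholds above sit far below the floating-point limits
+     * (FLT_MAX is about 3.4e38, DBL_MAX about 1.8e308), so a total energy
+     * beyond them flags a blown-up system well before the value degenerates
+     * to Inf or NaN; in single precision the test is effectively
+     *     fabs(enerd->term[F_ETOT]) > 1e30
+     */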
+    /* #########  END PREPARING EDR OUTPUT  ########### */
+
+    /* Time for performance */
+    if (((step % stepout) == 0) || bLastStep)
+    {
+        runtime_upd_proc(runtime);
+    }
+
+    /* Output stuff */
+    if (MASTER(cr))
+    {
+        gmx_bool do_dr, do_or;
+
+        if (fplog && do_log && bDoExpanded)
+        {
+            /* only needed if doing expanded ensemble */
+            PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL,
+                                      &df_history, state->fep_state, ir->nstlog, step);
+        }
+        if (!(bStartingFromCpt && (EI_VV(ir->eI))))
+        {
+            if (bCalcEner)
+            {
+                upd_mdebin(mdebin, bDoDHDL, TRUE,
+                           t, mdatoms->tmass, enerd, state,
+                           ir->fepvals, ir->expandedvals, lastbox,
+                           shake_vir, force_vir, total_vir, pres,
+                           ekind, mu_tot, constr);
+            }
+            else
+            {
+                upd_mdebin_step(mdebin);
+            }
+
+            do_dr = do_per_step(step, ir->nstdisreout);
+            do_or = do_per_step(step, ir->nstorireout);
+
+            print_ebin(outf->fp_ene, do_ene, do_dr, do_or, do_log ? fplog : NULL,
+                       step, t,
+                       eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts));
+        }
+        if (ir->ePull != epullNO)
+        {
+            pull_print_output(ir->pull, step, t);
+        }
+
+        if (do_per_step(step, ir->nstlog))
+        {
+            if (fflush(fplog) != 0)
+            {
+                gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
+            }
+        }
+    }
+    if (bDoExpanded)
+    {
+        /* Have to do this part after outputting the logfile and the edr file */
+        state->fep_state = lamnew;
+        for (i = 0; i < efptNR; i++)
+        {
+            state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
+        }
+    }
+    /* Remaining runtime */
+    if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+    {
+        if (shellfc)
+        {
+            fprintf(stderr, "\n");
+        }
+        print_time(stderr, runtime, step, ir, cr);
+    }
+
+    /* Replica exchange */
+    bExchanged = FALSE;
+    if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
+        do_per_step(step, repl_ex_nst))
+    {
+        bExchanged = replica_exchange(fplog, cr, repl_ex,
+                                      state_global, enerd,
+                                      state, step, t);
+
+        if (bExchanged && DOMAINDECOMP(cr))
+        {
+            dd_partition_system(fplog, step, cr, TRUE, 1,
+                                state_global, top_global, ir,
+                                state, &f, mdatoms, top, fr,
+                                vsite, shellfc, constr,
+                                nrnb, wcycle, FALSE);
+        }
+    }
+
+    bFirstStep       = FALSE;
+    bInitStep        = FALSE;
+    bStartingFromCpt = FALSE;
+
+    /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
+    /* With all integrators, except VV, we need to retain the pressure
+     * at the current step for coupling at the next step.
+     */
+    if ((state->flags & (1<<estPRES_PREV)) &&
+        (bGStatEveryStep ||
+         (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+    {
+        /* Store the pressure in t_state for pressure coupling
+         * at the next MD step.
+         */
+        copy_mat(pres, state->pres_prev);
+    }
+
+    /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
+
+    if ((membed != NULL) && (!bLastStep))
+    {
+        rescale_membed(step_rel, membed, state_global->x);
+    }
+
+    if (bRerunMD)
+    {
+        if (MASTER(cr))
+        {
+            /* read next frame from input trajectory */
+            bNotLastFrame = read_next_frame(oenv, status, &rerun_fr);
+        }
+
+        if (PAR(cr))
+        {
+            rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
+        }
+    }
+
+    if (!bRerunMD || !rerun_fr.bStep)
+    {
+        /* increase the MD step number */
+        step++;
+        step_rel++;
+    }
+
+    cycles = wallcycle_stop(wcycle, ewcSTEP);
+    if (DOMAINDECOMP(cr) && wcycle)
+    {
+        dd_cycles_add(cr->dd, cycles, ddCyclStep);
+    }
+
+    if (bPMETuneRunning || bPMETuneTry)
+    {
+        /* PME grid + cut-off optimization with GPUs or PME nodes */
+
+        /* Count the total cycles over the last steps */
+        cycles_pmes += cycles;
+
+        /* We can only switch cut-off at NS steps */
+        if (step % ir->nstlist == 0)
+        {
+            if (bPMETuneTry)
+            {
+                if (DDMASTER(cr->dd))
+                {
+                    /* PME node load is too high, start tuning */
+                    bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
+                }
+                dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
+
+                if (bPMETuneRunning || step_rel > ir->nstlist*50)
+                {
+                    bPMETuneTry = FALSE;
+                }
+            }
+            if (bPMETuneRunning)
+            {
+                /* init_step might not be a multiple of nstlist,
+                 * but the first cycle is always skipped anyhow.
+                 */
+                bPMETuneRunning =
+                    pme_load_balance(pme_loadbal, cr,
+                                     (bVerbose && MASTER(cr)) ? stderr : NULL,
+                                     fplog,
+                                     ir, state, cycles_pmes,
+                                     fr->ic, fr->nbv, &fr->pmedata,
+                                     step);
+
+                /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
+                fr->ewaldcoeff = fr->ic->ewaldcoeff;
+                fr->rlist      = fr->ic->rlist;
+                fr->rlistlong  = fr->ic->rlistlong;
+                fr->rcoulomb   = fr->ic->rcoulomb;
+                fr->rvdw       = fr->ic->rvdw;
+            }
+            cycles_pmes = 0;
+        }
+    }
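+
+    /* A hedged summary of the tuning above: the load balancing shifts work
+     * between real space (a larger rcoulomb means more short-range pairs)
+     * and reciprocal space (a coarser PME grid means cheaper FFTs) while
+     * keeping the Ewald accuracy roughly fixed; each candidate setting is
+     * timed over the cycles accumulated in cycles_pmes between
+     * neighbour-search steps, and the fastest one is kept. An illustrative
+     * scaling (an assumption, not the exact tuning rule):
+     *     new_grid_spacing = old_grid_spacing*(new_rcoulomb/old_rcoulomb);
+     * This is also why the cut-offs cached in fr must be copied back from
+     * fr->ic after every switch.
+     */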
+    if (step_rel == wcycle_get_reset_counters(wcycle) ||
+        gs.set[eglsRESETCOUNTERS] != 0)
+    {
+        /* Reset all the counters related to performance over the run */
+        reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, runtime,
+                           fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
+        wcycle_set_reset_counters(wcycle, -1);
+        if (!(cr->duty & DUTY_PME))
+        {
+            /* Tell our PME node to reset its counters */
+            gmx_pme_send_resetcounters(cr, step);
+        }
+        /* Correct max_hours for the elapsed time */
+        max_hours                -= run_time/(60.0*60.0);
+        bResetCountersHalfMaxH    = FALSE;
+        gs.set[eglsRESETCOUNTERS] = 0;
+    }
+
+}
+/* End of main MD loop */
+debug_gmx();
+
+/* Stop measuring the run time */
+runtime_end(runtime);
+
+if (bRerunMD && MASTER(cr))
+{
+    close_trj(status);
+}
+
+if (!(cr->duty & DUTY_PME))
+{
+    /* Tell the PME-only node to finish */
+    gmx_pme_send_finish(cr);
+}
+
+if (MASTER(cr))
+{
+    if (ir->nstcalcenergy > 0 && !bRerunMD)
+    {
+        print_ebin(outf->fp_ene, FALSE, FALSE, FALSE, fplog, step, t,
+                   eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts));
+    }
+}
+
+done_mdoutf(outf);
+
+debug_gmx();
+
+if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
+{
+    fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
+    fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns);
+}
+
+if (pme_loadbal != NULL)
+{
+    pme_loadbal_done(pme_loadbal, cr, fplog,
+                     fr->nbv != NULL && fr->nbv->bUseGPU);
+}
+
+if (shellfc && fplog)
+{
+    fprintf(fplog, "Fraction of iterations that converged: %.2f %%\n",
+            (nconverged*100.0)/step_rel);
+    fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n",
+            tcount/step_rel);
+}
+
+if (repl_ex_nst > 0 && MASTER(cr))
+{
+    print_replica_exchange_statistics(fplog, repl_ex);
+}
+
+runtime->nsteps_done = step_rel;
+
+return 0;
+}
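+
+/* The neighborlist-lifetime report above derives a mean and standard
+ * deviation from three running sums: with n = nlh.nns samples, s1 = sum(x)
+ * and s2 = sum(x^2), it prints mean = s1/n and std = sqrt(s2/n - (s1/n)^2).
+ * A minimal, self-contained sketch of that scheme (the names below are
+ * illustrative and not taken from the GROMACS sources):
+ */
+#include <math.h>
+
+typedef struct running_stats_t
+{
+    double n;   /* number of samples            */
+    double s1;  /* running sum of the samples   */
+    double s2;  /* running sum of their squares */
+} running_stats_t;
+
+static void stats_add(running_stats_t *st, double x)
+{
+    st->n  += 1;
+    st->s1 += x;
+    st->s2 += x*x;
+}
+
+static double stats_mean(const running_stats_t *st)
+{
+    return st->s1/st->n;
+}
+
+static double stats_std(const running_stats_t *st)
+{
+    double m = st->s1/st->n;
+
+    /* population standard deviation; this one-pass formula can lose
+     * precision when std << mean, which is acceptable for a diagnostic */
+    return sqrt(st->s2/st->n - m*m);
+}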