Merge remote-tracking branch 'origin/release-4-6' into HEAD
authorRoland Schulz <roland@utk.edu>
Thu, 25 Oct 2012 18:32:50 +0000 (14:32 -0400)
committerRoland Schulz <roland@utk.edu>
Thu, 25 Oct 2012 18:33:13 +0000 (14:33 -0400)
Conflicts:
include/nbnxn_search.h
src/gromacs/legacyheaders/nbnxn_search.h
src/mdlib/nbnxn_search.h
Moved the new src/mdlib/nbnxn_search.h to
     src/gromacs/mdlib/nbnxn_search.h

Conflicts (all trivial):
src/gromacs/gmxlib/statutil.cpp
src/programs/mdrun/genalg.c
src/tools/gmx_h2order.c
src/tools/gmx_sorient.c

Moved src/mdlib/nbnxn_atomdata.* src/mdlib/nbnxn_internal.h
    to src/gromacs/mdlib

Change-Id: Ic0026cb8e9c2b12fcd5293add562e906193bf6ab

27 files changed:
1  2 
src/gromacs/gmxlib/bondfree.c
src/gromacs/gmxlib/gmx_omp_nthreads.c
src/gromacs/legacyheaders/constr.h
src/gromacs/legacyheaders/gmx_omp_nthreads.h
src/gromacs/legacyheaders/types/nbnxn_pairlist.h
src/gromacs/legacyheaders/vsite.h
src/gromacs/mdlib/clincs.c
src/gromacs/mdlib/constr.c
src/gromacs/mdlib/csettle.c
src/gromacs/mdlib/domdec.c
src/gromacs/mdlib/domdec_top.c
src/gromacs/mdlib/force.c
src/gromacs/mdlib/forcerec.c
src/gromacs/mdlib/gmx_wallcycle.c
src/gromacs/mdlib/nbnxn_atomdata.c
src/gromacs/mdlib/nbnxn_atomdata.h
src/gromacs/mdlib/nbnxn_consts.h
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
src/gromacs/mdlib/nbnxn_internal.h
src/gromacs/mdlib/nbnxn_search.c
src/gromacs/mdlib/nbnxn_search.h
src/gromacs/mdlib/shakef.c
src/gromacs/mdlib/sim_util.c
src/gromacs/mdlib/update.c
src/gromacs/mdlib/vsite.c
src/programs/mdrun/runner.c

index b6fd93be2bcc7ce5f25fceb4b35e3af5e0dbfa1c,0000000000000000000000000000000000000000..57377b5d1afdc8c936db48444c695280276421a9
mode 100644,000000..100644
--- /dev/null
@@@ -1,4005 -1,0 +1,4008 @@@
-                 dvdl[efptFTYPE] = 0;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include "physics.h"
 +#include "vec.h"
 +#include "maths.h"
 +#include "txtdump.h"
 +#include "bondf.h"
 +#include "smalloc.h"
 +#include "pbc.h"
 +#include "ns.h"
 +#include "macros.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "mshift.h"
 +#include "main.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "force.h"
 +#include "nonbonded.h"
 +
 +#if !defined GMX_DOUBLE && defined GMX_X86_SSE2
 +#include "gmx_x86_simd_single.h"
 +#define SSE_PROPER_DIHEDRALS
 +#endif
 +
/* Find a better place for this? */
/* 16x16 coefficient matrix used to turn function values and derivatives at
 * the four corners of a grid cell into bicubic interpolation coefficients —
 * presumably consumed by the CMAP dihedral-correction code; verify against
 * the cmap evaluation routine, which is not visible in this chunk.
 */
const int cmap_coeff_matrix[] = {
1, 0, -3,  2, 0, 0,  0,  0, -3,  0,  9, -6,  2,  0, -6,  4 ,
0, 0,  0,  0, 0, 0,  0,  0,  3,  0, -9,  6, -2,  0,  6, -4,
0, 0,  0,  0, 0, 0,  0,  0,  0,  0,  9, -6,  0,  0, -6,  4 ,
0, 0,  3, -2, 0, 0,  0,  0,  0,  0, -9,  6,  0,  0,  6, -4,
0, 0,  0,  0, 1, 0, -3,  2, -2,  0,  6, -4,  1,  0, -3,  2 ,
0, 0,  0,  0, 0, 0,  0,  0, -1,  0,  3, -2,  1,  0, -3,  2 ,
0, 0,  0,  0, 0, 0,  0,  0,  0,  0, -3,  2,  0,  0,  3, -2,
0, 0,  0,  0, 0, 0,  3, -2,  0,  0, -6,  4,  0,  0,  3, -2,
0, 1, -2,  1, 0, 0,  0,  0,  0, -3,  6, -3,  0,  2, -4,  2 ,
0, 0,  0,  0, 0, 0,  0,  0,  0,  3, -6,  3,  0, -2,  4, -2,
0, 0,  0,  0, 0, 0,  0,  0,  0,  0, -3,  3,  0,  0,  2, -2,
0, 0, -1,  1, 0, 0,  0,  0,  0,  0,  3, -3,  0,  0, -2,  2 ,
0, 0,  0,  0, 0, 1, -2,  1,  0, -2,  4, -2,  0,  1, -2,  1,
0, 0,  0,  0, 0, 0,  0,  0,  0, -1,  2, -1,  0,  1, -2,  1,
0, 0,  0,  0, 0, 0,  0,  0,  0,  0,  1, -1,  0,  0, -1,  1,
0, 0,  0,  0, 0, 0, -1,  1,  0,  0,  2, -2,  0,  0, -1,  1
};
 +
 +
 +
 +int glatnr(int *global_atom_index,int i)
 +{
 +    int atnr;
 +
 +    if (global_atom_index == NULL) {
 +        atnr = i + 1;
 +    } else {
 +        atnr = global_atom_index[i] + 1;
 +    }
 +
 +    return atnr;
 +}
 +
 +static int pbc_rvec_sub(const t_pbc *pbc,const rvec xi,const rvec xj,rvec dx)
 +{
 +  if (pbc) {
 +    return pbc_dx_aiuc(pbc,xi,xj,dx);
 +  }
 +  else {
 +    rvec_sub(xi,xj,dx);
 +    return CENTRAL;
 +  }
 +}
 +
 +/*
 + * Morse potential bond by Frank Everdij
 + *
 + * Three parameters needed:
 + *
 + * b0 = equilibrium distance in nm
 + * be = beta in nm^-1 (actually, it's nu_e*Sqrt(2*pi*pi*mu/D_e))
 + * cb = well depth in kJ/mol
 + *
 + * Note: the potential is referenced to be +cb at infinite separation
 + *       and zero at the equilibrium distance!
 + */
 +
/* Morse potential bond (parameters b0, beta, cb described in the comment
 * above).  Parameters are linearly interpolated between the A and B states
 * with lambda.  Returns the total potential energy; forces are accumulated
 * into f[] and shift forces into fshift[], and *dvdlambda receives the
 * free-energy derivative contribution.
 */
real morse_bonds(int nbonds,
               const t_iatom forceatoms[],const t_iparams forceparams[],
               const rvec x[],rvec f[],rvec fshift[],
               const t_pbc *pbc,const t_graph *g,
               real lambda,real *dvdlambda,
               const t_mdatoms *md,t_fcdata *fcd,
               int *global_atom_index)
{
  const real one=1.0;
  const real two=2.0;
  real  dr,dr2,temp,omtemp,cbomtemp,fbond,vbond,fij,vtot;
  real  b0,be,cb,b0A,beA,cbA,b0B,beB,cbB,L1;
  rvec  dx;
  int   i,m,ki,type,ai,aj;
  ivec  dt;

  vtot = 0.0;
  /* forceatoms is packed as (type, ai, aj) triplets */
  for(i=0; (i<nbonds); ) {
    type = forceatoms[i++];
    ai   = forceatoms[i++];
    aj   = forceatoms[i++];

    b0A   = forceparams[type].morse.b0A;
    beA   = forceparams[type].morse.betaA;
    cbA   = forceparams[type].morse.cbA;

    b0B   = forceparams[type].morse.b0B;
    beB   = forceparams[type].morse.betaB;
    cbB   = forceparams[type].morse.cbB;

    /* Linear interpolation of the A/B-state parameters with lambda */
    L1 = one-lambda;                      /* 1 */
    b0 = L1*b0A + lambda*b0B;             /* 3 */
    be = L1*beA + lambda*beB;             /* 3 */
    cb = L1*cbA + lambda*cbB;             /* 3 */

    ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);            /*   3          */
    dr2  = iprod(dx,dx);                            /*   5          */
    dr   = dr2*gmx_invsqrt(dr2);                        /*  10          */
    temp = exp(-be*(dr-b0));                        /*  12          */

    if (temp == one)
    {
        /* The atoms sit exactly at the equilibrium distance (bond likely
         * constrained): energy and force are zero, but the lambda
         * derivative of the well depth still contributes. */
        *dvdlambda += cbB-cbA;
        continue;
    }

    omtemp   = one-temp;                               /*   1          */
    cbomtemp = cb*omtemp;                              /*   1          */
    vbond    = cbomtemp*omtemp;                        /*   1          */
    fbond    = -two*be*temp*cbomtemp*gmx_invsqrt(dr2); /*   9          */
    vtot     += vbond;                                 /*   1          */

    *dvdlambda += (cbB - cbA) * omtemp * omtemp - (2-2*omtemp)*omtemp * cb * ((b0B-b0A)*be - (beB-beA)*(dr-b0)); /* 15 */

    /* With a molecule graph, recompute the shift index from the graph's
     * shift vectors instead of the PBC image index. */
    if (g) {
      ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
      ki = IVEC2IS(dt);
    }

    for (m=0; (m<DIM); m++) {                          /*  15          */
      fij=fbond*dx[m];
      f[ai][m]+=fij;
      f[aj][m]-=fij;
      fshift[ki][m]+=fij;
      fshift[CENTRAL][m]-=fij;
    }
  }                                           /*  83 TOTAL    */
  return vtot;
}
 +
/* Cubic bond potential: V = kb*d^2 + kb*kcub*d^3 with d = r - b0.
 * No free-energy (lambda) contribution.  Returns the total energy;
 * forces go into f[], shift forces into fshift[].
 */
real cubic_bonds(int nbonds,
               const t_iatom forceatoms[],const t_iparams forceparams[],
               const rvec x[],rvec f[],rvec fshift[],
               const t_pbc *pbc,const t_graph *g,
               real lambda,real *dvdlambda,
               const t_mdatoms *md,t_fcdata *fcd,
               int *global_atom_index)
{
  const real three = 3.0;
  const real two   = 2.0;
  real  kb,b0,kcub;
  real  dr,dr2,dist,kdist,kdist2,fbond,vbond,fij,vtot;
  rvec  dx;
  int   i,m,ki,type,ai,aj;
  ivec  dt;

  vtot = 0.0;
  /* forceatoms is packed as (type, ai, aj) triplets */
  for(i=0; (i<nbonds); ) {
    type = forceatoms[i++];
    ai   = forceatoms[i++];
    aj   = forceatoms[i++];

    b0   = forceparams[type].cubic.b0;
    kb   = forceparams[type].cubic.kb;
    kcub = forceparams[type].cubic.kcub;

    ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);                /*   3          */
    dr2  = iprod(dx,dx);                                /*   5          */

    /* Coinciding atoms: zero energy and force, and avoids division by dr */
    if (dr2 == 0.0)
      continue;

    dr         = dr2*gmx_invsqrt(dr2);                      /*  10          */
    dist       = dr-b0;
    kdist      = kb*dist;
    kdist2     = kdist*dist;

    vbond      = kdist2 + kcub*kdist2*dist;
    fbond      = -(two*kdist + three*kdist2*kcub)/dr;   /* -dV/dr scaled by 1/r */

    vtot      += vbond;       /* 21 */

    /* With a molecule graph, recompute the shift index from the graph */
    if (g) {
      ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
      ki=IVEC2IS(dt);
    }
    for (m=0; (m<DIM); m++) {                          /*  15          */
      fij=fbond*dx[m];
      f[ai][m]+=fij;
      f[aj][m]-=fij;
      fshift[ki][m]+=fij;
      fshift[CENTRAL][m]-=fij;
    }
  }                                           /*  54 TOTAL    */
  return vtot;
}
 +
/* FENE (finitely extensible nonlinear elastic) bond:
 * V = -0.5*kb*bm^2 * ln(1 - r^2/bm^2).  The potential diverges at r = bm,
 * so r^2 >= bm^2 is a fatal error.  No lambda contribution.
 */
real FENE_bonds(int nbonds,
              const t_iatom forceatoms[],const t_iparams forceparams[],
              const rvec x[],rvec f[],rvec fshift[],
              const t_pbc *pbc,const t_graph *g,
              real lambda,real *dvdlambda,
              const t_mdatoms *md,t_fcdata *fcd,
              int *global_atom_index)
{
  const real half=0.5;
  const real one=1.0;
  real  bm,kb;
  real  dr,dr2,bm2,omdr2obm2,fbond,vbond,fij,vtot;
  rvec  dx;
  int   i,m,ki,type,ai,aj;
  ivec  dt;

  vtot = 0.0;
  /* forceatoms is packed as (type, ai, aj) triplets */
  for(i=0; (i<nbonds); ) {
    type = forceatoms[i++];
    ai   = forceatoms[i++];
    aj   = forceatoms[i++];

    bm   = forceparams[type].fene.bm;
    kb   = forceparams[type].fene.kb;

    ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);            /*   3          */
    dr2  = iprod(dx,dx);                                /*   5          */

    if (dr2 == 0.0)
      continue;

    bm2 = bm*bm;

    /* Beyond the maximum extension the log argument goes non-positive */
    if (dr2 >= bm2)
      gmx_fatal(FARGS,
              "r^2 (%f) >= bm^2 (%f) in FENE bond between atoms %d and %d",
              dr2,bm2,
              glatnr(global_atom_index,ai),
              glatnr(global_atom_index,aj));

    omdr2obm2  = one - dr2/bm2;

    vbond      = -half*kb*bm2*log(omdr2obm2);
    fbond      = -kb/omdr2obm2;

    vtot      += vbond;       /* 35 */

    /* With a molecule graph, recompute the shift index from the graph */
    if (g) {
      ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
      ki=IVEC2IS(dt);
    }
    for (m=0; (m<DIM); m++) {                          /*  15          */
      fij=fbond*dx[m];
      f[ai][m]+=fij;
      f[aj][m]-=fij;
      fshift[ki][m]+=fij;
      fshift[CENTRAL][m]-=fij;
    }
  }                                           /*  58 TOTAL    */
  return vtot;
}
 +
 +real harmonic(real kA,real kB,real xA,real xB,real x,real lambda,
 +            real *V,real *F)
 +{
 +  const real half=0.5;
 +  real  L1,kk,x0,dx,dx2;
 +  real  v,f,dvdlambda;
 +  
 +  L1    = 1.0-lambda;
 +  kk    = L1*kA+lambda*kB;
 +  x0    = L1*xA+lambda*xB;
 +
 +  dx    = x-x0;
 +  dx2   = dx*dx;
 +
 +  f     = -kk*dx;
 +  v     = half*kk*dx2;
 +  dvdlambda  = half*(kB-kA)*dx2 + (xA-xB)*kk*dx;
 +
 +  *F    = f;
 +  *V    = v;
 +
 +  return dvdlambda;
 +
 +  /* That was 19 flops */
 +}
 +
 +
/* Harmonic bonds, with A/B-state force constant and reference length
 * interpolated by lambda via harmonic().  Returns the total energy;
 * forces into f[], shift forces into fshift[], dV/dlambda accumulated
 * into *dvdlambda.
 */
real bonds(int nbonds,
         const t_iatom forceatoms[],const t_iparams forceparams[],
         const rvec x[],rvec f[],rvec fshift[],
         const t_pbc *pbc,const t_graph *g,
         real lambda,real *dvdlambda,
         const t_mdatoms *md,t_fcdata *fcd,
         int *global_atom_index)
{
  int  i,m,ki,ai,aj,type;
  real dr,dr2,fbond,vbond,fij,vtot;
  rvec dx;
  ivec dt;

  vtot = 0.0;
  /* forceatoms is packed as (type, ai, aj) triplets */
  for(i=0; (i<nbonds); ) {
    type = forceatoms[i++];
    ai   = forceatoms[i++];
    aj   = forceatoms[i++];

    ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);  /*   3          */
    dr2  = iprod(dx,dx);                      /*   5          */
    dr   = dr2*gmx_invsqrt(dr2);                      /*  10          */

    /* Note: dvdlambda is accumulated even for dr2 == 0 (before the
     * continue below); only energy and force are skipped in that case. */
    *dvdlambda += harmonic(forceparams[type].harmonic.krA,
                           forceparams[type].harmonic.krB,
                           forceparams[type].harmonic.rA,
                           forceparams[type].harmonic.rB,
                           dr,lambda,&vbond,&fbond);  /*  19  */

    if (dr2 == 0.0)
      continue;


    vtot  += vbond;/* 1*/
    fbond *= gmx_invsqrt(dr2);                        /*   6          */
#ifdef DEBUG
    if (debug)
      fprintf(debug,"BONDS: dr = %10g  vbond = %10g  fbond = %10g\n",
            dr,vbond,fbond);
#endif
    /* With a molecule graph, recompute the shift index from the graph */
    if (g) {
      ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
      ki=IVEC2IS(dt);
    }
    for (m=0; (m<DIM); m++) {                 /*  15          */
      fij=fbond*dx[m];
      f[ai][m]+=fij;
      f[aj][m]-=fij;
      fshift[ki][m]+=fij;
      fshift[CENTRAL][m]-=fij;
    }
  }                                   /* 59 TOTAL     */
  return vtot;
}
 +
/* Flat-bottomed distance restraint with four lambda-interpolated bounds:
 *   r < low          : harmonic, centered on low
 *   low <= r <= up1  : flat (zero energy and force)
 *   up1 < r <= up2   : harmonic, centered on up1
 *   r > up2          : linear continuation of the up1/up2 harmonic branch
 * Returns the total energy; forces into f[]/fshift[], dV/dlambda into
 * *dvdlambda.
 */
real restraint_bonds(int nbonds,
                     const t_iatom forceatoms[],const t_iparams forceparams[],
                     const rvec x[],rvec f[],rvec fshift[],
                     const t_pbc *pbc,const t_graph *g,
                     real lambda,real *dvdlambda,
                     const t_mdatoms *md,t_fcdata *fcd,
                     int *global_atom_index)
{
    int  i,m,ki,ai,aj,type;
    real dr,dr2,fbond,vbond,fij,vtot;
    real L1;
    real low,dlow,up1,dup1,up2,dup2,k,dk;
    real drh,drh2;
    rvec dx;
    ivec dt;

    L1   = 1.0 - lambda;

    vtot = 0.0;
    /* forceatoms is packed as (type, ai, aj) triplets */
    for(i=0; (i<nbonds); )
    {
        type = forceatoms[i++];
        ai   = forceatoms[i++];
        aj   = forceatoms[i++];

        ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);      /*   3          */
        dr2  = iprod(dx,dx);                          /*   5          */
        dr   = dr2*gmx_invsqrt(dr2);                  /*  10          */

        /* Each bound b is interpolated as L1*bA + lambda*bB; the matching
         * d-variable is its lambda derivative bB - bA. */
        low  = L1*forceparams[type].restraint.lowA + lambda*forceparams[type].restraint.lowB;
        dlow =   -forceparams[type].restraint.lowA +        forceparams[type].restraint.lowB;
        up1  = L1*forceparams[type].restraint.up1A + lambda*forceparams[type].restraint.up1B;
        dup1 =   -forceparams[type].restraint.up1A +        forceparams[type].restraint.up1B;
        up2  = L1*forceparams[type].restraint.up2A + lambda*forceparams[type].restraint.up2B;
        dup2 =   -forceparams[type].restraint.up2A +        forceparams[type].restraint.up2B;
        k    = L1*forceparams[type].restraint.kA   + lambda*forceparams[type].restraint.kB;
        dk   =   -forceparams[type].restraint.kA   +        forceparams[type].restraint.kB;
        /* 24 */

        if (dr < low)
        {
            drh   = dr - low;
            drh2  = drh*drh;
            vbond = 0.5*k*drh2;
            fbond = -k*drh;
            *dvdlambda += 0.5*dk*drh2 - k*dlow*drh;
        } /* 11 */
        else if (dr <= up1)
        {
            /* Inside the flat-bottom region: no energy or force */
            vbond = 0;
            fbond = 0;
        }
        else if (dr <= up2)
        {
            drh   = dr - up1;
            drh2  = drh*drh;
            vbond = 0.5*k*drh2;
            fbond = -k*drh;
            *dvdlambda += 0.5*dk*drh2 - k*dup1*drh;
        } /* 11       */
        else
        {
            /* Beyond up2 the potential continues linearly with the slope
             * of the harmonic branch at up2 */
            drh   = dr - up2;
            vbond = k*(up2 - up1)*(0.5*(up2 - up1) + drh);
            fbond = -k*(up2 - up1);
            *dvdlambda += dk*(up2 - up1)*(0.5*(up2 - up1) + drh)
                + k*(dup2 - dup1)*(up2 - up1 + drh)
                - k*(up2 - up1)*dup2;
        }

        if (dr2 == 0.0)
            continue;

        vtot  += vbond;/* 1*/
        fbond *= gmx_invsqrt(dr2);                    /*   6          */
#ifdef DEBUG
        if (debug)
            fprintf(debug,"BONDS: dr = %10g  vbond = %10g  fbond = %10g\n",
                    dr,vbond,fbond);
#endif
        /* With a molecule graph, recompute the shift index from the graph */
        if (g) {
            ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
            ki=IVEC2IS(dt);
        }
        for (m=0; (m<DIM); m++) {                     /*  15          */
            fij=fbond*dx[m];
            f[ai][m]+=fij;
            f[aj][m]-=fij;
            fshift[ki][m]+=fij;
            fshift[CENTRAL][m]-=fij;
        }
    }                                 /* 59 TOTAL     */

    return vtot;
}
 +
/* Isotropic shell (Drude) polarization: a harmonic spring between atom and
 * shell with force constant ksh = q_shell^2 / (4*pi*eps0 * alpha), derived
 * from the shell charge and the polarizability alpha.
 */
real polarize(int nbonds,
            const t_iatom forceatoms[],const t_iparams forceparams[],
            const rvec x[],rvec f[],rvec fshift[],
            const t_pbc *pbc,const t_graph *g,
            real lambda,real *dvdlambda,
            const t_mdatoms *md,t_fcdata *fcd,
            int *global_atom_index)
{
  int  i,m,ki,ai,aj,type;
  real dr,dr2,fbond,vbond,fij,vtot,ksh;
  rvec dx;
  ivec dt;

  vtot = 0.0;
  /* forceatoms is packed as (type, ai, aj) triplets; aj is the shell */
  for(i=0; (i<nbonds); ) {
    type = forceatoms[i++];
    ai   = forceatoms[i++];
    aj   = forceatoms[i++];
    ksh  = sqr(md->chargeA[aj])*ONE_4PI_EPS0/forceparams[type].polarize.alpha;
    if (debug)
      fprintf(debug,"POL: local ai = %d aj = %d ksh = %.3f\n",ai,aj,ksh);

    ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);  /*   3          */
    dr2  = iprod(dx,dx);                      /*   5          */
    dr   = dr2*gmx_invsqrt(dr2);                      /*  10          */

    /* Same A and B parameters, so no net lambda dependence from ksh */
    *dvdlambda += harmonic(ksh,ksh,0,0,dr,lambda,&vbond,&fbond);  /*  19  */

    if (dr2 == 0.0)
      continue;

    vtot  += vbond;/* 1*/
    fbond *= gmx_invsqrt(dr2);                        /*   6          */

    /* With a molecule graph, recompute the shift index from the graph */
    if (g) {
      ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
      ki=IVEC2IS(dt);
    }
    for (m=0; (m<DIM); m++) {                 /*  15          */
      fij=fbond*dx[m];
      f[ai][m]+=fij;
      f[aj][m]-=fij;
      fshift[ki][m]+=fij;
      fshift[CENTRAL][m]-=fij;
    }
  }                                   /* 59 TOTAL     */
  return vtot;
}
 +
/* Anharmonic shell polarization: harmonic spring as in polarize(), plus a
 * quartic term khyp*(dr - drcut)^4 once the atom-shell distance exceeds
 * drcut, to keep the shell from drifting too far.
 */
real anharm_polarize(int nbonds,
                     const t_iatom forceatoms[],const t_iparams forceparams[],
                     const rvec x[],rvec f[],rvec fshift[],
                     const t_pbc *pbc,const t_graph *g,
                     real lambda,real *dvdlambda,
                     const t_mdatoms *md,t_fcdata *fcd,
                     int *global_atom_index)
{
  int  i,m,ki,ai,aj,type;
  real dr,dr2,fbond,vbond,fij,vtot,ksh,khyp,drcut,ddr,ddr3;
  rvec dx;
  ivec dt;

  vtot = 0.0;
  /* forceatoms is packed as (type, ai, aj) triplets; aj is the shell */
  for(i=0; (i<nbonds); ) {
    type  = forceatoms[i++];
    ai    = forceatoms[i++];
    aj    = forceatoms[i++];
    ksh   = sqr(md->chargeA[aj])*ONE_4PI_EPS0/forceparams[type].anharm_polarize.alpha; /* 7*/
    khyp  = forceparams[type].anharm_polarize.khyp;
    drcut = forceparams[type].anharm_polarize.drcut;
    if (debug)
      fprintf(debug,"POL: local ai = %d aj = %d ksh = %.3f\n",ai,aj,ksh);

    ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);  /*   3          */
    dr2  = iprod(dx,dx);                      /*   5          */
    dr   = dr2*gmx_invsqrt(dr2);                      /*  10          */

    /* Same A and B parameters, so no net lambda dependence from ksh */
    *dvdlambda += harmonic(ksh,ksh,0,0,dr,lambda,&vbond,&fbond);  /*  19  */

    if (dr2 == 0.0)
      continue;

    if (dr > drcut) {
        /* Quartic wall: V += khyp*ddr^4, F -= 4*khyp*ddr^3 */
        ddr    = dr-drcut;
        ddr3   = ddr*ddr*ddr;
        vbond += khyp*ddr*ddr3;
        fbond -= 4*khyp*ddr3;
    }
    fbond *= gmx_invsqrt(dr2);                        /*   6          */
    vtot  += vbond;/* 1*/

    /* With a molecule graph, recompute the shift index from the graph */
    if (g) {
      ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
      ki=IVEC2IS(dt);
    }
    for (m=0; (m<DIM); m++) {                 /*  15          */
      fij=fbond*dx[m];
      f[ai][m]+=fij;
      f[aj][m]-=fij;
      fshift[ki][m]+=fij;
      fshift[CENTRAL][m]-=fij;
    }
  }                                   /* 72 TOTAL     */
  return vtot;
}
 +
/* Anisotropic polarizability for water: the shell is attached to a dummy
 * atom by springs whose constants differ along the three axes of the
 * molecular frame (normal to the HH1/HH2 plane, along HH, and along OD).
 * Each interaction is 6 atoms: (type, O, H1, H2, D, S).  All entries must
 * share one parameter type.  Returns 0.5 * sum(dx . k dx); forces act only
 * on shell and dummy, and no shift forces are accumulated here.
 */
real water_pol(int nbonds,
             const t_iatom forceatoms[],const t_iparams forceparams[],
             const rvec x[],rvec f[],rvec fshift[],
             const t_pbc *pbc,const t_graph *g,
             real lambda,real *dvdlambda,
             const t_mdatoms *md,t_fcdata *fcd,
             int *global_atom_index)
{
  /* This routine implements anisotropic polarizibility for water, through
   * a shell connected to a dummy with spring constant that differ in the
   * three spatial dimensions in the molecular frame.
   */
  int  i,m,aO,aH1,aH2,aD,aS,type,type0;
  rvec dOH1,dOH2,dHH,dOD,dDS,nW,kk,dx,kdx,proj;
#ifdef DEBUG
  rvec df;
#endif
  real vtot,fij,r_HH,r_OD,r_nW,tx,ty,tz,qS;

  vtot = 0.0;
  if (nbonds > 0) {
    /* Spring constants and geometry come from the first interaction's type;
     * all others must match it (checked below). */
    type0  = forceatoms[0];
    aS     = forceatoms[5];
    qS     = md->chargeA[aS];
    kk[XX] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_x;
    kk[YY] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_y;
    kk[ZZ] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_z;
    r_HH   = 1.0/forceparams[type0].wpol.rHH;
    r_OD   = 1.0/forceparams[type0].wpol.rOD;
    if (debug) {
      fprintf(debug,"WPOL: qS  = %10.5f aS = %5d\n",qS,aS);
      fprintf(debug,"WPOL: kk  = %10.3f        %10.3f        %10.3f\n",
            kk[XX],kk[YY],kk[ZZ]);
      fprintf(debug,"WPOL: rOH = %10.3f  rHH = %10.3f  rOD = %10.3f\n",
            forceparams[type0].wpol.rOH,
            forceparams[type0].wpol.rHH,
            forceparams[type0].wpol.rOD);
    }
    for(i=0; (i<nbonds); i+=6) {
      type = forceatoms[i];
      if (type != type0)
      gmx_fatal(FARGS,"Sorry, type = %d, type0 = %d, file = %s, line = %d",
                  type,type0,__FILE__,__LINE__);
      aO   = forceatoms[i+1];
      aH1  = forceatoms[i+2];
      aH2  = forceatoms[i+3];
      aD   = forceatoms[i+4];
      aS   = forceatoms[i+5];

      /* Compute vectors describing the water frame */
      rvec_sub(x[aH1],x[aO], dOH1);
      rvec_sub(x[aH2],x[aO], dOH2);
      rvec_sub(x[aH2],x[aH1],dHH);
      rvec_sub(x[aD], x[aO], dOD);
      rvec_sub(x[aS], x[aD], dDS);
      cprod(dOH1,dOH2,nW);

      /* Compute inverse length of normal vector
       * (this one could be precomputed, but I'm too lazy now)
       */
      r_nW = gmx_invsqrt(iprod(nW,nW));
      /* This is for precision, but does not make a big difference,
       * it can go later.  Note: overwrites the table value of r_OD
       * set before the loop.
       */
      r_OD = gmx_invsqrt(iprod(dOD,dOD));

      /* Normalize the vectors in the water frame */
      svmul(r_nW,nW,nW);
      svmul(r_HH,dHH,dHH);
      svmul(r_OD,dOD,dOD);

      /* Compute displacement of shell along components of the vector */
      dx[ZZ] = iprod(dDS,dOD);
      /* Compute projection on the XY plane: dDS - dx[ZZ]*dOD */
      for(m=0; (m<DIM); m++)
      proj[m] = dDS[m]-dx[ZZ]*dOD[m];

      /*dx[XX] = iprod(dDS,nW);
      dx[YY] = iprod(dDS,dHH);*/
      dx[XX] = iprod(proj,nW);
      for(m=0; (m<DIM); m++)
      proj[m] -= dx[XX]*nW[m];
      dx[YY] = iprod(proj,dHH);
      /*#define DEBUG*/
#ifdef DEBUG
      if (debug) {
      fprintf(debug,"WPOL: dx2=%10g  dy2=%10g  dz2=%10g  sum=%10g  dDS^2=%10g\n",
              sqr(dx[XX]),sqr(dx[YY]),sqr(dx[ZZ]),iprod(dx,dx),iprod(dDS,dDS));
      fprintf(debug,"WPOL: dHH=(%10g,%10g,%10g)\n",dHH[XX],dHH[YY],dHH[ZZ]);
      fprintf(debug,"WPOL: dOD=(%10g,%10g,%10g), 1/r_OD = %10g\n",
              dOD[XX],dOD[YY],dOD[ZZ],1/r_OD);
      fprintf(debug,"WPOL: nW =(%10g,%10g,%10g), 1/r_nW = %10g\n",
              nW[XX],nW[YY],nW[ZZ],1/r_nW);
      fprintf(debug,"WPOL: dx  =%10g, dy  =%10g, dz  =%10g\n",
              dx[XX],dx[YY],dx[ZZ]);
      fprintf(debug,"WPOL: dDSx=%10g, dDSy=%10g, dDSz=%10g\n",
              dDS[XX],dDS[YY],dDS[ZZ]);
      }
#endif
      /* Now compute the forces and energy */
      kdx[XX] = kk[XX]*dx[XX];
      kdx[YY] = kk[YY]*dx[YY];
      kdx[ZZ] = kk[ZZ]*dx[ZZ];
      vtot   += iprod(dx,kdx);
      for(m=0; (m<DIM); m++) {
      /* This is a tensor operation but written out for speed */
      tx        =  nW[m]*kdx[XX];
      ty        = dHH[m]*kdx[YY];
      tz        = dOD[m]*kdx[ZZ];
      fij       = -tx-ty-tz;
#ifdef DEBUG
      df[m] = fij;
#endif
      f[aS][m] += fij;
      f[aD][m] -= fij;
      }
#ifdef DEBUG
      if (debug) {
      fprintf(debug,"WPOL: vwpol=%g\n",0.5*iprod(dx,kdx));
      fprintf(debug,"WPOL: df = (%10g, %10g, %10g)\n",df[XX],df[YY],df[ZZ]);
      }
#endif
    }
  }
  /* The 0.5 turns the accumulated k*dx^2 terms into harmonic energies */
  return 0.5*vtot;
}
 +
/* Thole-screened Coulomb interaction between one pair of (shell/atom)
 * charges: V = (qq/4*pi*eps0*r) * (1 - (1 + 0.5*a*r)*exp(-a*r)) with
 * screening factor afac.  Adds forces to fi/fj and shift forces to
 * fshift[]; returns the pair energy.  ~54 flops.
 */
static real do_1_thole(const rvec xi,const rvec xj,rvec fi,rvec fj,
                     const t_pbc *pbc,real qq,
                     rvec fshift[],real afac)
{
  rvec r12;
  real r12sq,r12_1,r12n,r12bar,v0,v1,fscal,ebar,fff;
  int  m,t;

  t      = pbc_rvec_sub(pbc,xi,xj,r12); /*  3 */

  r12sq  = iprod(r12,r12);              /*  5 */
  r12_1  = gmx_invsqrt(r12sq);              /*  5 */
  r12bar = afac/r12_1;                  /*  5 */
  v0     = qq*ONE_4PI_EPS0*r12_1;       /*  2 */
  ebar   = exp(-r12bar);                /*  5 */
  v1     = (1-(1+0.5*r12bar)*ebar);     /*  4 */
  fscal  = ((v0*r12_1)*v1 - v0*0.5*afac*ebar*(r12bar+1))*r12_1; /* 9 */
  if (debug)
    fprintf(debug,"THOLE: v0 = %.3f v1 = %.3f r12= % .3f r12bar = %.3f fscal = %.3f  ebar = %.3f\n",v0,v1,1/r12_1,r12bar,fscal,ebar);

  for(m=0; (m<DIM); m++) {
    fff    = fscal*r12[m];
    fi[m] += fff;
    fj[m] -= fff;
    fshift[t][m]       += fff;
    fshift[CENTRAL][m] -= fff;
  } /* 15 */

  return v0*v1; /* 1 */
  /* 54 */
}
 +
/* Thole polarization: screened Coulomb interaction between two
 * atom/shell (Drude) pairs.  Each interaction is (type, a1, da1, a2, da2)
 * where da* are the shells; the four charge-pair combinations are summed
 * with the Thole damping factor afac = a * (alpha1*alpha2)^(-1/6).
 * Returns the total energy (~290 flops).
 */
real thole_pol(int nbonds,
             const t_iatom forceatoms[],const t_iparams forceparams[],
             const rvec x[],rvec f[],rvec fshift[],
             const t_pbc *pbc,const t_graph *g,
             real lambda,real *dvdlambda,
             const t_mdatoms *md,t_fcdata *fcd,
             int *global_atom_index)
{
  /* Interaction between two pairs of particles with opposite charge */
  int i,type,a1,da1,a2,da2;
  real q1,q2,qq,a,al1,al2,afac;
  real V=0;

  for(i=0; (i<nbonds); ) {
    type  = forceatoms[i++];
    a1    = forceatoms[i++];
    da1   = forceatoms[i++];
    a2    = forceatoms[i++];
    da2   = forceatoms[i++];
    q1    = md->chargeA[da1];
    q2    = md->chargeA[da2];
    a     = forceparams[type].thole.a;
    al1   = forceparams[type].thole.alpha1;
    al2   = forceparams[type].thole.alpha2;
    qq    = q1*q2;
    afac  = a*pow(al1*al2,-1.0/6.0);
    /* atom-atom and shell-shell attract (+qq), atom-shell repel (-qq) */
    V += do_1_thole(x[a1], x[a2], f[a1], f[a2], pbc, qq,fshift,afac);
    V += do_1_thole(x[da1],x[a2], f[da1],f[a2], pbc,-qq,fshift,afac);
    V += do_1_thole(x[a1], x[da2],f[a1], f[da2],pbc,-qq,fshift,afac);
    V += do_1_thole(x[da1],x[da2],f[da1],f[da2],pbc, qq,fshift,afac);
  }
  /* 290 flops */
  return V;
}
 +
/* Compute the angle (radians) between bonds i-j and j-k.
 * Also outputs the bond vectors r_ij and r_kj, the cosine of the angle in
 * *costh, and the PBC shift indices of the two subtractions in *t1/*t2.
 */
real bond_angle(const rvec xi,const rvec xj,const rvec xk,const t_pbc *pbc,
              rvec r_ij,rvec r_kj,real *costh,
              int *t1,int *t2)
/* Return value is the angle between the bonds i-j and j-k */
{
  /* 41 FLOPS */
  real th;

  *t1 = pbc_rvec_sub(pbc,xi,xj,r_ij);                 /*  3           */
  *t2 = pbc_rvec_sub(pbc,xk,xj,r_kj);                 /*  3           */

  *costh=cos_angle(r_ij,r_kj);                /* 25           */
  th=acos(*costh);                    /* 10           */
                                      /* 41 TOTAL     */
  return th;
}
 +
/* Harmonic angle potential V = 0.5*k*(theta - theta0)^2 with k and theta0
 * interpolated between A and B states by lambda (reference angles are in
 * degrees in the parameters, converted to radians here).  Returns the
 * total energy; forces into f[]/fshift[], dV/dlambda into *dvdlambda.
 */
real angles(int nbonds,
            const t_iatom forceatoms[],const t_iparams forceparams[],
            const rvec x[],rvec f[],rvec fshift[],
            const t_pbc *pbc,const t_graph *g,
            real lambda,real *dvdlambda,
            const t_mdatoms *md,t_fcdata *fcd,
            int *global_atom_index)
{
    int  i,ai,aj,ak,t1,t2,type;
    rvec r_ij,r_kj;
    real cos_theta,cos_theta2,theta,dVdt,va,vtot;
    ivec jt,dt_ij,dt_kj;

    vtot = 0.0;
    /* forceatoms is packed as (type, ai, aj, ak) quadruplets */
    for(i=0; i<nbonds; )
    {
        type = forceatoms[i++];
        ai   = forceatoms[i++];
        aj   = forceatoms[i++];
        ak   = forceatoms[i++];

        theta  = bond_angle(x[ai],x[aj],x[ak],pbc,
                            r_ij,r_kj,&cos_theta,&t1,&t2);    /*  41          */

        *dvdlambda += harmonic(forceparams[type].harmonic.krA,
                               forceparams[type].harmonic.krB,
                               forceparams[type].harmonic.rA*DEG2RAD,
                               forceparams[type].harmonic.rB*DEG2RAD,
                               theta,lambda,&va,&dVdt);  /*  21  */
        vtot += va;

        /* Skip the force when the angle is exactly 0 or 180 degrees:
         * sin(theta) = 0 there and the force direction is undefined. */
        cos_theta2 = sqr(cos_theta);
        if (cos_theta2 < 1)
        {
            int  m;
            real st,sth;
            real cik,cii,ckk;
            real nrkj2,nrij2;
            real nrkj_1,nrij_1;
            rvec f_i,f_j,f_k;

            /* Chain rule: dV/dcos(theta) = dV/dtheta * -1/sin(theta) */
            st  = dVdt*gmx_invsqrt(1 - cos_theta2);   /*  12          */
            sth = st*cos_theta;                       /*   1          */
#ifdef DEBUG
            if (debug)
                fprintf(debug,"ANGLES: theta = %10g  vth = %10g  dV/dtheta = %10g\n",
                        theta*RAD2DEG,va,dVdt);
#endif
            nrij2 = iprod(r_ij,r_ij);                 /*   5          */
            nrkj2 = iprod(r_kj,r_kj);                 /*   5          */

            nrij_1 = gmx_invsqrt(nrij2);              /*  10          */
            nrkj_1 = gmx_invsqrt(nrkj2);              /*  10          */

            cik = st*nrij_1*nrkj_1;                   /*   2          */
            cii = sth*nrij_1*nrij_1;                  /*   2          */
            ckk = sth*nrkj_1*nrkj_1;                  /*   2          */

            for (m=0; m<DIM; m++)
            {                 /*  39          */
                f_i[m]    = -(cik*r_kj[m] - cii*r_ij[m]);
                f_k[m]    = -(cik*r_ij[m] - ckk*r_kj[m]);
                f_j[m]    = -f_i[m] - f_k[m];   /* Newton's third law */
                f[ai][m] += f_i[m];
                f[aj][m] += f_j[m];
                f[ak][m] += f_k[m];
            }
            /* With a molecule graph, recompute shift indices from the graph */
            if (g != NULL)
            {
                copy_ivec(SHIFT_IVEC(g,aj),jt);

                ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
                ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
                t1 = IVEC2IS(dt_ij);
                t2 = IVEC2IS(dt_kj);
            }
            rvec_inc(fshift[t1],f_i);
            rvec_inc(fshift[CENTRAL],f_j);
            rvec_inc(fshift[t2],f_k);
        }                                           /* 161 TOTAL      */
    }

    return vtot;
}
 +
/* Linear angle restraint i-j-k.
 * The deviation dr is the displacement of atom j from the a:b weighted
 * combination of i and k (b = 1-a), giving V = 0.5*klin*|dr|^2.
 * Both the force constant klin and the weight a are lambda-interpolated
 * between the A and B topology states.
 * Returns the summed potential; accumulates dV/dlambda in *dvdlambda,
 * forces in f[] and shift forces in fshift[].
 */
real linear_angles(int nbonds,
                   const t_iatom forceatoms[],const t_iparams forceparams[],
                   const rvec x[],rvec f[],rvec fshift[],
                   const t_pbc *pbc,const t_graph *g,
                   real lambda,real *dvdlambda,
                   const t_mdatoms *md,t_fcdata *fcd,
                   int *global_atom_index)
{
  int  i,m,ai,aj,ak,t1,t2,type;
  rvec f_i,f_j,f_k;
  real L1,kA,kB,aA,aB,dr,dr2,va,vtot,a,b,klin;
  ivec jt,dt_ij,dt_kj;
  rvec r_ij,r_kj,r_ik,dx;
    
  L1   = 1-lambda;
  vtot = 0.0;
  /* iatom stream layout: type, ai, aj, ak, type, ... */
  for(i=0; (i<nbonds); ) {
    type = forceatoms[i++];
    ai   = forceatoms[i++];
    aj   = forceatoms[i++];
    ak   = forceatoms[i++];
    
    kA = forceparams[type].linangle.klinA;
    kB = forceparams[type].linangle.klinB;
    klin = L1*kA + lambda*kB;
    
    aA   = forceparams[type].linangle.aA;
    aB   = forceparams[type].linangle.aB;
    a    = L1*aA+lambda*aB;
    b    = 1-a;
    
    t1 = pbc_rvec_sub(pbc,x[ai],x[aj],r_ij);
    t2 = pbc_rvec_sub(pbc,x[ak],x[aj],r_kj);
    rvec_sub(r_ij,r_kj,r_ik);  /* r_ik = x_i - x_k (no extra PBC needed) */
    
    dr2 = 0;
    for(m=0; (m<DIM); m++) 
    {
        /* dr = x_j - (a*x_i + b*x_k): deviation of j from the line */
        dr     = - a * r_ij[m] - b * r_kj[m];
        dr2   += dr*dr;
        dx[m]  = dr;
        f_i[m] = a*klin*dr;
        f_k[m] = b*klin*dr;
        f_j[m] = -(f_i[m]+f_k[m]);
        f[ai][m] += f_i[m];
        f[aj][m] += f_j[m];
        f[ak][m] += f_k[m];
    }
    va    = 0.5*klin*dr2;
    /* NOTE(review): d(dx)/da = -r_ik, so the a-derivative term would be
     * expected with a minus sign in front of iprod(dx,r_ik) — TODO
     * confirm the sign against the manual's linear-angle dV/dlambda.
     */
    *dvdlambda += 0.5*(kB-kA)*dr2 + klin*(aB-aA)*iprod(dx,r_ik); 
            
    vtot += va;
    
    if (g) {
        copy_ivec(SHIFT_IVEC(g,aj),jt);
      
        ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
        ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
        t1=IVEC2IS(dt_ij);
        t2=IVEC2IS(dt_kj);
    }
    rvec_inc(fshift[t1],f_i);
    rvec_inc(fshift[CENTRAL],f_j);
    rvec_inc(fshift[t2],f_k);
  }                                           /* 57 TOTAL     */
  return vtot;
}
 +
/* Urey-Bradley potential: a harmonic angle term on theta(i,j,k) plus a
 * harmonic 1-3 distance term between atoms i and k (CHARMM-style).
 * All four parameters (ktheta, theta0, kUB, r13) are lambda-interpolated
 * between the A and B states via harmonic().
 * Returns the total potential; accumulates dV/dlambda in *dvdlambda,
 * forces in f[] and shift forces in fshift[].
 */
real urey_bradley(int nbonds,
                const t_iatom forceatoms[],const t_iparams forceparams[],
                const rvec x[],rvec f[],rvec fshift[],
                const t_pbc *pbc,const t_graph *g,
                real lambda,real *dvdlambda,
                const t_mdatoms *md,t_fcdata *fcd,
                int *global_atom_index)
{
  int  i,m,ai,aj,ak,t1,t2,type,ki;
  rvec r_ij,r_kj,r_ik;
  real cos_theta,cos_theta2,theta;
  real dVdt,va,vtot,dr,dr2,vbond,fbond,fik;
  real kthA,th0A,kUBA,r13A,kthB,th0B,kUBB,r13B;
  ivec jt,dt_ij,dt_kj,dt_ik;
  
  vtot = 0.0;
  for(i=0; (i<nbonds); ) {
    type = forceatoms[i++];
    ai   = forceatoms[i++];
    aj   = forceatoms[i++];
    ak   = forceatoms[i++];
    th0A  = forceparams[type].u_b.thetaA*DEG2RAD;
    kthA  = forceparams[type].u_b.kthetaA;
    r13A  = forceparams[type].u_b.r13A;
    kUBA  = forceparams[type].u_b.kUBA;
    th0B  = forceparams[type].u_b.thetaB*DEG2RAD;
    kthB  = forceparams[type].u_b.kthetaB;
    r13B  = forceparams[type].u_b.r13B;
    kUBB  = forceparams[type].u_b.kUBB;
    
    theta  = bond_angle(x[ai],x[aj],x[ak],pbc,
                      r_ij,r_kj,&cos_theta,&t1,&t2);  /*  41          */
  
    /* Angle part of the potential */
    *dvdlambda += harmonic(kthA,kthB,th0A,th0B,theta,lambda,&va,&dVdt);  /*  21  */
    vtot += va;
    
    /* 1-3 distance part of the potential */
    ki   = pbc_rvec_sub(pbc,x[ai],x[ak],r_ik);        /*   3          */
    dr2  = iprod(r_ik,r_ik);                  /*   5          */
    dr   = dr2*gmx_invsqrt(dr2);                      /*  10          */

    *dvdlambda += harmonic(kUBA,kUBB,r13A,r13B,dr,lambda,&vbond,&fbond); /*  19  */

    /* Skip the angle force when the angle is exactly straight:
     * 1/sqrt(1-cos^2) would diverge there.
     */
    cos_theta2 = sqr(cos_theta);                /*   1                */
    if (cos_theta2 < 1) {
      real st,sth;
      real cik,cii,ckk;
      real nrkj2,nrij2;
      rvec f_i,f_j,f_k;
      
      st  = dVdt*gmx_invsqrt(1 - cos_theta2); /*  12          */
      sth = st*cos_theta;                     /*   1          */
#ifdef DEBUG
      if (debug)
      fprintf(debug,"ANGLES: theta = %10g  vth = %10g  dV/dtheta = %10g\n",
              theta*RAD2DEG,va,dVdt);
#endif
      nrkj2=iprod(r_kj,r_kj);                 /*   5          */
      nrij2=iprod(r_ij,r_ij);
      
      cik=st*gmx_invsqrt(nrkj2*nrij2);                /*  12          */ 
      cii=sth/nrij2;                          /*  10          */
      ckk=sth/nrkj2;                          /*  10          */
      
      for (m=0; (m<DIM); m++) {                       /*  39          */
      f_i[m]=-(cik*r_kj[m]-cii*r_ij[m]);
      f_k[m]=-(cik*r_ij[m]-ckk*r_kj[m]);
      f_j[m]=-f_i[m]-f_k[m];
      f[ai][m]+=f_i[m];
      f[aj][m]+=f_j[m];
      f[ak][m]+=f_k[m];
      }
      if (g) {
      copy_ivec(SHIFT_IVEC(g,aj),jt);
      
      ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
      ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
      t1=IVEC2IS(dt_ij);
      t2=IVEC2IS(dt_kj);
      }
      rvec_inc(fshift[t1],f_i);
      rvec_inc(fshift[CENTRAL],f_j);
      rvec_inc(fshift[t2],f_k);
    }                                           /* 161 TOTAL  */
    /* Time for the bond calculations */
    /* dr2 == 0 would make gmx_invsqrt below divide by zero */
    if (dr2 == 0.0)
      continue;

    vtot  += vbond;  /* 1*/
    fbond *= gmx_invsqrt(dr2);                        /*   6          */
    
    if (g) {
      ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,ak),dt_ik);
      ki=IVEC2IS(dt_ik);
    }
    for (m=0; (m<DIM); m++) {                 /*  15          */
      fik=fbond*r_ik[m];
      f[ai][m]+=fik;
      f[ak][m]-=fik;
      fshift[ki][m]+=fik;
      fshift[CENTRAL][m]-=fik;
    }
  }
  return vtot;
}
 +
/* Quartic angle potential: V(theta) = sum_{j=0..4} c_j*(theta-theta0)^j.
 * Note: no free-energy perturbation here — lambda and *dvdlambda are
 * accepted for interface uniformity but never used.
 * Returns the total potential; accumulates forces in f[] and shift
 * forces in fshift[].
 */
real quartic_angles(int nbonds,
                  const t_iatom forceatoms[],const t_iparams forceparams[],
                  const rvec x[],rvec f[],rvec fshift[],
                  const t_pbc *pbc,const t_graph *g,
                  real lambda,real *dvdlambda,
                  const t_mdatoms *md,t_fcdata *fcd,
                  int *global_atom_index)
{
  int  i,j,ai,aj,ak,t1,t2,type;
  rvec r_ij,r_kj;
  real cos_theta,cos_theta2,theta,dt,dVdt,va,dtp,c,vtot;
  ivec jt,dt_ij,dt_kj;
  
  vtot = 0.0;
  for(i=0; (i<nbonds); ) {
    type = forceatoms[i++];
    ai   = forceatoms[i++];
    aj   = forceatoms[i++];
    ak   = forceatoms[i++];

    theta  = bond_angle(x[ai],x[aj],x[ak],pbc,
                      r_ij,r_kj,&cos_theta,&t1,&t2);  /*  41          */

    dt = theta - forceparams[type].qangle.theta*DEG2RAD; /* 2          */

    /* Horner-like accumulation: when the j-th term is added, dtp holds
     * dt^(j-1) for the derivative and dt^j (after dtp *= dt) for the
     * energy; dVdt accumulates -dV/dtheta.
     */
    dVdt = 0;
    va = forceparams[type].qangle.c[0];
    dtp = 1.0;
    for(j=1; j<=4; j++) {
      c = forceparams[type].qangle.c[j];
      dVdt -= j*c*dtp;
      dtp *= dt;
      va += c*dtp;
    }
    /* 20 */

    vtot += va;
    
    /* Skip the force for exactly straight angles (singular prefactor) */
    cos_theta2 = sqr(cos_theta);                /*   1                */
    if (cos_theta2 < 1) {
      int  m;
      real st,sth;
      real cik,cii,ckk;
      real nrkj2,nrij2;
      rvec f_i,f_j,f_k;
      
      st  = dVdt*gmx_invsqrt(1 - cos_theta2);         /*  12          */
      sth = st*cos_theta;                     /*   1          */
#ifdef DEBUG
      if (debug)
      fprintf(debug,"ANGLES: theta = %10g  vth = %10g  dV/dtheta = %10g\n",
              theta*RAD2DEG,va,dVdt);
#endif
      nrkj2=iprod(r_kj,r_kj);                 /*   5          */
      nrij2=iprod(r_ij,r_ij);
      
      cik=st*gmx_invsqrt(nrkj2*nrij2);                /*  12          */ 
      cii=sth/nrij2;                          /*  10          */
      ckk=sth/nrkj2;                          /*  10          */
      
      for (m=0; (m<DIM); m++) {                       /*  39          */
      f_i[m]=-(cik*r_kj[m]-cii*r_ij[m]);
      f_k[m]=-(cik*r_ij[m]-ckk*r_kj[m]);
      f_j[m]=-f_i[m]-f_k[m];
      f[ai][m]+=f_i[m];
      f[aj][m]+=f_j[m];
      f[ak][m]+=f_k[m];
      }
      if (g) {
      copy_ivec(SHIFT_IVEC(g,aj),jt);
      
      ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
      ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
      t1=IVEC2IS(dt_ij);
      t2=IVEC2IS(dt_kj);
      }
      rvec_inc(fshift[t1],f_i);
      rvec_inc(fshift[CENTRAL],f_j);
      rvec_inc(fshift[t2],f_k);
    }                                           /* 153 TOTAL  */
  }
  return vtot;
}
 +
 +real dih_angle(const rvec xi,const rvec xj,const rvec xk,const rvec xl,
 +               const t_pbc *pbc,
 +               rvec r_ij,rvec r_kj,rvec r_kl,rvec m,rvec n,
 +               real *sign,int *t1,int *t2,int *t3)
 +{
 +  real ipr,phi;
 +
 +  *t1 = pbc_rvec_sub(pbc,xi,xj,r_ij);                 /*  3           */
 +  *t2 = pbc_rvec_sub(pbc,xk,xj,r_kj);                 /*  3           */
 +  *t3 = pbc_rvec_sub(pbc,xk,xl,r_kl);                 /*  3           */
 +
 +  cprod(r_ij,r_kj,m);                         /*  9           */
 +  cprod(r_kj,r_kl,n);                 /*  9           */
 +  phi=gmx_angle(m,n);                         /* 49 (assuming 25 for atan2) */
 +  ipr=iprod(r_ij,n);                  /*  5           */
 +  (*sign)=(ipr<0.0)?-1.0:1.0;
 +  phi=(*sign)*phi;                    /*  1           */
 +                                      /* 82 TOTAL     */
 +  return phi;
 +}
 +
 +
#ifdef SSE_PROPER_DIHEDRALS

/* x86 SIMD inner-product of 4 float vectors */
#define GMX_MM_IPROD_PS(ax,ay,az,bx,by,bz)                 \
    _mm_add_ps(_mm_add_ps(_mm_mul_ps(ax,bx),_mm_mul_ps(ay,by)),_mm_mul_ps(az,bz))

/* x86 SIMD norm^2 of 4 float vectors */
#define GMX_MM_NORM2_PS(ax,ay,az) GMX_MM_IPROD_PS(ax,ay,az,ax,ay,az)

/* x86 SIMD cross-product of 4 float vectors */
#define GMX_MM_CPROD_PS(ax,ay,az,bx,by,bz,cx,cy,cz)        \
{                                                          \
    cx = _mm_sub_ps(_mm_mul_ps(ay,bz),_mm_mul_ps(az,by));  \
    cy = _mm_sub_ps(_mm_mul_ps(az,bx),_mm_mul_ps(ax,bz));  \
    cz = _mm_sub_ps(_mm_mul_ps(ax,by),_mm_mul_ps(ay,bx));  \
}

/* load 4 rvec's into 3 x86 SIMD float registers.
 * Each _mm_load_ps reads 4 floats starting at an rvec, so r0..r3 must
 * point into 16-byte aligned rvec_sse_t entries (see below); the
 * transpose converts from 4 x xyz layout to x4/y4/z4 layout.
 */
#define load_rvec4(r0,r1,r2,r3,rx_SSE,ry_SSE,rz_SSE)          \
{                                                             \
    __m128 tmp;                                               \
    rx_SSE = _mm_load_ps(r0);                                 \
    ry_SSE = _mm_load_ps(r1);                                 \
    rz_SSE = _mm_load_ps(r2);                                 \
    tmp    = _mm_load_ps(r3);                                 \
    _MM_TRANSPOSE4_PS(rx_SSE,ry_SSE,rz_SSE,tmp);              \
}

/* Inverse of load_rvec4: x4/y4/z4 registers back to 4 rvec's */
#define store_rvec4(rx_SSE,ry_SSE,rz_SSE,r0,r1,r2,r3)         \
{                                                             \
    __m128 tmp=_mm_setzero_ps();                              \
    _MM_TRANSPOSE4_PS(rx_SSE,ry_SSE,rz_SSE,tmp);              \
    _mm_store_ps(r0,rx_SSE);                                  \
    _mm_store_ps(r1,ry_SSE);                                  \
    _mm_store_ps(r2,rz_SSE);                                  \
    _mm_store_ps(r3,tmp   );                                  \
}

/* An rvec in a structure which can be allocated 16-byte aligned;
 * the trailing float pads the rvec to 16 bytes so the 4-float
 * SIMD loads/stores above stay within the struct.
 */
typedef struct {
    rvec  v;
    float f;
} rvec_sse_t;

/* As dih_angle above, but calculates 4 dihedral angles at once using SSE,
 * also calculates the pre-factor required for the dihedral force update.
 * Note that bv and buf should be 16-byte aligned.
 * On return:
 *   buf[0..3]   nrkj_m2 = |r_kj|/|m|^2 (pre-factor for the i-side force)
 *   buf[4..7]   nrkj_n2 = |r_kj|/|n|^2 (pre-factor for the l-side force)
 *   buf[8..11]  p = (r_ij.r_kj)/|r_kj|^2
 *   buf[12..15] q = (r_kl.r_kj)/|r_kj|^2
 *   buf[16..19] the four signed dihedral angles
 *   bv[0..3].v  the plane normals m, bv[4..7].v the plane normals n
 */
static void
dih_angle_sse(const rvec *x,
              int ai[4],int aj[4],int ak[4],int al[4],
              const t_pbc *pbc,
              int t1[4],int t2[4],int t3[4],
              rvec_sse_t *bv,
              real *buf)
{
    int s;
    __m128 rijx_SSE,rijy_SSE,rijz_SSE;
    __m128 rkjx_SSE,rkjy_SSE,rkjz_SSE;
    __m128 rklx_SSE,rkly_SSE,rklz_SSE;
    __m128 mx_SSE,my_SSE,mz_SSE;
    __m128 nx_SSE,ny_SSE,nz_SSE;
    __m128 cx_SSE,cy_SSE,cz_SSE;
    __m128 cn_SSE;
    __m128 s_SSE;
    __m128 phi_SSE;
    __m128 ipr_SSE;
    int signs;
    __m128 iprm_SSE,iprn_SSE;
    __m128 nrkj2_SSE,nrkj_1_SSE,nrkj_2_SSE,nrkj_SSE;
    __m128 nrkj_m2_SSE,nrkj_n2_SSE;
    __m128 p_SSE,q_SSE;
    __m128 fmin_SSE=_mm_set1_ps(GMX_FLOAT_MIN);

    /* Scalar PBC subtractions; bond vectors go into bv for SIMD loads */
    for(s=0; s<4; s++)
    {
        t1[s] = pbc_rvec_sub(pbc,x[ai[s]],x[aj[s]],bv[0+s].v);
        t2[s] = pbc_rvec_sub(pbc,x[ak[s]],x[aj[s]],bv[4+s].v);
        t3[s] = pbc_rvec_sub(pbc,x[ak[s]],x[al[s]],bv[8+s].v);
    }

    load_rvec4(bv[0].v,bv[1].v,bv[2].v,bv[3].v,rijx_SSE,rijy_SSE,rijz_SSE);
    load_rvec4(bv[4].v,bv[5].v,bv[6].v,bv[7].v,rkjx_SSE,rkjy_SSE,rkjz_SSE);
    load_rvec4(bv[8].v,bv[9].v,bv[10].v,bv[11].v,rklx_SSE,rkly_SSE,rklz_SSE);

    GMX_MM_CPROD_PS(rijx_SSE,rijy_SSE,rijz_SSE,
                    rkjx_SSE,rkjy_SSE,rkjz_SSE,
                    mx_SSE,my_SSE,mz_SSE);

    GMX_MM_CPROD_PS(rkjx_SSE,rkjy_SSE,rkjz_SSE,
                    rklx_SSE,rkly_SSE,rklz_SSE,
                    nx_SSE,ny_SSE,nz_SSE);

    GMX_MM_CPROD_PS(mx_SSE,my_SSE,mz_SSE,
                    nx_SSE,ny_SSE,nz_SSE,
                    cx_SSE,cy_SSE,cz_SSE);

    cn_SSE = gmx_mm_sqrt_ps(GMX_MM_NORM2_PS(cx_SSE,cy_SSE,cz_SSE));
    
    s_SSE = GMX_MM_IPROD_PS(mx_SSE,my_SSE,mz_SSE,nx_SSE,ny_SSE,nz_SSE);

    /* atan2(|m x n|, m.n) gives the unsigned angle between the planes */
    phi_SSE = gmx_mm_atan2_ps(cn_SSE,s_SSE);
    _mm_store_ps(buf+16,phi_SSE);

    ipr_SSE = GMX_MM_IPROD_PS(rijx_SSE,rijy_SSE,rijz_SSE,
                              nx_SSE,ny_SSE,nz_SSE);

    /* The sign bits of r_ij.n determine the sign of each angle */
    signs = _mm_movemask_ps(ipr_SSE);
    
    for(s=0; s<4; s++)
    {
        if (signs & (1<<s))
        {
            buf[16+s] = -buf[16+s];
        }
    }

    iprm_SSE    = GMX_MM_NORM2_PS(mx_SSE,my_SSE,mz_SSE);
    iprn_SSE    = GMX_MM_NORM2_PS(nx_SSE,ny_SSE,nz_SSE);

    /* store_rvec4 messes with the input, don't use it after this! */
    store_rvec4(mx_SSE,my_SSE,mz_SSE,bv[0].v,bv[1].v,bv[2].v,bv[3].v);
    store_rvec4(nx_SSE,ny_SSE,nz_SSE,bv[4].v,bv[5].v,bv[6].v,bv[7].v);

    nrkj2_SSE   = GMX_MM_NORM2_PS(rkjx_SSE,rkjy_SSE,rkjz_SSE);

    /* Avoid division by zero. When zero, the result is multiplied by 0
     * anyhow, so the 3 max below do not affect the final result.
     */
    nrkj2_SSE   = _mm_max_ps(nrkj2_SSE,fmin_SSE);
    nrkj_1_SSE  = gmx_mm_invsqrt_ps(nrkj2_SSE);
    nrkj_2_SSE  = _mm_mul_ps(nrkj_1_SSE,nrkj_1_SSE);
    nrkj_SSE    = _mm_mul_ps(nrkj2_SSE,nrkj_1_SSE);

    iprm_SSE    = _mm_max_ps(iprm_SSE,fmin_SSE);
    iprn_SSE    = _mm_max_ps(iprn_SSE,fmin_SSE);
    nrkj_m2_SSE = _mm_mul_ps(nrkj_SSE,gmx_mm_inv_ps(iprm_SSE));
    nrkj_n2_SSE = _mm_mul_ps(nrkj_SSE,gmx_mm_inv_ps(iprn_SSE));

    _mm_store_ps(buf+0,nrkj_m2_SSE);
    _mm_store_ps(buf+4,nrkj_n2_SSE);

    p_SSE       = GMX_MM_IPROD_PS(rijx_SSE,rijy_SSE,rijz_SSE,
                                  rkjx_SSE,rkjy_SSE,rkjz_SSE);
    p_SSE       = _mm_mul_ps(p_SSE,nrkj_2_SSE);

    q_SSE       = GMX_MM_IPROD_PS(rklx_SSE,rkly_SSE,rklz_SSE,
                                  rkjx_SSE,rkjy_SSE,rkjz_SSE);
    q_SSE       = _mm_mul_ps(q_SSE,nrkj_2_SSE);

    _mm_store_ps(buf+8 ,p_SSE);
    _mm_store_ps(buf+12,q_SSE);
}

#endif /* SSE_PROPER_DIHEDRALS */
 +
 +
/* Spread the dihedral force over the four atoms i,j,k,l and accumulate
 * the corresponding shift forces.
 * ddphi is the angle-derivative factor from the dopdihs* routines;
 * m,n are the plane normals and r_ij,r_kj,r_kl the bond vectors from
 * dih_angle; t1,t2,t3 are the matching shift indices, recomputed from
 * the shift graph when g != NULL. When either plane normal is (nearly)
 * zero — three collinear atoms, undefined dihedral — no force is applied.
 */
void do_dih_fup(int i,int j,int k,int l,real ddphi,
              rvec r_ij,rvec r_kj,rvec r_kl,
              rvec m,rvec n,rvec f[],rvec fshift[],
              const t_pbc *pbc,const t_graph *g,
              const rvec x[],int t1,int t2,int t3)
{
  /* 143 FLOPS */
  rvec f_i,f_j,f_k,f_l;
  rvec uvec,vvec,svec,dx_jl;
  real iprm,iprn,nrkj,nrkj2,nrkj_1,nrkj_2;
  real a,b,p,q,toler;
  ivec jt,dt_ij,dt_kj,dt_lj;  
  
  iprm  = iprod(m,m);         /*  5   */
  iprn  = iprod(n,n);         /*  5   */
  nrkj2 = iprod(r_kj,r_kj);   /*  5   */
  /* Tolerance scales with |r_kj|^2 so the collinearity test is
   * independent of the overall length scale.
   */
  toler = nrkj2*GMX_REAL_EPS;
  if ((iprm > toler) && (iprn > toler)) {
    nrkj_1 = gmx_invsqrt(nrkj2);      /* 10   */
    nrkj_2 = nrkj_1*nrkj_1;   /*  1   */
    nrkj  = nrkj2*nrkj_1;     /*  1   */
    a     = -ddphi*nrkj/iprm; /* 11   */
    svmul(a,m,f_i);           /*  3   */
    b     = ddphi*nrkj/iprn;  /* 11   */
    svmul(b,n,f_l);           /*  3   */
    p     = iprod(r_ij,r_kj); /*  5   */
    p    *= nrkj_2;           /*  1   */
    q     = iprod(r_kl,r_kj); /*  5   */
    q    *= nrkj_2;           /*  1   */
    svmul(p,f_i,uvec);                /*  3   */
    svmul(q,f_l,vvec);                /*  3   */
    rvec_sub(uvec,vvec,svec); /*  3   */
    rvec_sub(f_i,svec,f_j);   /*  3   */
    rvec_add(f_l,svec,f_k);   /*  3   */
    rvec_inc(f[i],f_i);       /*  3   */
    rvec_dec(f[j],f_j);       /*  3   */
    rvec_dec(f[k],f_k);       /*  3   */
    rvec_inc(f[l],f_l);       /*  3   */
    
    /* Determine the shift indices for the shift-force accumulation */
    if (g) {
      copy_ivec(SHIFT_IVEC(g,j),jt);
      ivec_sub(SHIFT_IVEC(g,i),jt,dt_ij);
      ivec_sub(SHIFT_IVEC(g,k),jt,dt_kj);
      ivec_sub(SHIFT_IVEC(g,l),jt,dt_lj);
      t1=IVEC2IS(dt_ij);
      t2=IVEC2IS(dt_kj);
      t3=IVEC2IS(dt_lj);
    } else if (pbc) {
      t3 = pbc_rvec_sub(pbc,x[l],x[j],dx_jl);
    } else {
      t3 = CENTRAL;
    }
    
    rvec_inc(fshift[t1],f_i);
    rvec_dec(fshift[CENTRAL],f_j);
    rvec_dec(fshift[t2],f_k);
    rvec_inc(fshift[t3],f_l);
  }
  /* 112 TOTAL        */
}
 +
 +/* As do_dih_fup above, but without shift forces */
 +static void
 +do_dih_fup_noshiftf(int i,int j,int k,int l,real ddphi,
 +                    rvec r_ij,rvec r_kj,rvec r_kl,
 +                    rvec m,rvec n,rvec f[])
 +{
 +  rvec f_i,f_j,f_k,f_l;
 +  rvec uvec,vvec,svec,dx_jl;
 +  real iprm,iprn,nrkj,nrkj2,nrkj_1,nrkj_2;
 +  real a,b,p,q,toler;
 +  ivec jt,dt_ij,dt_kj,dt_lj;  
 +  
 +  iprm  = iprod(m,m);         /*  5   */
 +  iprn  = iprod(n,n);         /*  5   */
 +  nrkj2 = iprod(r_kj,r_kj);   /*  5   */
 +  toler = nrkj2*GMX_REAL_EPS;
 +  if ((iprm > toler) && (iprn > toler)) {
 +    nrkj_1 = gmx_invsqrt(nrkj2);      /* 10   */
 +    nrkj_2 = nrkj_1*nrkj_1;   /*  1   */
 +    nrkj  = nrkj2*nrkj_1;     /*  1   */
 +    a     = -ddphi*nrkj/iprm; /* 11   */
 +    svmul(a,m,f_i);           /*  3   */
 +    b     = ddphi*nrkj/iprn;  /* 11   */
 +    svmul(b,n,f_l);           /*  3   */
 +    p     = iprod(r_ij,r_kj); /*  5   */
 +    p    *= nrkj_2;           /*  1   */
 +    q     = iprod(r_kl,r_kj); /*  5   */
 +    q    *= nrkj_2;           /*  1   */
 +    svmul(p,f_i,uvec);                /*  3   */
 +    svmul(q,f_l,vvec);                /*  3   */
 +    rvec_sub(uvec,vvec,svec); /*  3   */
 +    rvec_sub(f_i,svec,f_j);   /*  3   */
 +    rvec_add(f_l,svec,f_k);   /*  3   */
 +    rvec_inc(f[i],f_i);       /*  3   */
 +    rvec_dec(f[j],f_j);       /*  3   */
 +    rvec_dec(f[k],f_k);       /*  3   */
 +    rvec_inc(f[l],f_l);       /*  3   */
 +  }
 +}
 +
 +/* As do_dih_fup_noshiftf above, but with pre-calculated pre-factors */
 +static void
 +do_dih_fup_noshiftf_precalc(int i,int j,int k,int l,real ddphi,
 +                            real nrkj_m2,real nrkj_n2,
 +                            real p,real q,
 +                            rvec m,rvec n,rvec f[])
 +{
 +    rvec f_i,f_j,f_k,f_l;
 +    rvec uvec,vvec,svec,dx_jl;
 +    real a,b,toler;
 +    ivec jt,dt_ij,dt_kj,dt_lj;  
 +  
 +    a = -ddphi*nrkj_m2;
 +    svmul(a,m,f_i);
 +    b =  ddphi*nrkj_n2;
 +    svmul(b,n,f_l);
 +    svmul(p,f_i,uvec);
 +    svmul(q,f_l,vvec);
 +    rvec_sub(uvec,vvec,svec);
 +    rvec_sub(f_i,svec,f_j);
 +    rvec_add(f_l,svec,f_k);
 +    rvec_inc(f[i],f_i);
 +    rvec_dec(f[j],f_j);
 +    rvec_dec(f[k],f_k);
 +    rvec_inc(f[l],f_l);
 +}
 +
 +
 +real dopdihs(real cpA,real cpB,real phiA,real phiB,int mult,
 +           real phi,real lambda,real *V,real *F)
 +{
 +  real v,dvdlambda,mdphi,v1,sdphi,ddphi;
 +  real L1   = 1.0 - lambda;
 +  real ph0  = (L1*phiA + lambda*phiB)*DEG2RAD;
 +  real dph0 = (phiB - phiA)*DEG2RAD;
 +  real cp   = L1*cpA + lambda*cpB;
 +  
 +  mdphi =  mult*phi - ph0;
 +  sdphi = sin(mdphi);
 +  ddphi = -cp*mult*sdphi;
 +  v1    = 1.0 + cos(mdphi);
 +  v     = cp*v1;
 +  
 +  dvdlambda  = (cpB - cpA)*v1 + cp*dph0*sdphi;
 +  
 +  *V = v;
 +  *F = ddphi;
 +  
 +  return dvdlambda;
 +  
 +  /* That was 40 flops */
 +}
 +
 +static void
 +dopdihs_noener(real cpA,real cpB,real phiA,real phiB,int mult,
 +               real phi,real lambda,real *F)
 +{
 +  real mdphi,sdphi,ddphi;
 +  real L1   = 1.0 - lambda;
 +  real ph0  = (L1*phiA + lambda*phiB)*DEG2RAD;
 +  real cp   = L1*cpA + lambda*cpB;
 +  
 +  mdphi = mult*phi - ph0;
 +  sdphi = sin(mdphi);
 +  ddphi = -cp*mult*sdphi;
 +  
 +  *F = ddphi;
 +  
 +  /* That was 20 flops */
 +}
 +
 +static void
 +dopdihs_mdphi(real cpA,real cpB,real phiA,real phiB,int mult,
 +              real phi,real lambda,real *cp,real *mdphi)
 +{
 +    real L1   = 1.0 - lambda;
 +    real ph0  = (L1*phiA + lambda*phiB)*DEG2RAD;
 +
 +    *cp    = L1*cpA + lambda*cpB;
 +
 +    *mdphi = mult*phi - ph0;
 +}
 +
 +static real dopdihs_min(real cpA,real cpB,real phiA,real phiB,int mult,
 +                      real phi,real lambda,real *V,real *F)
 +     /* similar to dopdihs, except for a minus sign  *
 +      * and a different treatment of mult/phi0       */
 +{
 +  real v,dvdlambda,mdphi,v1,sdphi,ddphi;
 +  real L1   = 1.0 - lambda;
 +  real ph0  = (L1*phiA + lambda*phiB)*DEG2RAD;
 +  real dph0 = (phiB - phiA)*DEG2RAD;
 +  real cp   = L1*cpA + lambda*cpB;
 +  
 +  mdphi = mult*(phi-ph0);
 +  sdphi = sin(mdphi);
 +  ddphi = cp*mult*sdphi;
 +  v1    = 1.0-cos(mdphi);
 +  v     = cp*v1;
 +  
 +  dvdlambda  = (cpB-cpA)*v1 + cp*dph0*sdphi;
 +  
 +  *V = v;
 +  *F = ddphi;
 +  
 +  return dvdlambda;
 +  
 +  /* That was 40 flops */
 +}
 +
 +real pdihs(int nbonds,
 +         const t_iatom forceatoms[],const t_iparams forceparams[],
 +         const rvec x[],rvec f[],rvec fshift[],
 +         const t_pbc *pbc,const t_graph *g,
 +         real lambda,real *dvdlambda,
 +         const t_mdatoms *md,t_fcdata *fcd,
 +         int *global_atom_index)
 +{
 +  int  i,type,ai,aj,ak,al;
 +  int  t1,t2,t3;
 +  rvec r_ij,r_kj,r_kl,m,n;
 +  real phi,sign,ddphi,vpd,vtot;
 +
 +  vtot = 0.0;
 +
 +  for(i=0; (i<nbonds); ) {
 +    type = forceatoms[i++];
 +    ai   = forceatoms[i++];
 +    aj   = forceatoms[i++];
 +    ak   = forceatoms[i++];
 +    al   = forceatoms[i++];
 +    
 +    phi=dih_angle(x[ai],x[aj],x[ak],x[al],pbc,r_ij,r_kj,r_kl,m,n,
 +                  &sign,&t1,&t2,&t3);                 /*  84          */
 +    *dvdlambda += dopdihs(forceparams[type].pdihs.cpA,
 +                          forceparams[type].pdihs.cpB,
 +                          forceparams[type].pdihs.phiA,
 +                          forceparams[type].pdihs.phiB,
 +                          forceparams[type].pdihs.mult,
 +                          phi,lambda,&vpd,&ddphi);
 +
 +    vtot += vpd;
 +    do_dih_fup(ai,aj,ak,al,ddphi,r_ij,r_kj,r_kl,m,n,
 +             f,fshift,pbc,g,x,t1,t2,t3);                      /* 112          */
 +
 +#ifdef DEBUG
 +    fprintf(debug,"pdih: (%d,%d,%d,%d) phi=%g\n",
 +          ai,aj,ak,al,phi);
 +#endif
 +  } /* 223 TOTAL      */
 +
 +  return vtot;
 +}
 +
 +void make_dp_periodic(real *dp)  /* 1 flop? */
 +{
 +    /* dp cannot be outside (-pi,pi) */
 +    if (*dp >= M_PI)
 +    {
 +        *dp -= 2*M_PI;
 +    }
 +    else if (*dp < -M_PI)
 +    {
 +        *dp += 2*M_PI;
 +    }
 +    return;
 +}
 +
/* As pdihs above, but without calculating energies and shift forces.
 * Consecutive dihedral terms acting on the same atom quadruplet
 * (e.g. multiple multiplicities on one torsion) are grouped: the
 * angle is computed once and the summed derivative is applied once.
 */
static void
pdihs_noener(int nbonds,
             const t_iatom forceatoms[],const t_iparams forceparams[],
             const rvec x[],rvec f[],
             const t_pbc *pbc,const t_graph *g,
             real lambda,
             const t_mdatoms *md,t_fcdata *fcd,
             int *global_atom_index)
{
    int  i,type,ai,aj,ak,al;
    int  t1,t2,t3;
    rvec r_ij,r_kj,r_kl,m,n;
    real phi,sign,ddphi_tot,ddphi;

    for(i=0; (i<nbonds); )
    {
        /* Peek at the atoms of the term at index i (stride 5:
         * type, ai, aj, ak, al); i is advanced inside the do-loop.
         */
        ai   = forceatoms[i+1];
        aj   = forceatoms[i+2];
        ak   = forceatoms[i+3];
        al   = forceatoms[i+4];

        phi = dih_angle(x[ai],x[aj],x[ak],x[al],pbc,r_ij,r_kj,r_kl,m,n,
                        &sign,&t1,&t2,&t3);

        ddphi_tot = 0;

        /* Loop over dihedrals working on the same atoms,
         * so we avoid recalculating angles and force distributions.
         */
        do
        {
            type = forceatoms[i];
            dopdihs_noener(forceparams[type].pdihs.cpA,
                           forceparams[type].pdihs.cpB,
                           forceparams[type].pdihs.phiA,
                           forceparams[type].pdihs.phiB,
                           forceparams[type].pdihs.mult,
                           phi,lambda,&ddphi);
            ddphi_tot += ddphi;

            i += 5;
        }
        while(i < nbonds &&
              forceatoms[i+1] == ai &&
              forceatoms[i+2] == aj &&
              forceatoms[i+3] == ak &&
              forceatoms[i+4] == al);

        /* Apply the combined derivative of the whole group */
        do_dih_fup_noshiftf(ai,aj,ak,al,ddphi_tot,r_ij,r_kj,r_kl,m,n,f);
    }
}
 +
 +
#ifdef SSE_PROPER_DIHEDRALS

/* As pdihs_noener above, but using SSE to calculate 4 dihedrals at once.
 * Note that unlike pdihs_noener this kernel does not group terms that
 * share an atom quadruplet; each term is processed individually.
 */
static void
pdihs_noener_sse(int nbonds,
                 const t_iatom forceatoms[],const t_iparams forceparams[],
                 const rvec x[],rvec f[],
                 const t_pbc *pbc,const t_graph *g,
                 real lambda,
                 const t_mdatoms *md,t_fcdata *fcd,
                 int *global_atom_index)
{
    int  i,i4,s;
    int  type,ai[4],aj[4],ak[4],al[4];
    int  t1[4],t2[4],t3[4];
    int  mult[4];
    real cp[4],mdphi[4];
    real ddphi;
    rvec_sse_t rs_array[13],*rs;
    real buf_array[24],*buf;
    __m128 mdphi_SSE,sin_SSE,cos_SSE;

    /* Ensure 16-byte alignment by rounding the stack arrays up to the
     * next 16-byte boundary (they are over-allocated by one entry).
     */
    rs  = (rvec_sse_t *)(((size_t)(rs_array +1)) & (~((size_t)15)));
    buf =      (float *)(((size_t)(buf_array+3)) & (~((size_t)15)));

    for(i=0; (i<nbonds); i+=20)
    {
        /* Collect atoms quadruplets for 4 dihedrals */
        i4 = i;
        for(s=0; s<4; s++)
        {
            ai[s] = forceatoms[i4+1];
            aj[s] = forceatoms[i4+2];
            ak[s] = forceatoms[i4+3];
            al[s] = forceatoms[i4+4];
            /* At the end fill the arrays with identical entries:
             * i4 stops advancing, so the last real quadruplet repeats.
             */
            if (i4 + 5 < nbonds)
            {
                i4 += 5;
            }
        }

        /* Calculate 4 dihedral angles at once; angles land in
         * buf[16..19], pre-factors in buf[0..15], normals in rs.
         */
        dih_angle_sse(x,ai,aj,ak,al,pbc,t1,t2,t3,rs,buf);

        i4 = i;
        for(s=0; s<4; s++)
        {
            if (i4 < nbonds)
            {
                /* Calculate the coefficient and angle deviation */
                type = forceatoms[i4];
                dopdihs_mdphi(forceparams[type].pdihs.cpA,
                              forceparams[type].pdihs.cpB,
                              forceparams[type].pdihs.phiA,
                              forceparams[type].pdihs.phiB,
                              forceparams[type].pdihs.mult,
                              buf[16+s],lambda,&cp[s],&buf[16+s]);
                mult[s] = forceparams[type].pdihs.mult;
            }
            else
            {
                /* Padding entry: zero mdphi; it is also excluded from
                 * the force loop below.
                 */
                buf[16+s] = 0;
            }
            i4 += 5;
        }

        /* Calculate 4 sines at once */
        mdphi_SSE = _mm_load_ps(buf+16);
        gmx_mm_sincos_ps(mdphi_SSE,&sin_SSE,&cos_SSE);
        _mm_store_ps(buf+16,sin_SSE);

        i4 = i;
        s = 0;
        do
        {
            ddphi = -cp[s]*mult[s]*buf[16+s];

            do_dih_fup_noshiftf_precalc(ai[s],aj[s],ak[s],al[s],ddphi,
                                        buf[ 0+s],buf[ 4+s],
                                        buf[ 8+s],buf[12+s],
                                        rs[0+s].v,rs[4+s].v,
                                        f);
            s++;
            i4 += 5;
        }
        while (s < 4 && i4 < nbonds);
    }
}

#endif /* SSE_PROPER_DIHEDRALS */
 +
 +
/* Harmonic improper dihedrals: V = 0.5*kk*(phi-phi0)^2 with kk and
 * phi0 lambda-interpolated between the A and B states.
 * The deviation is wrapped into (-pi,pi) by make_dp_periodic() so the
 * potential stays continuous when phi0 is near +/-pi.
 * Returns the total potential; accumulates forces, shift forces and
 * dV/dlambda.
 */
real idihs(int nbonds,
         const t_iatom forceatoms[],const t_iparams forceparams[],
         const rvec x[],rvec f[],rvec fshift[],
         const t_pbc *pbc,const t_graph *g,
         real lambda,real *dvdlambda,
         const t_mdatoms *md,t_fcdata *fcd,
         int *global_atom_index)
{
  int  i,type,ai,aj,ak,al;
  int  t1,t2,t3;
  real phi,phi0,dphi0,ddphi,sign,vtot;
  rvec r_ij,r_kj,r_kl,m,n;
  real L1,kk,dp,dp2,kA,kB,pA,pB,dvdl_term;

  L1 = 1.0-lambda;
  dvdl_term = 0;
  vtot = 0.0;
  for(i=0; (i<nbonds); ) {
    type = forceatoms[i++];
    ai   = forceatoms[i++];
    aj   = forceatoms[i++];
    ak   = forceatoms[i++];
    al   = forceatoms[i++];
    
    phi=dih_angle(x[ai],x[aj],x[ak],x[al],pbc,r_ij,r_kj,r_kl,m,n,
                  &sign,&t1,&t2,&t3);                 /*  84          */
    
    /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
     * force changes if we just apply a normal harmonic.
     * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
     * This means we will never have the periodicity problem, unless
     * the dihedral is Pi away from phiO, which is very unlikely due to
     * the potential.
     */
    kA = forceparams[type].harmonic.krA;
    kB = forceparams[type].harmonic.krB;
    pA = forceparams[type].harmonic.rA;
    pB = forceparams[type].harmonic.rB;

    kk    = L1*kA + lambda*kB;
    phi0  = (L1*pA + lambda*pB)*DEG2RAD;
    dphi0 = (pB - pA)*DEG2RAD;

    dp = phi-phi0;  

    make_dp_periodic(&dp);
    
    dp2 = dp*dp;

    vtot += 0.5*kk*dp2;
    ddphi = -kk*dp;
    
    dvdl_term += 0.5*(kB - kA)*dp2 - kk*dphi0*dp;

    /* ddphi = -kk*dp here; the sign is flipped when passing it on */
    do_dih_fup(ai,aj,ak,al,(real)(-ddphi),r_ij,r_kj,r_kl,m,n,
             f,fshift,pbc,g,x,t1,t2,t3);                      /* 112          */
    /* 218 TOTAL      */
#ifdef DEBUG
    if (debug)
      fprintf(debug,"idih: (%d,%d,%d,%d) phi=%g\n",
            ai,aj,ak,al,phi);
#endif
  }
  
  *dvdlambda += dvdl_term;
  return vtot;
}
 +
 +
/*! \brief returns dx, rdist, and dpdl for functions posres() and fbposres()
 *
 * x            current atom position
 * pos0A/pos0B  reference positions for the A and B states
 * comA_sc/comB_sc  scaled COM reference (used only for erscCOM)
 * refcoord_scaling  one of erscNO, erscALL, erscCOM (how reference
 *                   coordinates respond to box scaling)
 * dx           PBC-corrected x - reference position
 * rdist        the part of the reference kept out of the PBC image
 *              selection (see the comment before pbc_dx below)
 * dpdl         derivative of the reference position w.r.t. lambda
 */
static void posres_dx(const rvec x, const rvec pos0A, const rvec pos0B,
                      const rvec comA_sc, const rvec comB_sc,
                      real lambda,
                      t_pbc *pbc, int refcoord_scaling,int npbcdim,
                      rvec dx, rvec rdist, rvec dpdl)
{
    int m,d;
    real posA, posB, L1, ref=0.;
    rvec pos;

    L1=1.0-lambda;

    for(m=0; m<DIM; m++)
    {
        posA = pos0A[m];
        posB = pos0B[m];
        if (m < npbcdim)
        {
            switch (refcoord_scaling)
            {
            case erscNO:
                /* Reference is used as-is; keep it all in rdist */
                ref      = 0;
                rdist[m] = L1*posA + lambda*posB;
                dpdl[m]  = posB - posA;
                    break;
            case erscALL:
                /* Box relative coordinates are stored for dimensions with pbc */
                posA *= pbc->box[m][m];
                posB *= pbc->box[m][m];
                for(d=m+1; d<npbcdim; d++)
                {
                    posA += pos0A[d]*pbc->box[d][m];
                    posB += pos0B[d]*pbc->box[d][m];
                }
                ref      = L1*posA + lambda*posB;
                rdist[m] = 0;
                dpdl[m]  = posB - posA;
                break;
            case erscCOM:
                /* Scaled COM plus the stored offset from it */
                ref      = L1*comA_sc[m] + lambda*comB_sc[m];
                rdist[m] = L1*posA       + lambda*posB;
                dpdl[m]  = comB_sc[m] - comA_sc[m] + posB - posA;
                break;
            default:
                gmx_fatal(FARGS, "No such scaling method implemented");
            }
        }
        else
        {
            /* Non-periodic dimension: no scaling applies */
            ref      = L1*posA + lambda*posB;
            rdist[m] = 0;
            dpdl[m]  = posB - posA;
        }

        /* We do pbc_dx with ref+rdist,
         * since with only ref we can be up to half a box vector wrong.
         */
        pos[m] = ref + rdist[m];
    }

    if (pbc)
    {
        pbc_dx(pbc,x,pos,dx);
    }
    else
    {
        rvec_sub(x,pos,dx);
    }
}
 +
 +/*! \brief Adds forces of flat-bottomed positions restraints to f[]
 + *         and fixes vir_diag. Returns the flat-bottomed potential. */
 +real fbposres(int nbonds,
 +              const t_iatom forceatoms[],const t_iparams forceparams[],
 +              const rvec x[],rvec f[],rvec vir_diag,
 +              t_pbc *pbc,
 +              int refcoord_scaling,int ePBC,rvec com)
 +/* compute flat-bottomed positions restraints */
 +{
 +    int  i,ai,m,d,type,npbcdim=0,fbdim;
 +    const t_iparams *pr;
 +    real vtot,kk,v;
 +    real ref=0,dr,dr2,rpot,rfb,rfb2,fact,invdr;
 +    rvec com_sc,rdist,pos,dx,dpdl,fm;
 +    gmx_bool bInvert;
 +
 +    npbcdim = ePBC2npbcdim(ePBC);
 +
 +    if (refcoord_scaling == erscCOM)
 +    {
 +        /* Pre-scale the center of mass to box-relative coordinates */
 +        clear_rvec(com_sc);
 +        for(m=0; m<npbcdim; m++)
 +        {
 +            for(d=m; d<npbcdim; d++)
 +            {
 +                com_sc[m] += com[d]*pbc->box[d][m];
 +            }
 +        }
 +    }
 +
 +    vtot = 0.0;
 +    for(i=0; (i<nbonds); )
 +    {
 +        type = forceatoms[i++];
 +        ai   = forceatoms[i++];
 +        pr   = &forceparams[type];
 +
 +        /* same calculation as for normal posres, but with identical A and B states, and lambda==0 */
 +        posres_dx(x[ai],forceparams[type].fbposres.pos0, forceparams[type].fbposres.pos0,
 +                  com_sc, com_sc, 0.0,
 +                  pbc, refcoord_scaling, npbcdim,
 +                  dx, rdist, dpdl);
 +
 +        clear_rvec(fm);
 +        v=0.0;
 +
 +        kk=pr->fbposres.k;
 +        rfb=pr->fbposres.r;
 +        rfb2=sqr(rfb);
 +
 +        /* with rfb<0, push particle out of the sphere/cylinder/layer */
 +        bInvert=FALSE;
 +        if (rfb<0.){
 +            bInvert=TRUE;
 +            rfb=-rfb;
 +        }
 +
 +        switch (pr->fbposres.geom)
 +        {
 +        case efbposresSPHERE:
 +            /* spherical flat-bottom posres */
 +            dr2=norm2(dx);
 +            if ( dr2 > 0.0 &&
 +                 ( (dr2 > rfb2 && bInvert==FALSE ) || (dr2 < rfb2 && bInvert==TRUE ) )
 +                )
 +            {
 +                dr=sqrt(dr2);
 +                v = 0.5*kk*sqr(dr - rfb);
 +                fact = -kk*(dr-rfb)/dr;  /* Force pointing to the center pos0 */
 +                svmul(fact,dx,fm);
 +            }
 +            break;
 +        case efbposresCYLINDER:
 +            /* cylidrical flat-bottom posres in x-y plane. fm[ZZ] = 0. */
 +            dr2=sqr(dx[XX])+sqr(dx[YY]);
 +            if  ( dr2 > 0.0 &&
 +                  ( (dr2 > rfb2 && bInvert==FALSE ) || (dr2 < rfb2 && bInvert==TRUE ) )
 +                )
 +            {
 +                dr=sqrt(dr2);
 +                invdr=1./dr;
 +                v = 0.5*kk*sqr(dr - rfb);
 +                fm[XX] = -kk*(dr-rfb)*dx[XX]*invdr;  /* Force pointing to the center */
 +                fm[YY] = -kk*(dr-rfb)*dx[YY]*invdr;
 +            }
 +            break;
 +        case efbposresX: /* fbdim=XX */
 +        case efbposresY: /* fbdim=YY */
 +        case efbposresZ: /* fbdim=ZZ */
 +            /* 1D flat-bottom potential */
 +            fbdim = pr->fbposres.geom - efbposresX;
 +            dr=dx[fbdim];
 +            if ( ( dr>rfb && bInvert==FALSE ) || ( 0<dr && dr<rfb && bInvert==TRUE )  )
 +            {
 +                v = 0.5*kk*sqr(dr - rfb);
 +                fm[fbdim] = -kk*(dr - rfb);
 +            }
 +            else if ( (dr < (-rfb) && bInvert==FALSE ) || ( (-rfb)<dr && dr<0 && bInvert==TRUE ))
 +            {
 +                v = 0.5*kk*sqr(dr + rfb);
 +                fm[fbdim] = -kk*(dr + rfb);
 +            }
 +            break;
 +        /* NOTE(review): no default case — an unrecognized geom value leaves
 +         * v and fm zero, silently skipping the restraint; confirm geom is
 +         * validated upstream.
 +         */
 +        }
 +
 +        vtot += v;
 +
 +        for (m=0; (m<DIM); m++)
 +        {
 +            f[ai][m]   += fm[m];
 +            /* Here we correct for the pbc_dx which included rdist */
 +            vir_diag[m] -= 0.5*(dx[m] + rdist[m])*fm[m];
 +        }
 +    }
 +
 +    return vtot;
 +}
 +
 +
 +/* Harmonic position restraints with free-energy interpolation: the
 + * reference positions and per-dimension force constants are linearly
 + * interpolated between the A and B states by lambda. Accumulates
 + * dV/dlambda and the diagonal virial. When f or vir_diag is NULL
 + * (they should be NULL together) only energy and dvdlambda are computed.
 + */
 +real posres(int nbonds,
 +            const t_iatom forceatoms[],const t_iparams forceparams[],
 +            const rvec x[],rvec f[],rvec vir_diag,
 +            t_pbc *pbc,
 +            real lambda,real *dvdlambda,
 +            int refcoord_scaling,int ePBC,rvec comA,rvec comB)
 +{
 +    int  i,ai,m,d,type,ki,npbcdim=0;
 +    const t_iparams *pr;
 +    real L1;
 +    real vtot,kk,fm;
 +    real posA,posB,ref=0;
 +    rvec comA_sc,comB_sc,rdist,dpdl,pos,dx;
 +    gmx_bool bForceValid = TRUE;
 +
 +    if ((f==NULL) || (vir_diag==NULL)) {  /* should both be null together! */
 +        bForceValid = FALSE;
 +    }
 +
 +    npbcdim = ePBC2npbcdim(ePBC);
 +
 +    if (refcoord_scaling == erscCOM)
 +    {
 +        /* Pre-scale both A- and B-state centers of mass to box coordinates */
 +        clear_rvec(comA_sc);
 +        clear_rvec(comB_sc);
 +        for(m=0; m<npbcdim; m++)
 +        {
 +            for(d=m; d<npbcdim; d++)
 +            {
 +                comA_sc[m] += comA[d]*pbc->box[d][m];
 +                comB_sc[m] += comB[d]*pbc->box[d][m];
 +            }
 +        }
 +    }
 +
 +    L1 = 1.0 - lambda;
 +
 +    vtot = 0.0;
 +    for(i=0; (i<nbonds); )
 +    {
 +        type = forceatoms[i++];
 +        ai   = forceatoms[i++];
 +        pr   = &forceparams[type];
 +        
 +        /* return dx, rdist, and dpdl */
 +        posres_dx(x[ai],forceparams[type].posres.pos0A, forceparams[type].posres.pos0B,
 +                  comA_sc, comB_sc, lambda,
 +                  pbc, refcoord_scaling, npbcdim,
 +                  dx, rdist, dpdl);
 +
 +        for (m=0; (m<DIM); m++)
 +        {
 +            /* Interpolated force constant; dvdlambda gets both the explicit
 +             * k dependence and the reference-position dependence (dpdl).
 +             */
 +            kk          = L1*pr->posres.fcA[m] + lambda*pr->posres.fcB[m];
 +            fm          = -kk*dx[m];
 +            vtot       += 0.5*kk*dx[m]*dx[m];
 +            *dvdlambda +=
 +                0.5*(pr->posres.fcB[m] - pr->posres.fcA[m])*dx[m]*dx[m]
 +                -fm*dpdl[m];
 +
 +            /* Here we correct for the pbc_dx which included rdist */
 +            if (bForceValid) {
 +                f[ai][m]   += fm;
 +                vir_diag[m] -= 0.5*(dx[m] + rdist[m])*fm;
 +            }
 +        }
 +    }
 +
 +    return vtot;
 +}
 +
 +/* Shared implementation for angle restraints: restrains the angle between
 + * the i-j vector and either the k-l vector (bZAxis==FALSE, 4 atoms per
 + * entry) or the z axis (bZAxis==TRUE, 2 atoms per entry). Energy and
 + * dV/dphi come from dopdihs_min with lambda-interpolated parameters;
 + * returns the total restraint energy and accumulates *dvdlambda.
 + */
 +static real low_angres(int nbonds,
 +                     const t_iatom forceatoms[],const t_iparams forceparams[],
 +                     const rvec x[],rvec f[],rvec fshift[],
 +                     const t_pbc *pbc,const t_graph *g,
 +                     real lambda,real *dvdlambda,
 +                     gmx_bool bZAxis)
 +{
 +  int  i,m,type,ai,aj,ak,al;
 +  int  t1,t2;
 +  real phi,cos_phi,cos_phi2,vid,vtot,dVdphi;
 +  rvec r_ij,r_kl,f_i,f_k={0,0,0};
 +  real st,sth,nrij2,nrkl2,c,cij,ckl;
 +
 +  ivec dt;  
 +  t2 = 0; /* avoid warning with gcc-3.3. It is never used uninitialized */
 +
 +  vtot = 0.0;
 +  ak=al=0; /* to avoid warnings */
 +  for(i=0; i<nbonds; ) {
 +    type = forceatoms[i++];
 +    ai   = forceatoms[i++];
 +    aj   = forceatoms[i++];
 +    t1   = pbc_rvec_sub(pbc,x[aj],x[ai],r_ij);                /*  3           */
 +    if (!bZAxis) {      
 +      ak   = forceatoms[i++];
 +      al   = forceatoms[i++];
 +      t2   = pbc_rvec_sub(pbc,x[al],x[ak],r_kl);           /*  3              */
 +    } else {
 +      /* Second vector is the fixed z axis */
 +      r_kl[XX] = 0;
 +      r_kl[YY] = 0;
 +      r_kl[ZZ] = 1;
 +    }
 +
 +    cos_phi = cos_angle(r_ij,r_kl);           /* 25           */
 +    phi     = acos(cos_phi);                    /* 10           */
 +
 +    *dvdlambda += dopdihs_min(forceparams[type].pdihs.cpA,
 +                              forceparams[type].pdihs.cpB,
 +                              forceparams[type].pdihs.phiA,
 +                              forceparams[type].pdihs.phiB,
 +                              forceparams[type].pdihs.mult,
 +                              phi,lambda,&vid,&dVdphi); /*  40  */
 +    
 +    vtot += vid;
 +
 +    /* Skip the force when the vectors are exactly (anti)parallel,
 +     * where 1/sqrt(1 - cos^2) diverges.
 +     */
 +    cos_phi2 = sqr(cos_phi);                    /*   1                */
 +    if (cos_phi2 < 1) {
 +      st  = -dVdphi*gmx_invsqrt(1 - cos_phi2);      /*  12            */
 +      sth = st*cos_phi;                               /*   1          */
 +      nrij2 = iprod(r_ij,r_ij);                       /*   5          */
 +      nrkl2 = iprod(r_kl,r_kl);                 /*   5          */
 +      
 +      c   = st*gmx_invsqrt(nrij2*nrkl2);              /*  11          */ 
 +      cij = sth/nrij2;                                /*  10          */
 +      ckl = sth/nrkl2;                                /*  10          */
 +      
 +      for (m=0; m<DIM; m++) {                 /*  18+18       */
 +      f_i[m] = (c*r_kl[m]-cij*r_ij[m]);
 +      f[ai][m] += f_i[m];
 +      f[aj][m] -= f_i[m];
 +      if (!bZAxis) {
 +        f_k[m] = (c*r_ij[m]-ckl*r_kl[m]);
 +        f[ak][m] += f_k[m];
 +        f[al][m] -= f_k[m];
 +      }
 +      }
 +      
 +      if (g) {
 +      ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
 +      t1=IVEC2IS(dt);
 +      }
 +      rvec_inc(fshift[t1],f_i);
 +      rvec_dec(fshift[CENTRAL],f_i);
 +      if (!bZAxis) {
 +      if (g) {
 +        ivec_sub(SHIFT_IVEC(g,ak),SHIFT_IVEC(g,al),dt);
 +        t2=IVEC2IS(dt);
 +      }
 +      rvec_inc(fshift[t2],f_k);
 +      rvec_dec(fshift[CENTRAL],f_k);
 +      }
 +    }
 +  }
 +
 +  return vtot;  /*  184 / 157 (bZAxis)  total  */
 +}
 +
 +/* Angle restraint between the vectors i-j and k-l; thin wrapper that
 + * forwards to low_angres with bZAxis==FALSE.
 + */
 +real angres(int nbonds,
 +          const t_iatom forceatoms[],const t_iparams forceparams[],
 +          const rvec x[],rvec f[],rvec fshift[],
 +          const t_pbc *pbc,const t_graph *g,
 +          real lambda,real *dvdlambda,
 +          const t_mdatoms *md,t_fcdata *fcd,
 +          int *global_atom_index)
 +{
 +  return low_angres(nbonds,forceatoms,forceparams,x,f,fshift,pbc,g,
 +                  lambda,dvdlambda,FALSE);
 +}
 +
 +/* Angle restraint between the vector i-j and the z axis; thin wrapper
 + * that forwards to low_angres with bZAxis==TRUE.
 + */
 +real angresz(int nbonds,
 +           const t_iatom forceatoms[],const t_iparams forceparams[],
 +           const rvec x[],rvec f[],rvec fshift[],
 +           const t_pbc *pbc,const t_graph *g,
 +           real lambda,real *dvdlambda,
 +           const t_mdatoms *md,t_fcdata *fcd,
 +           int *global_atom_index)
 +{
 +  return low_angres(nbonds,forceatoms,forceparams,x,f,fshift,pbc,g,
 +                    lambda,dvdlambda,TRUE);
 +}
 +
 +/* Flat-bottomed harmonic dihedral restraint: the potential is zero while
 + * the periodic deviation from phi0 stays within +/- dphi, and harmonic
 + * with force constant kfac outside that window. phi0, dphi and kfac are
 + * lambda-interpolated between the A and B states; accumulates *dvdlambda.
 + */
 +real dihres(int nbonds,
 +            const t_iatom forceatoms[],const t_iparams forceparams[],
 +            const rvec x[],rvec f[],rvec fshift[],
 +            const t_pbc *pbc,const t_graph *g,
 +            real lambda,real *dvdlambda,
 +            const t_mdatoms *md,t_fcdata *fcd,
 +            int *global_atom_index)
 +{
 +    real vtot = 0;
 +    int  ai,aj,ak,al,i,k,type,t1,t2,t3;
 +    real phi0A,phi0B,dphiA,dphiB,kfacA,kfacB,phi0,dphi,kfac;
 +    real phi,ddphi,ddp,ddp2,dp,sign,d2r,fc,L1;
 +    rvec r_ij,r_kj,r_kl,m,n;
 +
 +    L1 = 1.0-lambda;
 +
 +    d2r = DEG2RAD;
 +    k   = 0;
 +
 +    for (i=0; (i<nbonds); )
 +    {
 +        type = forceatoms[i++];
 +        ai   = forceatoms[i++];
 +        aj   = forceatoms[i++];
 +        ak   = forceatoms[i++];
 +        al   = forceatoms[i++];
 +
 +        /* Parameters are stored in degrees; convert to radians here */
 +        phi0A  = forceparams[type].dihres.phiA*d2r;
 +        dphiA  = forceparams[type].dihres.dphiA*d2r;
 +        kfacA  = forceparams[type].dihres.kfacA;
 +
 +        phi0B  = forceparams[type].dihres.phiB*d2r;
 +        dphiB  = forceparams[type].dihres.dphiB*d2r;
 +        kfacB  = forceparams[type].dihres.kfacB;
 +
 +        /* Interpolate the restraint parameters between states A and B */
 +        phi0  = L1*phi0A + lambda*phi0B;
 +        dphi  = L1*dphiA + lambda*dphiB;
 +        kfac = L1*kfacA + lambda*kfacB;
 +
 +        phi = dih_angle(x[ai],x[aj],x[ak],x[al],pbc,r_ij,r_kj,r_kl,m,n,
 +                        &sign,&t1,&t2,&t3);
 +        /* 84 flops */
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"dihres[%d]: %d %d %d %d : phi=%f, dphi=%f, kfac=%f\n",
 +                    k++,ai,aj,ak,al,phi0,dphi,kfac);
 +        }
 +        /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
 +         * force changes if we just apply a normal harmonic.
 +         * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
 +         * This means we will never have the periodicity problem, unless
 +         * the dihedral is Pi away from phiO, which is very unlikely due to
 +         * the potential.
 +         */
 +        dp = phi-phi0;
 +        make_dp_periodic(&dp);
 +
 +        /* ddp is the violation outside the flat region of half-width dphi */
 +        if (dp > dphi)
 +        {
 +            ddp = dp-dphi;
 +        }
 +        else if (dp < -dphi)
 +        {
 +            ddp = dp+dphi;
 +        }
 +        else
 +        {
 +            ddp = 0;
 +        }
 +
 +        if (ddp != 0.0)
 +        {
 +            ddp2 = ddp*ddp;
 +            vtot += 0.5*kfac*ddp2;
 +            ddphi = kfac*ddp;
 +
 +            *dvdlambda += 0.5*(kfacB - kfacA)*ddp2;
 +            /* lambda dependence from changing restraint distances */
 +            if (ddp > 0)
 +            {
 +                *dvdlambda -= kfac*ddp*((dphiB - dphiA)+(phi0B - phi0A));
 +            }
 +            else if (ddp < 0)
 +            {
 +                *dvdlambda += kfac*ddp*((dphiB - dphiA)-(phi0B - phi0A));
 +            }
 +            do_dih_fup(ai,aj,ak,al,ddphi,r_ij,r_kj,r_kl,m,n,
 +                       f,fshift,pbc,g,x,t1,t2,t3);            /* 112          */
 +        }
 +    }
 +    return vtot;
 +}
 +
 +
 +/* Table-slot stub for bonded interaction types without an implementation.
 + * Calls gmx_impl() to report the error (presumably fatal — see gmx_impl);
 + * the returned 0.0 is never a meaningful energy.
 + */
 +real unimplemented(int nbonds,
 +                 const t_iatom forceatoms[],const t_iparams forceparams[],
 +                 const rvec x[],rvec f[],rvec fshift[],
 +                 const t_pbc *pbc,const t_graph *g,
 +                 real lambda,real *dvdlambda,
 +                 const t_mdatoms *md,t_fcdata *fcd,
 +                 int *global_atom_index)
 +{
 +  gmx_impl("*** you are using a not implemented function");
 +
 +  return 0.0; /* To make the compiler happy */
 +}
 +
 +/* Ryckaert-Bellemans dihedrals: V = sum_{j=0..5} C_j cos(psi)^j with
 + * psi = phi - pi (polymer convention). The six coefficients are
 + * lambda-interpolated between the A and B states; the energy, dV/dphi
 + * (for the forces) and dV/dlambda are all accumulated in one pass.
 + */
 +real rbdihs(int nbonds,
 +          const t_iatom forceatoms[],const t_iparams forceparams[],
 +          const rvec x[],rvec f[],rvec fshift[],
 +          const t_pbc *pbc,const t_graph *g,
 +          real lambda,real *dvdlambda,
 +          const t_mdatoms *md,t_fcdata *fcd,
 +          int *global_atom_index)
 +{
 +  const real c0=0.0,c1=1.0,c2=2.0,c3=3.0,c4=4.0,c5=5.0;
 +  int  type,ai,aj,ak,al,i,j;
 +  int  t1,t2,t3;
 +  rvec r_ij,r_kj,r_kl,m,n;
 +  real parmA[NR_RBDIHS];
 +  real parmB[NR_RBDIHS];
 +  real parm[NR_RBDIHS];
 +  real cos_phi,phi,rbp,rbpBA;
 +  real v,sign,ddphi,sin_phi;
 +  real cosfac,vtot;
 +  real L1   = 1.0-lambda;
 +  real dvdl_term=0;
 +
 +  vtot = 0.0;
 +  for(i=0; (i<nbonds); ) {
 +    type = forceatoms[i++];
 +    ai   = forceatoms[i++];
 +    aj   = forceatoms[i++];
 +    ak   = forceatoms[i++];
 +    al   = forceatoms[i++];
 +
 +      phi=dih_angle(x[ai],x[aj],x[ak],x[al],pbc,r_ij,r_kj,r_kl,m,n,
 +                    &sign,&t1,&t2,&t3);                       /*  84          */
 +
 +    /* Change to polymer convention */
 +    if (phi < c0)
 +      phi += M_PI;
 +    else
 +      phi -= M_PI;                    /*   1          */
 +      
 +    cos_phi = cos(phi);               
 +    /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
 +    sin_phi = sin(phi);
 +
 +    /* Interpolate the six RB coefficients between states A and B */
 +    for(j=0; (j<NR_RBDIHS); j++) {
 +      parmA[j] = forceparams[type].rbdihs.rbcA[j];
 +      parmB[j] = forceparams[type].rbdihs.rbcB[j];
 +      parm[j]  = L1*parmA[j]+lambda*parmB[j];
 +    }
 +    /* Calculate cosine powers */
 +    /* Calculate the energy */
 +    /* Calculate the derivative */
 +
 +    /* Manually unrolled accumulation over the six coefficients:
 +     * cosfac tracks successive powers of cos(phi); v gets the energy,
 +     * ddphi gets dV/d(cos phi) terms, dvdl_term the B-A differences.
 +     */
 +    v       = parm[0];
 +    dvdl_term   += (parmB[0]-parmA[0]);
 +    ddphi   = c0;
 +    cosfac  = c1;
 +    
 +    rbp     = parm[1];
 +    rbpBA   = parmB[1]-parmA[1];
 +    ddphi  += rbp*cosfac;
 +    cosfac *= cos_phi;
 +    v      += cosfac*rbp;
 +    dvdl_term   += cosfac*rbpBA;
 +    rbp     = parm[2];
 +    rbpBA   = parmB[2]-parmA[2];    
 +    ddphi  += c2*rbp*cosfac;
 +    cosfac *= cos_phi;
 +    v      += cosfac*rbp;
 +    dvdl_term   += cosfac*rbpBA;
 +    rbp     = parm[3];
 +    rbpBA   = parmB[3]-parmA[3];
 +    ddphi  += c3*rbp*cosfac;
 +    cosfac *= cos_phi;
 +    v      += cosfac*rbp;
 +    dvdl_term   += cosfac*rbpBA;
 +    rbp     = parm[4];
 +    rbpBA   = parmB[4]-parmA[4];
 +    ddphi  += c4*rbp*cosfac;
 +    cosfac *= cos_phi;
 +    v      += cosfac*rbp;
 +    dvdl_term   += cosfac*rbpBA;
 +    rbp     = parm[5];
 +    rbpBA   = parmB[5]-parmA[5];
 +    ddphi  += c5*rbp*cosfac;
 +    cosfac *= cos_phi;
 +    v      += cosfac*rbp;
 +    dvdl_term   += cosfac*rbpBA;
 +   
 +    /* Chain rule: dV/dphi = -sin(phi) * dV/d(cos phi) */
 +    ddphi = -ddphi*sin_phi;                           /*  11          */
 +    
 +    do_dih_fup(ai,aj,ak,al,ddphi,r_ij,r_kj,r_kl,m,n,
 +             f,fshift,pbc,g,x,t1,t2,t3);              /* 112          */
 +    vtot += v;
 +  }  
 +  *dvdlambda += dvdl_term;
 +
 +  return vtot;
 +}
 +
 +/* Maps a (possibly out-of-range) CMAP grid index ip onto the periodic
 + * grid of size grid_spacing and returns the wrapped index, also
 + * producing the periodically wrapped neighbour indices ip-1, ip+1 and
 + * ip+2 through the output pointers (used for bicubic interpolation).
 + * NOTE(review): the out-of-range adjustments (ip + grid_spacing - 1 and
 + * ip - grid_spacing - 1, triggered at ip > grid_spacing rather than >=)
 + * are not plain modulo wraps — confirm they match how callers compute ip.
 + */
 +int cmap_setup_grid_index(int ip, int grid_spacing, int *ipm1, int *ipp1, int *ipp2)
 +{
 +      int im1, ip1, ip2;
 +      
 +      if(ip<0)
 +      {
 +              ip = ip + grid_spacing - 1;
 +      }
 +      else if(ip > grid_spacing)
 +      {
 +              ip = ip - grid_spacing - 1;
 +      }
 +      
 +      im1 = ip - 1;
 +      ip1 = ip + 1;
 +      ip2 = ip + 2;
 +      
 +      /* Wrap the neighbour indices across the periodic boundary */
 +      if(ip == 0)
 +      {
 +              im1 = grid_spacing - 1;
 +      }
 +      else if(ip == grid_spacing-2)
 +      {
 +              ip2 = 0;
 +      }
 +      else if(ip == grid_spacing-1)
 +      {
 +              ip1 = 0;
 +              ip2 = 1;
 +      }
 +      
 +      *ipm1 = im1;
 +      *ipp1 = ip1;
 +      *ipp2 = ip2;
 +      
 +      return ip;
 +      
 +}
 +
 +real cmap_dihs(int nbonds,
 +                         const t_iatom forceatoms[],const t_iparams forceparams[],
 +               const gmx_cmap_t *cmap_grid,
 +                         const rvec x[],rvec f[],rvec fshift[],
 +                         const t_pbc *pbc,const t_graph *g,
 +                         real lambda,real *dvdlambda,
 +                         const t_mdatoms *md,t_fcdata *fcd,
 +                         int *global_atom_index)
 +{
 +      int i,j,k,n,idx;
 +      int ai,aj,ak,al,am;
 +      int a1i,a1j,a1k,a1l,a2i,a2j,a2k,a2l;
 +      int type,cmapA;
 +      int t11,t21,t31,t12,t22,t32;
 +      int iphi1,ip1m1,ip1p1,ip1p2;
 +      int iphi2,ip2m1,ip2p1,ip2p2;
 +      int l1,l2,l3,l4;
 +      int pos1,pos2,pos3,pos4,tmp;
 +      
 +      real ty[4],ty1[4],ty2[4],ty12[4],tc[16],tx[16];
 +      real phi1,psi1,cos_phi1,sin_phi1,sign1,xphi1;
 +      real phi2,psi2,cos_phi2,sin_phi2,sign2,xphi2;
 +      real dx,xx,tt,tu,e,df1,df2,ddf1,ddf2,ddf12,vtot;
 +      real ra21,rb21,rg21,rg1,rgr1,ra2r1,rb2r1,rabr1;
 +      real ra22,rb22,rg22,rg2,rgr2,ra2r2,rb2r2,rabr2;
 +      real fg1,hg1,fga1,hgb1,gaa1,gbb1;
 +      real fg2,hg2,fga2,hgb2,gaa2,gbb2;
 +      real fac;
 +      
 +      rvec r1_ij, r1_kj, r1_kl,m1,n1;
 +      rvec r2_ij, r2_kj, r2_kl,m2,n2;
 +      rvec f1_i,f1_j,f1_k,f1_l;
 +      rvec f2_i,f2_j,f2_k,f2_l;
 +      rvec a1,b1,a2,b2;
 +      rvec f1,g1,h1,f2,g2,h2;
 +      rvec dtf1,dtg1,dth1,dtf2,dtg2,dth2;
 +      ivec jt1,dt1_ij,dt1_kj,dt1_lj;
 +      ivec jt2,dt2_ij,dt2_kj,dt2_lj;
 +
 +    const real *cmapd;
 +
 +      int loop_index[4][4] = {
 +              {0,4,8,12},
 +              {1,5,9,13},
 +              {2,6,10,14},
 +              {3,7,11,15}
 +      };
 +      
 +      /* Total CMAP energy */
 +      vtot = 0;
 +      
 +      for(n=0;n<nbonds; )
 +      {
 +              /* Five atoms are involved in the two torsions */
 +              type   = forceatoms[n++];
 +              ai     = forceatoms[n++];
 +              aj     = forceatoms[n++];
 +              ak     = forceatoms[n++];
 +              al     = forceatoms[n++];
 +              am     = forceatoms[n++];
 +              
 +              /* Which CMAP type is this */
 +              cmapA = forceparams[type].cmap.cmapA;
 +        cmapd = cmap_grid->cmapdata[cmapA].cmap;
 +
 +              /* First torsion */
 +              a1i   = ai;
 +              a1j   = aj;
 +              a1k   = ak;
 +              a1l   = al;
 +              
 +              phi1  = dih_angle(x[a1i], x[a1j], x[a1k], x[a1l], pbc, r1_ij, r1_kj, r1_kl, m1, n1,
 +                                                 &sign1, &t11, &t21, &t31); /* 84 */
 +              
 +        cos_phi1 = cos(phi1);
 +        
 +              a1[0] = r1_ij[1]*r1_kj[2]-r1_ij[2]*r1_kj[1];
 +              a1[1] = r1_ij[2]*r1_kj[0]-r1_ij[0]*r1_kj[2];
 +              a1[2] = r1_ij[0]*r1_kj[1]-r1_ij[1]*r1_kj[0]; /* 9 */
 +              
 +              b1[0] = r1_kl[1]*r1_kj[2]-r1_kl[2]*r1_kj[1];
 +              b1[1] = r1_kl[2]*r1_kj[0]-r1_kl[0]*r1_kj[2];
 +              b1[2] = r1_kl[0]*r1_kj[1]-r1_kl[1]*r1_kj[0]; /* 9 */
 +              
 +              tmp = pbc_rvec_sub(pbc,x[a1l],x[a1k],h1);
 +              
 +              ra21  = iprod(a1,a1);       /* 5 */
 +              rb21  = iprod(b1,b1);       /* 5 */
 +              rg21  = iprod(r1_kj,r1_kj); /* 5 */
 +              rg1   = sqrt(rg21);
 +              
 +              rgr1  = 1.0/rg1;
 +              ra2r1 = 1.0/ra21;
 +              rb2r1 = 1.0/rb21;
 +              rabr1 = sqrt(ra2r1*rb2r1);
 +              
 +              sin_phi1 = rg1 * rabr1 * iprod(a1,h1) * (-1);
 +              
 +              if(cos_phi1 < -0.5 || cos_phi1 > 0.5)
 +              {
 +                      phi1 = asin(sin_phi1);
 +                      
 +                      if(cos_phi1 < 0)
 +                      {
 +                              if(phi1 > 0)
 +                              {
 +                                      phi1 = M_PI - phi1;
 +                              }
 +                              else
 +                              {
 +                                      phi1 = -M_PI - phi1;
 +                              }
 +                      }
 +              }
 +              else
 +              {
 +                      phi1 = acos(cos_phi1);
 +                      
 +                      if(sin_phi1 < 0)
 +                      {
 +                              phi1 = -phi1;
 +                      }
 +              }
 +              
 +              xphi1 = phi1 + M_PI; /* 1 */
 +              
 +              /* Second torsion */
 +              a2i   = aj;
 +              a2j   = ak;
 +              a2k   = al;
 +              a2l   = am;
 +              
 +              phi2  = dih_angle(x[a2i], x[a2j], x[a2k], x[a2l], pbc, r2_ij, r2_kj, r2_kl, m2, n2,
 +                                                &sign2, &t12, &t22, &t32); /* 84 */
 +              
 +        cos_phi2 = cos(phi2);
 +
 +              a2[0] = r2_ij[1]*r2_kj[2]-r2_ij[2]*r2_kj[1];
 +              a2[1] = r2_ij[2]*r2_kj[0]-r2_ij[0]*r2_kj[2];
 +              a2[2] = r2_ij[0]*r2_kj[1]-r2_ij[1]*r2_kj[0]; /* 9 */
 +              
 +              b2[0] = r2_kl[1]*r2_kj[2]-r2_kl[2]*r2_kj[1];
 +              b2[1] = r2_kl[2]*r2_kj[0]-r2_kl[0]*r2_kj[2];
 +              b2[2] = r2_kl[0]*r2_kj[1]-r2_kl[1]*r2_kj[0]; /* 9 */
 +              
 +              tmp = pbc_rvec_sub(pbc,x[a2l],x[a2k],h2);
 +              
 +              ra22  = iprod(a2,a2);         /* 5 */
 +              rb22  = iprod(b2,b2);         /* 5 */
 +              rg22  = iprod(r2_kj,r2_kj);   /* 5 */
 +              rg2   = sqrt(rg22);
 +              
 +              rgr2  = 1.0/rg2;
 +              ra2r2 = 1.0/ra22;
 +              rb2r2 = 1.0/rb22;
 +              rabr2 = sqrt(ra2r2*rb2r2);
 +              
 +              sin_phi2 = rg2 * rabr2 * iprod(a2,h2) * (-1);
 +              
 +              if(cos_phi2 < -0.5 || cos_phi2 > 0.5)
 +              {
 +                      phi2 = asin(sin_phi2);
 +                      
 +                      if(cos_phi2 < 0)
 +                      {
 +                              if(phi2 > 0)
 +                              {
 +                                      phi2 = M_PI - phi2;
 +                              }
 +                              else
 +                              {
 +                                      phi2 = -M_PI - phi2;
 +                              }
 +                      }
 +              }
 +              else
 +              {
 +                      phi2 = acos(cos_phi2);
 +                      
 +                      if(sin_phi2 < 0)
 +                      {
 +                              phi2 = -phi2;
 +                      }
 +              }
 +              
 +              xphi2 = phi2 + M_PI; /* 1 */
 +              
 +              /* Range mangling */
 +              if(xphi1<0)
 +              {
 +                      xphi1 = xphi1 + 2*M_PI;
 +              }
 +              else if(xphi1>=2*M_PI)
 +              {
 +                      xphi1 = xphi1 - 2*M_PI;
 +              }
 +              
 +              if(xphi2<0)
 +              {
 +                      xphi2 = xphi2 + 2*M_PI;
 +              }
 +              else if(xphi2>=2*M_PI)
 +              {
 +                      xphi2 = xphi2 - 2*M_PI;
 +              }
 +              
 +              /* Number of grid points */
 +              dx = 2*M_PI / cmap_grid->grid_spacing;
 +              
 +              /* Where on the grid are we */
 +              iphi1 = (int)(xphi1/dx);
 +              iphi2 = (int)(xphi2/dx);
 +              
 +              iphi1 = cmap_setup_grid_index(iphi1, cmap_grid->grid_spacing, &ip1m1,&ip1p1,&ip1p2);
 +              iphi2 = cmap_setup_grid_index(iphi2, cmap_grid->grid_spacing, &ip2m1,&ip2p1,&ip2p2);
 +              
 +              pos1    = iphi1*cmap_grid->grid_spacing+iphi2;
 +              pos2    = ip1p1*cmap_grid->grid_spacing+iphi2;
 +              pos3    = ip1p1*cmap_grid->grid_spacing+ip2p1;
 +              pos4    = iphi1*cmap_grid->grid_spacing+ip2p1;
 +
 +              ty[0]   = cmapd[pos1*4];
 +              ty[1]   = cmapd[pos2*4];
 +              ty[2]   = cmapd[pos3*4];
 +              ty[3]   = cmapd[pos4*4];
 +              
 +              ty1[0]   = cmapd[pos1*4+1];
 +              ty1[1]   = cmapd[pos2*4+1];
 +              ty1[2]   = cmapd[pos3*4+1];
 +              ty1[3]   = cmapd[pos4*4+1];
 +              
 +              ty2[0]   = cmapd[pos1*4+2];
 +              ty2[1]   = cmapd[pos2*4+2];
 +              ty2[2]   = cmapd[pos3*4+2];
 +              ty2[3]   = cmapd[pos4*4+2];
 +              
 +              ty12[0]   = cmapd[pos1*4+3];
 +              ty12[1]   = cmapd[pos2*4+3];
 +              ty12[2]   = cmapd[pos3*4+3];
 +              ty12[3]   = cmapd[pos4*4+3];
 +              
 +              /* Switch to degrees */
 +              dx = 360.0 / cmap_grid->grid_spacing;
 +              xphi1 = xphi1 * RAD2DEG;
 +              xphi2 = xphi2 * RAD2DEG; 
 +              
 +              for(i=0;i<4;i++) /* 16 */
 +              {
 +                      tx[i] = ty[i];
 +                      tx[i+4] = ty1[i]*dx;
 +                      tx[i+8] = ty2[i]*dx;
 +                      tx[i+12] = ty12[i]*dx*dx;
 +              }
 +              
 +              idx=0;
 +              for(i=0;i<4;i++) /* 1056 */
 +              {
 +                      for(j=0;j<4;j++)
 +                      {
 +                              xx = 0;
 +                              for(k=0;k<16;k++)
 +                              {
 +                                      xx = xx + cmap_coeff_matrix[k*16+idx]*tx[k];
 +                              }
 +                              
 +                              idx++;
 +                              tc[i*4+j]=xx;
 +                      }
 +              }
 +              
 +              tt    = (xphi1-iphi1*dx)/dx;
 +              tu    = (xphi2-iphi2*dx)/dx;
 +              
 +              e     = 0;
 +              df1   = 0;
 +              df2   = 0;
 +              ddf1  = 0;
 +              ddf2  = 0;
 +              ddf12 = 0;
 +              
 +              for(i=3;i>=0;i--)
 +              {
 +                      l1 = loop_index[i][3];
 +                      l2 = loop_index[i][2];
 +                      l3 = loop_index[i][1];
 +                      
 +                      e     = tt * e    + ((tc[i*4+3]*tu+tc[i*4+2])*tu + tc[i*4+1])*tu+tc[i*4];
 +                      df1   = tu * df1  + (3.0*tc[l1]*tt+2.0*tc[l2])*tt+tc[l3];
 +                      df2   = tt * df2  + (3.0*tc[i*4+3]*tu+2.0*tc[i*4+2])*tu+tc[i*4+1];
 +                      ddf1  = tu * ddf1 + 2.0*3.0*tc[l1]*tt+2.0*tc[l2];
 +                      ddf2  = tt * ddf2 + 2.0*3.0*tc[4*i+3]*tu+2.0*tc[4*i+2];
 +              }
 +              
 +              ddf12 = tc[5] + 2.0*tc[9]*tt + 3.0*tc[13]*tt*tt + 2.0*tu*(tc[6]+2.0*tc[10]*tt+3.0*tc[14]*tt*tt) +
 +              3.0*tu*tu*(tc[7]+2.0*tc[11]*tt+3.0*tc[15]*tt*tt);
 +              
 +              fac     = RAD2DEG/dx;
 +              df1     = df1   * fac;
 +              df2     = df2   * fac;
 +              ddf1    = ddf1  * fac * fac;
 +              ddf2    = ddf2  * fac * fac;
 +              ddf12   = ddf12 * fac * fac;
 +              
 +              /* CMAP energy */
 +              vtot += e;
 +              
 +              /* Do forces - first torsion */
 +              fg1       = iprod(r1_ij,r1_kj);
 +              hg1       = iprod(r1_kl,r1_kj);
 +              fga1      = fg1*ra2r1*rgr1;
 +              hgb1      = hg1*rb2r1*rgr1;
 +              gaa1      = -ra2r1*rg1;
 +              gbb1      = rb2r1*rg1;
 +              
 +              for(i=0;i<DIM;i++)
 +              {
 +                      dtf1[i]   = gaa1 * a1[i];
 +                      dtg1[i]   = fga1 * a1[i] - hgb1 * b1[i];
 +                      dth1[i]   = gbb1 * b1[i];
 +                      
 +                      f1[i]     = df1  * dtf1[i];
 +                      g1[i]     = df1  * dtg1[i];
 +                      h1[i]     = df1  * dth1[i];
 +                      
 +                      f1_i[i]   =  f1[i];
 +                      f1_j[i]   = -f1[i] - g1[i];
 +                      f1_k[i]   =  h1[i] + g1[i];
 +                      f1_l[i]   = -h1[i];
 +                      
 +                      f[a1i][i] = f[a1i][i] + f1_i[i];
 +                      f[a1j][i] = f[a1j][i] + f1_j[i]; /* - f1[i] - g1[i] */                                                            
 +                      f[a1k][i] = f[a1k][i] + f1_k[i]; /* h1[i] + g1[i] */                                                            
 +                      f[a1l][i] = f[a1l][i] + f1_l[i]; /* h1[i] */                                                                       
 +              }
 +              
 +              /* Do forces - second torsion */
 +              fg2       = iprod(r2_ij,r2_kj);
 +              hg2       = iprod(r2_kl,r2_kj);
 +              fga2      = fg2*ra2r2*rgr2;
 +              hgb2      = hg2*rb2r2*rgr2;
 +              gaa2      = -ra2r2*rg2;
 +              gbb2      = rb2r2*rg2;
 +              
 +              for(i=0;i<DIM;i++)
 +              {
 +                      dtf2[i]   = gaa2 * a2[i];
 +                      dtg2[i]   = fga2 * a2[i] - hgb2 * b2[i];
 +                      dth2[i]   = gbb2 * b2[i];
 +                      
 +                      f2[i]     = df2  * dtf2[i];
 +                      g2[i]     = df2  * dtg2[i];
 +                      h2[i]     = df2  * dth2[i];
 +                      
 +                      f2_i[i]   =  f2[i];
 +                      f2_j[i]   = -f2[i] - g2[i];
 +                      f2_k[i]   =  h2[i] + g2[i];
 +                      f2_l[i]   = -h2[i];
 +                      
 +                      f[a2i][i] = f[a2i][i] + f2_i[i]; /* f2[i] */                                                                        
 +                      f[a2j][i] = f[a2j][i] + f2_j[i]; /* - f2[i] - g2[i] */                                                              
 +                      f[a2k][i] = f[a2k][i] + f2_k[i]; /* h2[i] + g2[i] */                            
 +                      f[a2l][i] = f[a2l][i] + f2_l[i]; /* - h2[i] */                                                                      
 +              }
 +              
 +              /* Shift forces */
 +              if(g)
 +              {
 +                      copy_ivec(SHIFT_IVEC(g,a1j), jt1);
 +                      ivec_sub(SHIFT_IVEC(g,a1i),  jt1,dt1_ij);
 +                      ivec_sub(SHIFT_IVEC(g,a1k),  jt1,dt1_kj);
 +                      ivec_sub(SHIFT_IVEC(g,a1l),  jt1,dt1_lj);
 +                      t11 = IVEC2IS(dt1_ij);
 +                      t21 = IVEC2IS(dt1_kj);
 +                      t31 = IVEC2IS(dt1_lj);
 +                      
 +                      copy_ivec(SHIFT_IVEC(g,a2j), jt2);
 +                      ivec_sub(SHIFT_IVEC(g,a2i),  jt2,dt2_ij);
 +                      ivec_sub(SHIFT_IVEC(g,a2k),  jt2,dt2_kj);
 +                      ivec_sub(SHIFT_IVEC(g,a2l),  jt2,dt2_lj);
 +                      t12 = IVEC2IS(dt2_ij);
 +                      t22 = IVEC2IS(dt2_kj);
 +                      t32 = IVEC2IS(dt2_lj);
 +              }
 +              else if(pbc)
 +              {
 +                      t31 = pbc_rvec_sub(pbc,x[a1l],x[a1j],h1);
 +                      t32 = pbc_rvec_sub(pbc,x[a2l],x[a2j],h2);
 +              }
 +              else
 +              {
 +                      t31 = CENTRAL;
 +                      t32 = CENTRAL;
 +              }
 +              
 +              rvec_inc(fshift[t11],f1_i);
 +              rvec_inc(fshift[CENTRAL],f1_j);
 +              rvec_inc(fshift[t21],f1_k);
 +              rvec_inc(fshift[t31],f1_l);
 +              
 +              rvec_inc(fshift[t21],f2_i);
 +              rvec_inc(fshift[CENTRAL],f2_j);
 +              rvec_inc(fshift[t22],f2_k);
 +              rvec_inc(fshift[t32],f2_l);
 +      }       
 +      return vtot;
 +}
 +
 +
 +
 +/***********************************************************
 + *
 + *   G R O M O S  9 6   F U N C T I O N S
 + *
 + ***********************************************************/
 +real g96harmonic(real kA,real kB,real xA,real xB,real x,real lambda,
 +               real *V,real *F)
 +{
 +  const real half=0.5;
 +  real  L1,kk,x0,dx,dx2;
 +  real  v,f,dvdlambda;
 +  
 +  L1    = 1.0-lambda;
 +  kk    = L1*kA+lambda*kB;
 +  x0    = L1*xA+lambda*xB;
 +  
 +  dx    = x-x0;
 +  dx2   = dx*dx;
 +  
 +  f     = -kk*dx;
 +  v     = half*kk*dx2;
 +  dvdlambda  = half*(kB-kA)*dx2 + (xA-xB)*kk*dx;
 +  
 +  *F    = f;
 +  *V    = v;
 +  
 +  return dvdlambda;
 +  
 +  /* That was 21 flops */
 +}
 +
 +/* GROMOS-96 bond potential, V = (1/4)*kb*(r^2 - b0^2)^2.
 + * Implemented by feeding r^2 (not r) to g96harmonic, which yields
 + * (1/2)*k*(r^2 - b0^2)^2; the factor 1/2 applied below gives 1/4.
 + * Accumulates dV/dlambda into *dvdlambda and returns the total energy.
 + */
 +real g96bonds(int nbonds,
 +            const t_iatom forceatoms[],const t_iparams forceparams[],
 +            const rvec x[],rvec f[],rvec fshift[],
 +            const t_pbc *pbc,const t_graph *g,
 +            real lambda,real *dvdlambda,
 +            const t_mdatoms *md,t_fcdata *fcd,
 +            int *global_atom_index)
 +{
 +  int  i,m,ki,ai,aj,type;
 +  real dr2,fbond,vbond,fij,vtot;
 +  rvec dx;
 +  ivec dt;
 +  
 +  vtot = 0.0;
 +  for(i=0; (i<nbonds); ) {
 +    /* iatom stream: parameter-type index followed by the two atoms */
 +    type = forceatoms[i++];
 +    ai   = forceatoms[i++];
 +    aj   = forceatoms[i++];
 +  
 +    ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);          /*   3          */
 +    dr2  = iprod(dx,dx);                              /*   5          */
 +      
 +    /* Note: the "distance" argument is r^2 here (g96 functional form) */
 +    *dvdlambda += g96harmonic(forceparams[type].harmonic.krA,
 +                              forceparams[type].harmonic.krB,
 +                              forceparams[type].harmonic.rA,
 +                              forceparams[type].harmonic.rB,
 +                              dr2,lambda,&vbond,&fbond);
 +
 +    vtot  += 0.5*vbond;                             /* 1*/
 +#ifdef DEBUG
 +    if (debug)
 +      fprintf(debug,"G96-BONDS: dr = %10g  vbond = %10g  fbond = %10g\n",
 +            sqrt(dr2),vbond,fbond);
 +#endif
 +   
 +    if (g) {
 +      /* With a molecule graph, recompute the shift index from the graph */
 +      ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
 +      ki=IVEC2IS(dt);
 +    }
 +    /* Spread the pair force and book the shift-force (virial) terms */
 +    for (m=0; (m<DIM); m++) {                 /*  15          */
 +      fij=fbond*dx[m];
 +      f[ai][m]+=fij;
 +      f[aj][m]-=fij;
 +      fshift[ki][m]+=fij;
 +      fshift[CENTRAL][m]-=fij;
 +    }
 +  }                                   /* 44 TOTAL     */
 +  return vtot;
 +}
 +
 +real g96bond_angle(const rvec xi,const rvec xj,const rvec xk,const t_pbc *pbc,
 +                 rvec r_ij,rvec r_kj,
 +                 int *t1,int *t2)
 +/* Return value is the angle between the bonds i-j and j-k */
 +{
 +  real costh;
 +  
 +  *t1 = pbc_rvec_sub(pbc,xi,xj,r_ij);                 /*  3           */
 +  *t2 = pbc_rvec_sub(pbc,xk,xj,r_kj);                 /*  3           */
 +
 +  costh=cos_angle(r_ij,r_kj);                 /* 25           */
 +                                      /* 41 TOTAL     */
 +  return costh;
 +}
 +
 +/* GROMOS-96 angle potential: harmonic in cos(theta) rather than theta.
 + * g96bond_angle supplies cos(theta) directly; the chain rule below
 + * converts dV/dcos(theta) into Cartesian forces on the three atoms.
 + * Accumulates dV/dlambda into *dvdlambda and returns the total energy.
 + */
 +real g96angles(int nbonds,
 +             const t_iatom forceatoms[],const t_iparams forceparams[],
 +             const rvec x[],rvec f[],rvec fshift[],
 +             const t_pbc *pbc,const t_graph *g,
 +             real lambda,real *dvdlambda,
 +             const t_mdatoms *md,t_fcdata *fcd,
 +             int *global_atom_index)
 +{
 +  int  i,ai,aj,ak,type,m,t1,t2;
 +  rvec r_ij,r_kj;
 +  real cos_theta,dVdt,va,vtot;
 +  real rij_1,rij_2,rkj_1,rkj_2,rijrkj_1;
 +  rvec f_i,f_j,f_k;
 +  ivec jt,dt_ij,dt_kj;
 +  
 +  vtot = 0.0;
 +  for(i=0; (i<nbonds); ) {
 +    type = forceatoms[i++];
 +    ai   = forceatoms[i++];
 +    aj   = forceatoms[i++];
 +    ak   = forceatoms[i++];
 +    
 +    cos_theta  = g96bond_angle(x[ai],x[aj],x[ak],pbc,r_ij,r_kj,&t1,&t2);
 +
 +    /* Harmonic in cos(theta); dVdt is dV/dcos(theta) here */
 +    *dvdlambda += g96harmonic(forceparams[type].harmonic.krA,
 +                              forceparams[type].harmonic.krB,
 +                              forceparams[type].harmonic.rA,
 +                              forceparams[type].harmonic.rB,
 +                              cos_theta,lambda,&va,&dVdt);
 +    vtot    += va;
 +    
 +    /* Inverse bond lengths and products used by the cos gradient */
 +    rij_1    = gmx_invsqrt(iprod(r_ij,r_ij));
 +    rkj_1    = gmx_invsqrt(iprod(r_kj,r_kj));
 +    rij_2    = rij_1*rij_1;
 +    rkj_2    = rkj_1*rkj_1;
 +    rijrkj_1 = rij_1*rkj_1;                     /* 23 */
 +    
 +#ifdef DEBUG
 +    if (debug)
 +      fprintf(debug,"G96ANGLES: costheta = %10g  vth = %10g  dV/dct = %10g\n",
 +            cos_theta,va,dVdt);
 +#endif
 +    /* Gradient of cos(theta) w.r.t. atom positions, times dV/dcos */
 +    for (m=0; (m<DIM); m++) {                 /*  42  */
 +      f_i[m]=dVdt*(r_kj[m]*rijrkj_1 - r_ij[m]*rij_2*cos_theta);
 +      f_k[m]=dVdt*(r_ij[m]*rijrkj_1 - r_kj[m]*rkj_2*cos_theta);
 +      f_j[m]=-f_i[m]-f_k[m];
 +      f[ai][m]+=f_i[m];
 +      f[aj][m]+=f_j[m];
 +      f[ak][m]+=f_k[m];
 +    }
 +    
 +    if (g) {
 +      /* Recompute shift indices from the molecule graph */
 +      copy_ivec(SHIFT_IVEC(g,aj),jt);
 +      
 +      ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
 +      ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
 +      t1=IVEC2IS(dt_ij);
 +      t2=IVEC2IS(dt_kj);
 +    }      
 +    rvec_inc(fshift[t1],f_i);
 +    rvec_inc(fshift[CENTRAL],f_j);
 +    rvec_inc(fshift[t2],f_k);               /* 9 */
 +    /* 163 TOTAL      */
 +  }
 +  return vtot;
 +}
 +
 +/* Bond-bond cross term: V = krr*(r1 - r1e)*(r2 - r2e) over the two bonds
 + * i-j and k-j sharing the central atom j.
 + * Note: lambda/dvdlambda are accepted for interface uniformity but this
 + * term contributes no dV/dlambda here.
 + */
 +real cross_bond_bond(int nbonds,
 +                   const t_iatom forceatoms[],const t_iparams forceparams[],
 +                   const rvec x[],rvec f[],rvec fshift[],
 +                   const t_pbc *pbc,const t_graph *g,
 +                   real lambda,real *dvdlambda,
 +                   const t_mdatoms *md,t_fcdata *fcd,
 +                   int *global_atom_index)
 +{
 +  /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
 +   * pp. 842-847
 +   */
 +  int  i,ai,aj,ak,type,m,t1,t2;
 +  rvec r_ij,r_kj;
 +  real vtot,vrr,s1,s2,r1,r2,r1e,r2e,krr;
 +  rvec f_i,f_j,f_k;
 +  ivec jt,dt_ij,dt_kj;
 +  
 +  vtot = 0.0;
 +  for(i=0; (i<nbonds); ) {
 +    type = forceatoms[i++];
 +    ai   = forceatoms[i++];
 +    aj   = forceatoms[i++];
 +    ak   = forceatoms[i++];
 +    r1e  = forceparams[type].cross_bb.r1e;
 +    r2e  = forceparams[type].cross_bb.r2e;
 +    krr  = forceparams[type].cross_bb.krr;
 +    
 +    /* Compute distance vectors ... */
 +    t1 = pbc_rvec_sub(pbc,x[ai],x[aj],r_ij);
 +    t2 = pbc_rvec_sub(pbc,x[ak],x[aj],r_kj);
 +    
 +    /* ... and their lengths */
 +    r1 = norm(r_ij);
 +    r2 = norm(r_kj);
 +    
 +    /* Deviations from ideality */
 +    s1 = r1-r1e;
 +    s2 = r2-r2e;
 +    
 +    /* Energy (can be negative!) */
 +    vrr   = krr*s1*s2;
 +    vtot += vrr;
 +    
 +    /* Forces: each bond's force scales with the OTHER bond's deviation */
 +    svmul(-krr*s2/r1,r_ij,f_i);
 +    svmul(-krr*s1/r2,r_kj,f_k);
 +    
 +    for (m=0; (m<DIM); m++) {                 /*  12  */
 +      f_j[m]    = -f_i[m] - f_k[m];
 +      f[ai][m] += f_i[m];
 +      f[aj][m] += f_j[m];
 +      f[ak][m] += f_k[m];
 +    }
 +    
 +    /* Virial stuff */
 +    if (g) {
 +      copy_ivec(SHIFT_IVEC(g,aj),jt);
 +      
 +      ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
 +      ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
 +      t1=IVEC2IS(dt_ij);
 +      t2=IVEC2IS(dt_kj);
 +    }      
 +    rvec_inc(fshift[t1],f_i);
 +    rvec_inc(fshift[CENTRAL],f_j);
 +    rvec_inc(fshift[t2],f_k);               /* 9 */
 +    /* 163 TOTAL      */
 +  }
 +  return vtot;
 +}
 +
 +/* Bond-angle cross term: V = krt*(r3 - r3e)*((r1 - r1e) + (r2 - r2e)),
 + * where r1,r2 are the bonds i-j and k-j and r3 is the i-k distance.
 + * Note: lambda/dvdlambda are accepted for interface uniformity but this
 + * term contributes no dV/dlambda here.
 + */
 +real cross_bond_angle(int nbonds,
 +                    const t_iatom forceatoms[],const t_iparams forceparams[],
 +                    const rvec x[],rvec f[],rvec fshift[],
 +                    const t_pbc *pbc,const t_graph *g,
 +                    real lambda,real *dvdlambda,
 +                    const t_mdatoms *md,t_fcdata *fcd,
 +                    int *global_atom_index)
 +{
 +  /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
 +   * pp. 842-847
 +   */
 +  int  i,ai,aj,ak,type,m,t1,t2,t3;
 +  rvec r_ij,r_kj,r_ik;
 +  real vtot,vrt,s1,s2,s3,r1,r2,r3,r1e,r2e,r3e,krt,k1,k2,k3;
 +  rvec f_i,f_j,f_k;
 +  ivec jt,dt_ij,dt_kj;
 +  
 +  vtot = 0.0;
 +  for(i=0; (i<nbonds); ) {
 +    type = forceatoms[i++];
 +    ai   = forceatoms[i++];
 +    aj   = forceatoms[i++];
 +    ak   = forceatoms[i++];
 +    r1e  = forceparams[type].cross_ba.r1e;
 +    r2e  = forceparams[type].cross_ba.r2e;
 +    r3e  = forceparams[type].cross_ba.r3e;
 +    krt  = forceparams[type].cross_ba.krt;
 +    
 +    /* Compute distance vectors ... */
 +    t1 = pbc_rvec_sub(pbc,x[ai],x[aj],r_ij);
 +    t2 = pbc_rvec_sub(pbc,x[ak],x[aj],r_kj);
 +    t3 = pbc_rvec_sub(pbc,x[ai],x[ak],r_ik);
 +    
 +    /* ... and their lengths */
 +    r1 = norm(r_ij);
 +    r2 = norm(r_kj);
 +    r3 = norm(r_ik);
 +    
 +    /* Deviations from ideality */
 +    s1 = r1-r1e;
 +    s2 = r2-r2e;
 +    s3 = r3-r3e;
 +    
 +    /* Energy (can be negative!) */
 +    vrt   = krt*s3*(s1+s2);
 +    vtot += vrt;
 +    
 +    /* Forces: per-distance prefactors from the product rule */
 +    k1 = -krt*(s3/r1);
 +    k2 = -krt*(s3/r2);
 +    k3 = -krt*(s1+s2)/r3;
 +    for(m=0; (m<DIM); m++) {
 +      f_i[m] = k1*r_ij[m] + k3*r_ik[m];
 +      f_k[m] = k2*r_kj[m] - k3*r_ik[m];
 +      f_j[m] = -f_i[m] - f_k[m];
 +    }
 +    
 +    for (m=0; (m<DIM); m++) {                 /*  12  */
 +      f[ai][m] += f_i[m];
 +      f[aj][m] += f_j[m];
 +      f[ak][m] += f_k[m];
 +    }
 +    
 +    /* Virial stuff */
 +    if (g) {
 +      copy_ivec(SHIFT_IVEC(g,aj),jt);
 +      
 +      ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
 +      ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
 +      t1=IVEC2IS(dt_ij);
 +      t2=IVEC2IS(dt_kj);
 +    }      
 +    rvec_inc(fshift[t1],f_i);
 +    rvec_inc(fshift[CENTRAL],f_j);
 +    rvec_inc(fshift[t2],f_k);               /* 9 */
 +    /* 163 TOTAL      */
 +  }
 +  return vtot;
 +}
 +
 +static real bonded_tab(const char *type,int table_nr,
 +                     const bondedtable_t *table,real kA,real kB,real r,
 +                     real lambda,real *V,real *F)
 +{
 +  real k,tabscale,*VFtab,rt,eps,eps2,Yt,Ft,Geps,Heps2,Fp,VV,FF;
 +  int  n0,nnn;
 +  real v,f,dvdlambda;
 +
 +  k = (1.0 - lambda)*kA + lambda*kB;
 +
 +  tabscale = table->scale;
 +  VFtab    = table->tab;
 +  
 +  rt    = r*tabscale;
 +  n0    = rt;
 +  if (n0 >= table->n) {
 +    gmx_fatal(FARGS,"A tabulated %s interaction table number %d is out of the table range: r %f, between table indices %d and %d, table length %d",
 +            type,table_nr,r,n0,n0+1,table->n);
 +  }
 +  eps   = rt - n0;
 +  eps2  = eps*eps;
 +  nnn   = 4*n0;
 +  Yt    = VFtab[nnn];
 +  Ft    = VFtab[nnn+1];
 +  Geps  = VFtab[nnn+2]*eps;
 +  Heps2 = VFtab[nnn+3]*eps2;
 +  Fp    = Ft + Geps + Heps2;
 +  VV    = Yt + Fp*eps;
 +  FF    = Fp + Geps + 2.0*Heps2;
 +  
 +  *F    = -k*FF*tabscale;
 +  *V    = k*VV;
 +  dvdlambda  = (kB - kA)*VV;
 +  
 +  return dvdlambda;
 +  
 +  /* That was 22 flops */
 +}
 +
 +/* Tabulated bond potential: looks up V(r) and F(r) in fcd->bondtab.
 + * Accumulates dV/dlambda into *dvdlambda and returns the total energy.
 + */
 +real tab_bonds(int nbonds,
 +             const t_iatom forceatoms[],const t_iparams forceparams[],
 +             const rvec x[],rvec f[],rvec fshift[],
 +             const t_pbc *pbc,const t_graph *g,
 +             real lambda,real *dvdlambda,
 +             const t_mdatoms *md,t_fcdata *fcd,
 +             int *global_atom_index)
 +{
 +  int  i,m,ki,ai,aj,type,table;
 +  real dr,dr2,fbond,vbond,fij,vtot;
 +  rvec dx;
 +  ivec dt;
 +
 +  vtot = 0.0;
 +  for(i=0; (i<nbonds); ) {
 +    type = forceatoms[i++];
 +    ai   = forceatoms[i++];
 +    aj   = forceatoms[i++];
 +  
 +    ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);  /*   3          */
 +    dr2  = iprod(dx,dx);                      /*   5          */
 +    dr   = dr2*gmx_invsqrt(dr2);                      /*  10          */
 +
 +    table = forceparams[type].tab.table;
 +
 +    *dvdlambda += bonded_tab("bond",table,
 +                             &fcd->bondtab[table],
 +                             forceparams[type].tab.kA,
 +                             forceparams[type].tab.kB,
 +                             dr,lambda,&vbond,&fbond);  /*  22 */
 +
 +    /* NOTE(review): dvdlambda is already accumulated above even when the
 +     * zero-distance case below skips the energy/force; confirm intended. */
 +    if (dr2 == 0.0)
 +      continue;
 +
 +    
 +    vtot  += vbond;/* 1*/
 +    /* Convert dV/dr into the pair-force scale factor fbond/r */
 +    fbond *= gmx_invsqrt(dr2);                        /*   6          */
 +#ifdef DEBUG
 +    if (debug)
 +      fprintf(debug,"TABBONDS: dr = %10g  vbond = %10g  fbond = %10g\n",
 +            dr,vbond,fbond);
 +#endif
 +    if (g) {
 +      ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
 +      ki=IVEC2IS(dt);
 +    }
 +    /* Spread pair force and book shift forces for the virial */
 +    for (m=0; (m<DIM); m++) {                 /*  15          */
 +      fij=fbond*dx[m];
 +      f[ai][m]+=fij;
 +      f[aj][m]-=fij;
 +      fshift[ki][m]+=fij;
 +      fshift[CENTRAL][m]-=fij;
 +    }
 +  }                                   /* 62 TOTAL     */
 +  return vtot;
 +}
 +
 +/* Tabulated angle potential: looks up V(theta) and dV/dtheta in
 + * fcd->angletab. Forces are only applied when sin(theta) != 0
 + * (cos_theta2 < 1), avoiding the singular linear geometry.
 + * Accumulates dV/dlambda into *dvdlambda; returns the total energy.
 + */
 +real tab_angles(int nbonds,
 +              const t_iatom forceatoms[],const t_iparams forceparams[],
 +              const rvec x[],rvec f[],rvec fshift[],
 +              const t_pbc *pbc,const t_graph *g,
 +              real lambda,real *dvdlambda,
 +              const t_mdatoms *md,t_fcdata *fcd,
 +              int *global_atom_index)
 +{
 +  int  i,ai,aj,ak,t1,t2,type,table;
 +  rvec r_ij,r_kj;
 +  real cos_theta,cos_theta2,theta,dVdt,va,vtot;
 +  ivec jt,dt_ij,dt_kj;
 +  
 +  vtot = 0.0;
 +  for(i=0; (i<nbonds); ) {
 +    type = forceatoms[i++];
 +    ai   = forceatoms[i++];
 +    aj   = forceatoms[i++];
 +    ak   = forceatoms[i++];
 +    
 +    theta  = bond_angle(x[ai],x[aj],x[ak],pbc,
 +                      r_ij,r_kj,&cos_theta,&t1,&t2);  /*  41          */
 +
 +    table = forceparams[type].tab.table;
 +  
 +    *dvdlambda += bonded_tab("angle",table,
 +                             &fcd->angletab[table],
 +                             forceparams[type].tab.kA,
 +                             forceparams[type].tab.kB,
 +                             theta,lambda,&va,&dVdt);  /*  22  */
 +    vtot += va;
 +    
 +    cos_theta2 = sqr(cos_theta);                /*   1                */
 +    if (cos_theta2 < 1) {
 +      int  m;
 +      real snt,st,sth;   /* snt is unused here */
 +      real cik,cii,ckk;
 +      real nrkj2,nrij2;
 +      rvec f_i,f_j,f_k;
 +      
 +      /* Chain rule: dV/dtheta -> dV/dcos(theta) via -1/sin(theta) */
 +      st  = dVdt*gmx_invsqrt(1 - cos_theta2); /*  12          */
 +      sth = st*cos_theta;                     /*   1          */
 +#ifdef DEBUG
 +      if (debug)
 +      fprintf(debug,"ANGLES: theta = %10g  vth = %10g  dV/dtheta = %10g\n",
 +              theta*RAD2DEG,va,dVdt);
 +#endif
 +      nrkj2=iprod(r_kj,r_kj);                 /*   5          */
 +      nrij2=iprod(r_ij,r_ij);
 +      
 +      cik=st*gmx_invsqrt(nrkj2*nrij2);                /*  12          */ 
 +      cii=sth/nrij2;                          /*  10          */
 +      ckk=sth/nrkj2;                          /*  10          */
 +      
 +      for (m=0; (m<DIM); m++) {                       /*  39          */
 +      f_i[m]=-(cik*r_kj[m]-cii*r_ij[m]);
 +      f_k[m]=-(cik*r_ij[m]-ckk*r_kj[m]);
 +      f_j[m]=-f_i[m]-f_k[m];
 +      f[ai][m]+=f_i[m];
 +      f[aj][m]+=f_j[m];
 +      f[ak][m]+=f_k[m];
 +      }
 +      if (g) {
 +      copy_ivec(SHIFT_IVEC(g,aj),jt);
 +      
 +      ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
 +      ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
 +      t1=IVEC2IS(dt_ij);
 +      t2=IVEC2IS(dt_kj);
 +      }
 +      rvec_inc(fshift[t1],f_i);
 +      rvec_inc(fshift[CENTRAL],f_j);
 +      rvec_inc(fshift[t2],f_k);
 +    }                                           /* 169 TOTAL  */
 +  }
 +  return vtot;
 +}
 +
 +/* Tabulated dihedral potential: looks up V(phi) and dV/dphi in
 + * fcd->dihtab. The table is indexed by phi+pi so the argument is
 + * non-negative; force spreading is delegated to do_dih_fup.
 + * Accumulates dV/dlambda into *dvdlambda; returns the total energy.
 + */
 +real tab_dihs(int nbonds,
 +            const t_iatom forceatoms[],const t_iparams forceparams[],
 +            const rvec x[],rvec f[],rvec fshift[],
 +            const t_pbc *pbc,const t_graph *g,
 +            real lambda,real *dvdlambda,
 +            const t_mdatoms *md,t_fcdata *fcd,
 +            int *global_atom_index)
 +{
 +  int  i,type,ai,aj,ak,al,table;
 +  int  t1,t2,t3;
 +  rvec r_ij,r_kj,r_kl,m,n;
 +  real phi,sign,ddphi,vpd,vtot;
 +
 +  vtot = 0.0;
 +  for(i=0; (i<nbonds); ) {
 +    type = forceatoms[i++];
 +    ai   = forceatoms[i++];
 +    aj   = forceatoms[i++];
 +    ak   = forceatoms[i++];
 +    al   = forceatoms[i++];
 +    
 +    phi=dih_angle(x[ai],x[aj],x[ak],x[al],pbc,r_ij,r_kj,r_kl,m,n,
 +                  &sign,&t1,&t2,&t3);                 /*  84  */
 +
 +    table = forceparams[type].tab.table;
 +
 +    /* Hopefully phi+M_PI never results in values < 0 */
 +    *dvdlambda += bonded_tab("dihedral",table,
 +                             &fcd->dihtab[table],
 +                             forceparams[type].tab.kA,
 +                             forceparams[type].tab.kB,
 +                             phi+M_PI,lambda,&vpd,&ddphi);
 +                     
 +    vtot += vpd;
 +    /* -ddphi: do_dih_fup expects -dV/dphi */
 +    do_dih_fup(ai,aj,ak,al,-ddphi,r_ij,r_kj,r_kl,m,n,
 +             f,fshift,pbc,g,x,t1,t2,t3);                      /* 112  */
 +
 +#ifdef DEBUG
 +    fprintf(debug,"pdih: (%d,%d,%d,%d) phi=%g\n",
 +          ai,aj,ak,al,phi);
 +#endif
 +  } /* 227 TOTAL      */
 +
 +  return vtot;
 +}
 +
 +/* Compute, for thread t of nt, a 32-bit mask with a bit set for every
 + * force-buffer block (atom index >> shift) that this thread's share of
 + * the bonded interactions writes to. The per-thread split below must
 + * match the one used in calc_bonds, as it is not stored anywhere.
 + */
 +static unsigned
 +calc_bonded_reduction_mask(const t_idef *idef,
 +                           int shift,
 +                           int t,int nt)
 +{
 +    unsigned mask;
 +    int ftype,nb,nat1,nb0,nb1,i,a;
 +
 +    mask = 0;
 +
 +    for(ftype=0; ftype<F_NRE; ftype++)
 +    {
 +        /* Only bonded types; skip connections/position restraints and GB */
 +        if (interaction_function[ftype].flags & IF_BOND &&
 +            !(ftype == F_CONNBONDS || ftype == F_POSRES) &&
 +            (ftype<F_GB12 || ftype>F_GB14))
 +        {
 +            nb = idef->il[ftype].nr;
 +            if (nb > 0)
 +            {
 +                nat1 = interaction_function[ftype].nratoms + 1;
 +
 +                /* Divide this interaction equally over the threads.
 +                 * This is not stored: should match division in calc_bonds.
 +                 */
 +                nb0 = (((nb/nat1)* t   )/nt)*nat1;
 +                nb1 = (((nb/nat1)*(t+1))/nt)*nat1;
 +
 +                for(i=nb0; i<nb1; i+=nat1)
 +                {
 +                    /* a starts at 1: entry 0 is the parameter type index */
 +                    for(a=1; a<nat1; a++)
 +                    {
 +                        mask |= (1U << (idef->il[ftype].iatoms[i+a]>>shift));
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    return mask;
 +}
 +
 +/* Set up the thread-parallel bonded force reduction: choose a block size
 + * (power of two, at least 64 atoms) such that at most 32 blocks cover the
 + * force array, then record per-thread masks of touched blocks and the
 + * number of blocks that need reducing. No-op for a single thread.
 + */
 +void init_bonded_thread_force_reduction(t_forcerec *fr,
 +                                        const t_idef *idef)
 +{
 +#define MAX_BLOCK_BITS 32
 +    int t;
 +    int ctot,c,b;
 +
 +    if (fr->nthreads <= 1)
 +    {
 +        fr->red_nblock = 0;
 +
 +        return;
 +    }
 +
 +    /* We divide the force array in a maximum of 32 blocks.
 +     * Minimum force block reduction size is 2^6=64.
 +     */
 +    fr->red_ashift = 6;
 +    while (fr->natoms_force > (int)(MAX_BLOCK_BITS*(1U<<fr->red_ashift)))
 +    {
 +        fr->red_ashift++;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"bonded force buffer block atom shift %d bits\n",
 +                fr->red_ashift);
 +    }
 +
 +    /* Determine to which blocks each thread's bonded force calculation
 +     * contributes. Store this is a mask for each thread.
 +     * Thread 0 is skipped: presumably it accumulates directly into the
 +     * main force array rather than a private buffer.
 +     */
 +#pragma omp parallel for num_threads(fr->nthreads) schedule(static)
 +    for(t=1; t<fr->nthreads; t++)
 +    {
 +        fr->f_t[t].red_mask =
 +            calc_bonded_reduction_mask(idef,fr->red_ashift,t,fr->nthreads);
 +    }
 +
 +    /* Determine the maximum number of blocks we need to reduce over */
 +    fr->red_nblock = 0;
 +    ctot = 0;
 +    for(t=0; t<fr->nthreads; t++)
 +    {
 +        c = 0;
 +        for(b=0; b<MAX_BLOCK_BITS; b++)
 +        {
 +            if (fr->f_t[t].red_mask & (1U<<b))
 +            {
 +                fr->red_nblock = max(fr->red_nblock,b+1);
 +                c++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"thread %d flags %x count %d\n",
 +                    t,fr->f_t[t].red_mask,c);
 +        }
 +        ctot += c;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"Number of blocks to reduce: %d of size %d\n",
 +                fr->red_nblock,1<<fr->red_ashift);
 +        fprintf(debug,"Reduction density %.2f density/#thread %.2f\n",
 +                ctot*(1<<fr->red_ashift)/(double)fr->natoms_force,
 +                ctot*(1<<fr->red_ashift)/(double)(fr->natoms_force*fr->nthreads));
 +    }
 +}
 +
 +static void zero_thread_forces(f_thread_t *f_t,int n,
 +                               int nblock,int blocksize)
 +{
 +    int b,a0,a1,a,i,j;
 +
 +    if (n > f_t->f_nalloc)
 +    {
 +        f_t->f_nalloc = over_alloc_large(n);
 +        srenew(f_t->f,f_t->f_nalloc);
 +    }
 +
 +    if (f_t->red_mask != 0)
 +    {
 +        for(b=0; b<nblock; b++)
 +        {
 +            if (f_t->red_mask && (1U<<b))
 +            {
 +                a0 = b*blocksize;
 +                a1 = min((b+1)*blocksize,n);
 +                for(a=a0; a<a1; a++)
 +                {
 +                    clear_rvec(f_t->f[a]);
 +                }
 +            }
 +        }
 +    }
 +    for(i=0; i<SHIFTS; i++)
 +    {
 +        clear_rvec(f_t->fshift[i]);
 +    }
 +    for(i=0; i<F_NRE; i++)
 +    {
 +        f_t->ener[i] = 0;
 +    }
 +    for(i=0; i<egNR; i++)
 +    {
 +        for(j=0; j<f_t->grpp.nener; j++)
 +        {
 +            f_t->grpp.ener[i][j] = 0;
 +        }
 +    }
 +    for(i=0; i<efptNR; i++)
 +    {
 +        f_t->dvdl[i] = 0;
 +    }
 +}
 +
 +/* Reduce the per-thread bonded force buffers into the main force array f,
 + * block by block, using each thread's red_mask to skip untouched blocks.
 + * Thread 0's forces are not in a private buffer and are not reduced here.
 + */
 +static void reduce_thread_force_buffer(int n,rvec *f,
 +                                       int nthreads,f_thread_t *f_t,
 +                                       int nblock,int block_size)
 +{
 +    /* The max thread number is arbitrary,
 +     * we used a fixed number to avoid memory management.
 +     * A stack array of this many rvec pointers is cheap enough.
 +     */
 +#define MAX_BONDED_THREADS 256
 +    int b;
 +
 +    if (nthreads > MAX_BONDED_THREADS)
 +    {
 +        gmx_fatal(FARGS,"Can not reduce bonded forces on more than %d threads",
 +                  MAX_BONDED_THREADS);
 +    }
 +
 +    /* This reduction can run on any number of threads,
 +     * independently of nthreads.
 +     */
 +#pragma omp parallel for num_threads(nthreads) schedule(static)
 +    for(b=0; b<nblock; b++)
 +    {
 +        rvec *fp[MAX_BONDED_THREADS];
 +        int nfb,ft,fb;
 +        int a0,a1,a;
 +
 +        /* Determine which threads contribute to this block */
 +        nfb = 0;
 +        for(ft=1; ft<nthreads; ft++)
 +        {
 +            if (f_t[ft].red_mask & (1U<<b))
 +            {
 +                fp[nfb++] = f_t[ft].f;
 +            }
 +        }
 +        if (nfb > 0)
 +        {
 +            /* Reduce force buffers for threads that contribute */
 +            a0 =  b   *block_size;
 +            a1 = (b+1)*block_size;
 +            a1 = min(a1,n);
 +            for(a=a0; a<a1; a++)
 +            {
 +                for(fb=0; fb<nfb; fb++)
 +                {
 +                    rvec_inc(f[a],fp[fb][a]);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Reduce all per-thread bonded outputs into the global accumulators:
 + * forces (via the blocked buffer reduction), and - when bCalcEnerVir -
 + * shift forces, energies, group-pair energies and (when bDHDL) dV/dlambda.
 + * Thread 0's contributions are already in the global arrays, so the
 + * scalar loops start at t=1.
 + */
 +static void reduce_thread_forces(int n,rvec *f,rvec *fshift,
 +                                 real *ener,gmx_grppairener_t *grpp,real *dvdl,
 +                                 int nthreads,f_thread_t *f_t,
 +                                 int nblock,int block_size,
 +                                 gmx_bool bCalcEnerVir,
 +                                 gmx_bool bDHDL)
 +{
 +    if (nblock > 0)
 +    {
 +        /* Reduce the bonded force buffer */
 +        reduce_thread_force_buffer(n,f,nthreads,f_t,nblock,block_size);
 +    }
 +
 +    /* When necessary, reduce energy and virial using one thread only */
 +    if (bCalcEnerVir)
 +    {
 +        int t,i,j;
 +
 +        for(i=0; i<SHIFTS; i++)
 +        {
 +            for(t=1; t<nthreads; t++)
 +            {
 +                rvec_inc(fshift[i],f_t[t].fshift[i]);
 +            }
 +        }
 +        for(i=0; i<F_NRE; i++)
 +        {
 +            for(t=1; t<nthreads; t++)
 +            {
 +                ener[i] += f_t[t].ener[i];
 +            }
 +        }
 +        /* Note: nener is read from f_t[1]; all threads share the same
 +         * group-pair energy layout. */
 +        for(i=0; i<egNR; i++)
 +        {
 +            for(j=0; j<f_t[1].grpp.nener; j++)
 +            {
 +                for(t=1; t<nthreads; t++)
 +                {
 +                    
 +                    grpp->ener[i][j] += f_t[t].grpp.ener[i][j];
 +                }
 +            }
 +        }
 +        if (bDHDL)
 +        {
 +            for(i=0; i<efptNR; i++)
 +            {
 +                
 +                for(t=1; t<nthreads; t++)
 +                {
 +                    dvdl[i] += f_t[t].dvdl[i];
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Compute one bonded interaction type for one thread's share of the
 + * interaction list. Restraint types accumulate dV/dlambda into the
 + * efptRESTRAINT slot, all other bondeds into efptBONDED. Special cases:
 + * CMAP dihedrals, a fast no-energy proper-dihedral path, and listed
 + * LJ/Coulomb pairs (do_listed_vdw_q). Returns the potential energy.
 + * NOTE(review): this must stay in sync with calc_one_bond_foreign.
 + */
 +static real calc_one_bond(FILE *fplog,int thread,
 +                          int ftype,const t_idef *idef,
 +                          rvec x[], rvec f[], rvec fshift[],
 +                          t_forcerec *fr,
 +                          const t_pbc *pbc,const t_graph *g,
 +                          gmx_enerdata_t *enerd, gmx_grppairener_t *grpp,
 +                          t_nrnb *nrnb,
 +                          real *lambda, real *dvdl,
 +                          const t_mdatoms *md,t_fcdata *fcd,
 +                          gmx_bool bCalcEnerVir,
 +                          int *global_atom_index, gmx_bool bPrintSepPot)
 +{
 +    int ind,nat1,nbonds,efptFTYPE;
 +    real v=0;
 +    t_iatom *iatoms;
 +    int nb0,nbn;
 +
 +    if (IS_RESTRAINT_TYPE(ftype))
 +    {
 +        efptFTYPE = efptRESTRAINT;
 +    }
 +    else
 +    {
 +        efptFTYPE = efptBONDED;
 +    }
 +
 +    if (interaction_function[ftype].flags & IF_BOND &&
 +        !(ftype == F_CONNBONDS || ftype == F_POSRES))
 +    {
 +        ind  = interaction_function[ftype].nrnb_ind;
 +        nat1 = interaction_function[ftype].nratoms + 1;
 +        nbonds    = idef->il[ftype].nr/nat1;
 +        iatoms    = idef->il[ftype].iatoms;
 +
 +        /* This thread's slice [nb0, nb0+nbn) of the interaction list;
 +         * must match the split in calc_bonded_reduction_mask. */
 +        nb0 = ((nbonds* thread   )/(fr->nthreads))*nat1;
 +        nbn = ((nbonds*(thread+1))/(fr->nthreads))*nat1 - nb0;
 +
 +        if (!IS_LISTED_LJ_C(ftype))
 +        {
 +            if(ftype==F_CMAP)
 +            {
 +                v = cmap_dihs(nbn,iatoms+nb0,
 +                              idef->iparams,&idef->cmap_grid,
 +                              (const rvec*)x,f,fshift,
 +                              pbc,g,lambda[efptFTYPE],&(dvdl[efptFTYPE]),
 +                              md,fcd,global_atom_index);
 +            }
 +            else if (ftype == F_PDIHS &&
 +                     !bCalcEnerVir && fr->efep==efepNO)
 +            {
 +                /* No energies, shift forces, dvdl */
 +#ifndef SSE_PROPER_DIHEDRALS
 +                pdihs_noener
 +#else
 +                pdihs_noener_sse
 +#endif
 +                    (nbn,idef->il[ftype].iatoms+nb0,
 +                     idef->iparams,
 +                     (const rvec*)x,f,
 +                     pbc,g,lambda[efptFTYPE],md,fcd,
 +                     global_atom_index);
 +                v = 0;
-             enerd->dvdl_nonlin[efptFTYPE] += dvdl[efptFTYPE];
 +            }
 +            else
 +            {
 +                v = interaction_function[ftype].ifunc(nbn,iatoms+nb0,
 +                                                      idef->iparams,
 +                                                      (const rvec*)x,f,fshift,
 +                                                      pbc,g,lambda[efptFTYPE],&(dvdl[efptFTYPE]),
 +                                                      md,fcd,global_atom_index);
 +            }
-             enerd->dvdl_nonlin[efptCOUL] += dvdl[efptCOUL];
-             enerd->dvdl_nonlin[efptVDW] += dvdl[efptVDW];
 +            if (bPrintSepPot)
 +            {
 +                fprintf(fplog,"  %-23s #%4d  V %12.5e  dVdl %12.5e\n",
 +                        interaction_function[ftype].longname,
 +                        nbonds/nat1,v,lambda[efptFTYPE]);
 +            }
 +        }
 +        else
 +        {
 +            /* Listed 1-4 LJ/Coulomb pairs; dvdl is accumulated per
 +             * component (VDW/COUL) inside do_listed_vdw_q. */
 +            v = do_listed_vdw_q(ftype,nbn,iatoms+nb0,
 +                                idef->iparams,
 +                                (const rvec*)x,f,fshift,
 +                                pbc,g,lambda,dvdl,
 +                                md,fr,grpp,global_atom_index);
 +            
 +            if (bPrintSepPot)
 +            {
 +                fprintf(fplog,"  %-5s + %-15s #%4d                  dVdl %12.5e\n",
 +                        interaction_function[ftype].longname,
 +                        interaction_function[F_LJ14].longname,nbonds/nat1,dvdl[efptVDW]);
 +                fprintf(fplog,"  %-5s + %-15s #%4d                  dVdl %12.5e\n",
 +                        interaction_function[ftype].longname,
 +                        interaction_function[F_COUL14].longname,nbonds/nat1,dvdl[efptCOUL]);
 +            }
 +        }
 +        /* Flop accounting once, on the master thread */
 +        if (ind != -1 && thread == 0)
 +        {
 +            inc_nrnb(nrnb,ind,nbonds);
 +        }
 +    }
 +
 +    return v;
 +}
 +
 +/* WARNING!  THIS FUNCTION MUST EXACTLY TRACK calc_one_bond above,
 +   or horrible things will happen when doing free energy
 +   calculations!  In a good coding world, this would not be a
 +   separate function, but for speed reasons it has to be; keep any
 +   change to one of the two mirrored in the other.  TODO for 5.0 -
 +   figure out a way to reorganize to reduce duplication.
 +*/
 +
 +static real calc_one_bond_foreign(FILE *fplog,int ftype, const t_idef *idef,
 +                                  rvec x[], rvec f[], t_forcerec *fr,
 +                                  const t_pbc *pbc,const t_graph *g,
 +                                  gmx_enerdata_t *enerd, t_nrnb *nrnb,
 +                                  real *lambda, real *dvdl,
 +                                  const t_mdatoms *md,t_fcdata *fcd,
 +                                  int *global_atom_index, gmx_bool bPrintSepPot)
 +{
 +    int ind,nat1,nbonds,efptFTYPE,nbonds_np;
 +    real v=0;
 +    t_iatom *iatoms;
 +
 +    if (IS_RESTRAINT_TYPE(ftype))
 +    {
 +        efptFTYPE = efptRESTRAINT;
 +    }
 +    else
 +    {
 +        efptFTYPE = efptBONDED;
 +    }
 +
 +    if (ftype<F_GB12 || ftype>F_GB14)
 +    {
 +        if (interaction_function[ftype].flags & IF_BOND &&
 +            !(ftype == F_CONNBONDS || ftype == F_POSRES || ftype == F_FBPOSRES))
 +        {
 +            ind  = interaction_function[ftype].nrnb_ind;
 +            nat1 = interaction_function[ftype].nratoms+1;
 +            nbonds_np = idef->il[ftype].nr_nonperturbed;
 +            nbonds    = idef->il[ftype].nr - nbonds_np;
 +            iatoms    = idef->il[ftype].iatoms + nbonds_np;
 +            if (nbonds > 0)
 +            {
 +                if (!IS_LISTED_LJ_C(ftype))
 +                {
 +                    if(ftype==F_CMAP)
 +                    {
 +                        v = cmap_dihs(nbonds,iatoms,
 +                                      idef->iparams,&idef->cmap_grid,
 +                                      (const rvec*)x,f,fr->fshift,
 +                                      pbc,g,lambda[efptFTYPE],&(dvdl[efptFTYPE]),md,fcd,
 +                                      global_atom_index);
 +                    }
 +                    else
 +                    {
 +                        v =       interaction_function[ftype].ifunc(nbonds,iatoms,
 +                                                                  idef->iparams,
 +                                                                  (const rvec*)x,f,fr->fshift,
 +                                                                  pbc,g,lambda[efptFTYPE],&dvdl[efptFTYPE],
 +                                                                  md,fcd,global_atom_index);
 +                    }
 +                }
 +                else
 +                {
 +                    v = do_listed_vdw_q(ftype,nbonds,iatoms,
 +                                        idef->iparams,
 +                                        (const rvec*)x,f,fr->fshift,
 +                                        pbc,g,lambda,dvdl,
 +                                        md,fr,&enerd->grpp,global_atom_index);
 +                }
 +                if (ind != -1)
 +                {
 +                    inc_nrnb(nrnb,ind,nbonds/nat1);
 +                }
 +            }
 +        }
 +    }
 +    return v;
 +}
 +
 +void calc_bonds(FILE *fplog,const gmx_multisim_t *ms,
 +                const t_idef *idef,
 +                rvec x[],history_t *hist,
 +                rvec f[],t_forcerec *fr,
 +                const t_pbc *pbc,const t_graph *g,
 +                gmx_enerdata_t *enerd,t_nrnb *nrnb,
 +                real *lambda,
 +                const t_mdatoms *md,
 +                t_fcdata *fcd,int *global_atom_index,
 +                t_atomtypes *atype, gmx_genborn_t *born,
 +                int force_flags,
 +                gmx_bool bPrintSepPot,gmx_large_int_t step)
 +{
 +    gmx_bool bCalcEnerVir;
 +    int    i;
 +    real   v,dvdl[efptNR],dvdl_dum[efptNR]; /* The dummy array is to have a place to store the dhdl at other values
 +                                               of lambda, which will be thrown away in the end*/
 +    const  t_pbc *pbc_null;
 +    char   buf[22];
 +    int    thread;
 +
 +    bCalcEnerVir = (force_flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY));
 +
 +    for (i=0;i<efptNR;i++)
 +    {
 +        dvdl[i] = 0.0;
 +    }
 +    if (fr->bMolPBC)
 +    {
 +        pbc_null = pbc;
 +    }
 +    else
 +    {
 +        pbc_null = NULL;
 +    }
 +    if (bPrintSepPot)
 +    {
 +        fprintf(fplog,"Step %s: bonded V and dVdl for this node\n",
 +                gmx_step_str(step,buf));
 +    }
 +
 +#ifdef DEBUG
 +    if (g && debug)
 +    {
 +        p_graph(debug,"Bondage is fun",g);
 +    }
 +#endif
 +
 +    /* Do pre force calculation stuff which might require communication */
 +    if (idef->il[F_ORIRES].nr)
 +    {
 +        enerd->term[F_ORIRESDEV] =
 +            calc_orires_dev(ms,idef->il[F_ORIRES].nr,
 +                            idef->il[F_ORIRES].iatoms,
 +                            idef->iparams,md,(const rvec*)x,
 +                            pbc_null,fcd,hist);
 +    }
 +    if (idef->il[F_DISRES].nr)
 +    {
 +        calc_disres_R_6(ms,idef->il[F_DISRES].nr,
 +                        idef->il[F_DISRES].iatoms,
 +                        idef->iparams,(const rvec*)x,pbc_null,
 +                        fcd,hist);
 +    }
 +
 +#pragma omp parallel for num_threads(fr->nthreads) schedule(static)
 +    for(thread=0; thread<fr->nthreads; thread++)
 +    {
 +        int    ftype,nbonds,ind,nat1;
 +        real   *epot,v;
 +        /* thread stuff */
 +        rvec   *ft,*fshift;
 +        real   *dvdlt;
 +        gmx_grppairener_t *grpp;
 +        int    nb0,nbn;
 +
 +        if (thread == 0)
 +        {
 +            ft     = f;
 +            fshift = fr->fshift;
 +            epot   = enerd->term;
 +            grpp   = &enerd->grpp;
 +            dvdlt  = dvdl;
 +        }
 +        else
 +        {
 +            zero_thread_forces(&fr->f_t[thread],fr->natoms_force,
 +                               fr->red_nblock,1<<fr->red_ashift);
 +
 +            ft     = fr->f_t[thread].f;
 +            fshift = fr->f_t[thread].fshift;
 +            epot   = fr->f_t[thread].ener;
 +            grpp   = &fr->f_t[thread].grpp;
 +            dvdlt  = fr->f_t[thread].dvdl;
 +        }
 +        /* Loop over all bonded force types to calculate the bonded forces */
 +        for(ftype=0; (ftype<F_NRE); ftype++)
 +        {
 +            if (idef->il[ftype].nr > 0 &&
 +                (interaction_function[ftype].flags & IF_BOND) &&
 +                (ftype < F_GB12 || ftype > F_GB14) &&
 +                !(ftype == F_CONNBONDS || ftype == F_POSRES))
 +            {
 +                v = calc_one_bond(fplog,thread,ftype,idef,x, 
 +                                  ft,fshift,fr,pbc_null,g,enerd,grpp,
 +                                  nrnb,lambda,dvdlt,
 +                                  md,fcd,bCalcEnerVir,
 +                                  global_atom_index,bPrintSepPot);
 +                epot[ftype]        += v;
 +            }
 +        }
 +    }
 +    if (fr->nthreads > 1)
 +    {
 +        reduce_thread_forces(fr->natoms_force,f,fr->fshift,
 +                             enerd->term,&enerd->grpp,dvdl,
 +                             fr->nthreads,fr->f_t,
 +                             fr->red_nblock,1<<fr->red_ashift,
 +                             bCalcEnerVir,
 +                             force_flags & GMX_FORCE_DHDL);
 +    }
++    if (force_flags & GMX_FORCE_DHDL)
++    {
++        for(i=0; i<efptNR; i++)
++        {
++            enerd->dvdl_nonlin[i] += dvdl[i];
++        }
++    }
 +
 +    /* Copy the sum of violations for the distance restraints from fcd */
 +    if (fcd)
 +    {
 +        enerd->term[F_DISRESVIOL] = fcd->disres.sumviol;
 +
 +    }
 +}
 +
 +void calc_bonds_lambda(FILE *fplog,
 +                       const t_idef *idef,
 +                       rvec x[],
 +                       t_forcerec *fr,
 +                       const t_pbc *pbc,const t_graph *g,
 +                       gmx_enerdata_t *enerd,t_nrnb *nrnb,
 +                       real *lambda,
 +                       const t_mdatoms *md,
 +                       t_fcdata *fcd,
 +                       int *global_atom_index)
 +{
 +    int    i,ftype,nbonds_np,nbonds,ind,nat;
 +    real   v,dr,dr2,*epot;
 +    real   dvdl_dum[efptNR];
 +    rvec   *f,*fshift_orig;
 +    const  t_pbc *pbc_null;
 +    t_iatom *iatom_fe;
 +
 +    if (fr->bMolPBC)
 +    {
 +        pbc_null = pbc;
 +    }
 +    else
 +    {
 +        pbc_null = NULL;
 +    }
 +
 +    epot = enerd->term;
 +
 +    snew(f,fr->natoms_force);
 +    /* We want to preserve the fshift array in forcerec */
 +    fshift_orig = fr->fshift;
 +    snew(fr->fshift,SHIFTS);
 +
 +    /* Loop over all bonded force types to calculate the bonded forces */
 +    for(ftype=0; (ftype<F_NRE); ftype++) 
 +    {
 +        v = calc_one_bond_foreign(fplog,ftype,idef,x, 
 +                                  f,fr,pbc_null,g,enerd,nrnb,lambda,dvdl_dum,
 +                                  md,fcd,global_atom_index,FALSE);
 +        epot[ftype] += v;
 +    }
 +
 +    sfree(fr->fshift);
 +    fr->fshift = fshift_orig;
 +    sfree(f);
 +}
index 8edfb22717ba99afec184f692ffd0f892798ab53,0000000000000000000000000000000000000000..8be120d79df9068f3a1114eb6f2115cc472c1729
mode 100644,000000..100644
--- /dev/null
@@@ -1,445 -1,0 +1,447 @@@
- static omp_module_nthreads_t modth = { 0, 0, {0, 0, 0, 0, 0, 0, 0, 0}, FALSE};
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2010, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
 +#include <assert.h>
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "gmx_fatal.h"
 +#include "typedefs.h"
 +#include "macros.h"
 +#include "network.h"
 +#include "statutil.h"
 +#include "gmx_omp.h"
 +#include "gmx_omp_nthreads.h"
 +#include "md_logging.h"
 +
 +/** Structure with the number of threads for each OpenMP multi-threaded
 + *  algorithmic module in mdrun. */
 +typedef struct
 +{
 +    int gnth;               /**< Global num. of threads per PP or PP+PME process/tMPI thread. */
 +    int gnth_pme;           /**< Global num. of threads per PME only process/tMPI thread. */
 +
 +    int nth[emntNR];        /**< Number of threads for each module, indexed with module_nth_t */
 +    gmx_bool initialized;   /**< TRUE if the module as been initialized. */
 +} omp_module_nthreads_t;
 +
 +/** Names of environment variables to set the per module number of threads.
 + *
 + *  Indexed with the values of module_nth_t.
 + * */
 +static const char *modth_env_var[emntNR] =
 +{
 +    "GMX_DEFAULT_NUM_THREADS should never be set",
 +    "GMX_DOMDEC_NUM_THREADS", "GMX_PAIRSEARCH_NUM_THREADS",
 +    "GMX_NONBONDED_NUM_THREADS", "GMX_BONDED_NUM_THREADS",
 +    "GMX_PME_NUM_THREADS", "GMX_UPDATE_NUM_THREADS",
++    "GMX_VSITE_NUM_THREADS",
 +    "GMX_LINCS_NUM_THREADS", "GMX_SETTLE_NUM_THREADS"
 +};
 +
 +/** Names of the modules. */
 +static const char *mod_name[emntNR] =
 +{
 +    "default", "domain decomposition", "pair search", "non-bonded",
 +    "bonded", "PME", "update", "LINCS", "SETTLE"
 +};
 +
 +/** Number of threads for each algorithmic module.
 + *
 + *  File-scope global variable that gets set once in \init_module_nthreads
 + *  and queried via gmx_omp_nthreads_get.
 + *
 + *  All fields are initialized to 0 which should result in errors if
 + *  the init call is omitted.
 + * */
++static omp_module_nthreads_t modth = { 0, 0, {0, 0, 0, 0, 0, 0, 0, 0, 0}, FALSE};
 +
 +
 +/** Determine the number of threads for module \mod.
 + *
 + *  \m takes values form the module_nth_t enum and maps these to the
 + *  corresponding value in modth_env_var.
 + *
 + *  Each number of threads per module takes the default value unless
 + *  GMX_*_NUM_THERADS env var is set, case in which its value overrides
 + *  the deafult.
 + *
 + *  The "group" scheme supports OpenMP only in PME and in thise case all but
 + *  the PME nthread values default to 1.
 + */
 +static int pick_module_nthreads(FILE *fplog, int m,
 +                                gmx_bool bSimMaster,
 +                                gmx_bool bFullOmpSupport,
 +                                gmx_bool bSepPME)
 +{
 +    char *env;
 +    int  nth;
 +    char sbuf[STRLEN];
 +    gmx_bool bOMP;
 +
 +#ifdef GMX_OPENMP
 +    bOMP = TRUE;
 +#else
 +    bOMP = FALSE;
 +#endif /* GMX_OPENMP */
 +
 +    /* The default should never be set through a GMX_*_NUM_THREADS env var
 +     * as it's always equal with gnth. */
 +    if (m == emntDefault)
 +    {
 +        return modth.nth[emntDefault];
 +    }
 +
 +    /* check the environment variable */
 +    if ((env = getenv(modth_env_var[m])) != NULL)
 +    {
 +        sscanf(env, "%d", &nth);
 +
 +        if (!bOMP)
 +        {
 +            gmx_warning("%s=%d is set, but %s is compiled without OpenMP!",
 +                        modth_env_var[m], nth, ShortProgram());
 +        }
 +
 +        /* with the verlet codepath, when any GMX_*_NUM_THREADS env var is set,
 +         * OMP_NUM_THREADS also has to be set */
 +        if (bFullOmpSupport && getenv("OMP_NUM_THREADS") == NULL)
 +        {
 +            gmx_fatal(FARGS, "%s=%d is set, the default number of threads also "
 +                      "needs to be set with OMP_NUM_THREADS!",
 +                      modth_env_var[m], nth);
 +        }
 +
 +        /* with the group scheme warn if any env var except PME is set */
 +        if (!bFullOmpSupport)
 +        {
 +            if (m != emntPME)
 +            {
 +                gmx_warning("%s=%d is set, but OpenMP multithreading is not "
 +                            "supported in %s!",
 +                            modth_env_var[m], nth, mod_name[m]);
 +                nth = 1;
 +            }
 +        }
 +
 +        /* only babble if we are really overriding with a different value */
 +        if ((bSepPME && m == emntPME && nth != modth.gnth_pme) || (nth != modth.gnth))
 +        {
 +            sprintf(sbuf, "%s=%d set, overriding the default number of %s threads",
 +                    modth_env_var[m], nth, mod_name[m]);
 +            if (bSimMaster)
 +            {
 +                fprintf(stderr, "\n%s\n", sbuf);
 +            }
 +            if (fplog)
 +            {
 +                fprintf(fplog, "%s\n", sbuf);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* pick the global PME node nthreads if we are setting the number
 +         * of threads in separate PME nodes  */
 +        nth = (bSepPME && m == emntPME) ? modth.gnth_pme : modth.gnth;
 +    }
 +
 +    return modth.nth[m] = nth;
 +}
 +
 +void gmx_omp_nthreads_read_env(int *nthreads_omp)
 +{
 +    char *env;
 +
 +    assert(nthreads_omp);
 +
 +    if ((env = getenv("OMP_NUM_THREADS")) != NULL)
 +    {
 +        int nt_omp;
 +
 +        sscanf(env,"%d",&nt_omp);
 +        if (nt_omp <= 0)
 +        {
 +            gmx_fatal(FARGS,"OMP_NUM_THREADS is invalid: '%s'",env);
 +        }
 +
 +        if (*nthreads_omp > 0 && nt_omp != *nthreads_omp)
 +        {
 +            gmx_fatal(FARGS,"OMP_NUM_THREADS (%d) and the number of threads requested on the command line (%d) have different values",nt_omp,*nthreads_omp);
 +        }
 +
 +        /* Setting the number of OpenMP threads.
 +         * NOTE: with tMPI this function is only called on the master node,
 +         * but with MPI on all nodes which means lots of messages on stderr.
 +         */
 +        fprintf(stderr,"Getting the number of OpenMP threads from OMP_NUM_THREADS: %d\n",nt_omp);
 +        *nthreads_omp = nt_omp;
 +    }
 +}
 +
 +void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
 +                           int nthreads_hw_avail,
 +                           int omp_nthreads_req,
 +                           int omp_nthreads_pme_req,
 +                           gmx_bool bThisNodePMEOnly,
 +                           gmx_bool bFullOmpSupport)
 +{
 +    int  nth, nth_pmeonly, gmx_maxth, nppn;
 +    char *env;
 +    gmx_bool bSepPME, bOMP;
 +
 +#ifdef GMX_OPENMP
 +    bOMP = TRUE;
 +#else
 +    bOMP = FALSE;
 +#endif /* GMX_OPENMP */
 +
 +    /* number of processes per node */
 +    nppn = cr->nnodes_intra;
 +
 +    bSepPME = ( (cr->duty & DUTY_PP) && !(cr->duty & DUTY_PME)) ||
 +              (!(cr->duty & DUTY_PP) &&  (cr->duty & DUTY_PME));
 +
 +#ifdef GMX_THREAD_MPI
 +    /* modth is shared among tMPI threads, so for thread safety do the
 +     * detection is done on the master only. It is not thread-safe with
 +     * multiple simulations, but that's anyway not supported by tMPI. */
 +    if (SIMMASTER(cr))
 +#endif
 +    {
 +        /* just return if the initialization has already been done */
 +        if (modth.initialized)
 +        {
 +            return;
 +        }
 +
 +        /* With full OpenMP support (verlet scheme) set the number of threads
 +         * per process / default:
 +         * - 1 if not compiled with OpenMP or
 +         * - OMP_NUM_THREADS if the env. var is set, or
 +         * - omp_nthreads_req = #of threads requested by the user on the mdrun
 +         *   command line, otherwise
 +         * - take the max number of available threads and distribute them
 +         *   on the processes/tMPI threads.
 +         * ~ The GMX_*_NUM_THREADS env var overrides the number of threads of
 +         *   the respective module and it has to be used in conjunction with
 +         *   OMP_NUM_THREADS.
 +         *
 +         * With the group scheme OpenMP multithreading is only supported in PME,
 +         * for all other modules nthreads is set to 1.
 +         * The number of PME threads is equal to:
 +         * - 1 if not compiled with OpenMP or
 +         * - GMX_PME_NUM_THREADS if defined, otherwise
 +         * - OMP_NUM_THREADS if defined, otherwise
 +         * - 1
 +         */
 +        nth = 1;
 +        if ((env = getenv("OMP_NUM_THREADS")) != NULL)
 +        {
 +            if (!bOMP && (strncmp(env, "1", 1) != 0))
 +            {
 +                gmx_warning("OMP_NUM_THREADS is set, but %s was compiled without OpenMP support!",
 +                            ShortProgram());
 +            }
 +            else
 +            {
 +                nth = gmx_omp_get_max_threads();
 +            }
 +        }
 +        else if (omp_nthreads_req > 0)
 +        {
 +            nth = omp_nthreads_req;
 +        }
 +        else if (bFullOmpSupport && bOMP)
 +        {
 +            /* max available threads per node */
 +            nth = nthreads_hw_avail;
 +
 +            /* divide the threads among the MPI processes/tMPI threads */
 +            if (nth >= nppn)
 +            {
 +                nth /= nppn;
 +            }
 +            else
 +            {
 +                nth = 1;
 +            }
 +        }
 +
 +        /* now we have the global values, set them:
 +         * - 1 if not compiled with OpenMP and for the group scheme
 +         * - nth for the verlet scheme when compiled with OpenMP
 +         */
 +        if (bFullOmpSupport && bOMP)
 +        {
 +            modth.gnth = nth;
 +        }
 +        else
 +        {
 +            modth.gnth = 1;
 +        }
 +
 +        if (bSepPME)
 +        {
 +            if (omp_nthreads_pme_req > 0)
 +            {
 +                modth.gnth_pme = omp_nthreads_pme_req;
 +            }
 +            else
 +            {
 +                modth.gnth_pme = nth;
 +            }
 +        }
 +        else
 +        {
 +            modth.gnth_pme = 0;
 +        }
 +
 +        /* now set the per-module values */
 +        modth.nth[emntDefault] = modth.gnth;
 +        pick_module_nthreads(fplog, emntDomdec, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntPairsearch, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntNonbonded, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntBonded, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntPME, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntUpdate, SIMMASTER(cr), bFullOmpSupport, bSepPME);
++        pick_module_nthreads(fplog, emntVSITE, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntLINCS, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntSETTLE, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +
 +        /* set the number of threads globally */
 +        if (bOMP)
 +        {
 +#ifndef GMX_THREAD_MPI
 +            if (bThisNodePMEOnly)
 +            {
 +                gmx_omp_set_num_threads(modth.gnth_pme);
 +            }
 +            else
 +#endif /* GMX_THREAD_MPI */
 +            {
 +                if (bFullOmpSupport)
 +                {
 +                    gmx_omp_set_num_threads(nth);
 +                }
 +                else
 +                {
 +                    gmx_omp_set_num_threads(1);
 +                }
 +            }
 +        }
 +
 +        modth.initialized = TRUE;
 +    }
 +#ifdef GMX_THREAD_MPI
 +    /* Non-master threads have to wait for the detection to be done. */
 +    if (PAR(cr))
 +    {
 +        MPI_Barrier(cr->mpi_comm_mysim);
 +    }
 +#endif
 +
 +    /* inform the user about the settings */
 +    if (SIMMASTER(cr) && bOMP)
 +    {
 +#ifdef GMX_THREAD_MPI
 +        const char *mpi_str="per tMPI thread";
 +#else
 +        const char *mpi_str="per MPI process";
 +#endif
 +
 +        /* for group scheme we print PME threads info only */
 +        if (bFullOmpSupport)
 +        {
 +            fprintf(stderr, "Using %d OpenMP thread%s %s\n",
 +                    modth.gnth,modth.gnth > 1 ? "s" : "",
 +                    cr->nnodes > 1 ? mpi_str : "");
 +        }
 +        if (bSepPME && modth.gnth_pme != modth.gnth)
 +        {
 +            fprintf(stderr, "Using %d OpenMP thread%s %s for PME\n",
 +                    modth.gnth_pme,modth.gnth_pme > 1 ? "s" : "",
 +                    cr->nnodes > 1 ? mpi_str : "");
 +        }
 +    }
 +
 +    /* detect and warn about oversubscription
 +     * TODO: enable this for separate PME nodes as well! */
 +    if (!bSepPME && cr->nodeid_intra == 0)
 +    {
 +        char sbuf[STRLEN], sbuf1[STRLEN], sbuf2[STRLEN];
 +
 +        if (modth.gnth*nppn > nthreads_hw_avail)
 +        {
 +            sprintf(sbuf, "threads");
 +            sbuf1[0] = '\0';
 +            sprintf(sbuf2, "O");
 +#ifdef GMX_MPI
 +            if (modth.gnth == 1)
 +            {
 +#ifdef GMX_THREAD_MPI
 +                sprintf(sbuf, "thread-MPI threads");
 +#else
 +                sprintf(sbuf, "MPI processes");
 +                sprintf(sbuf1, " per node");
 +                sprintf(sbuf2, "On node %d: o", cr->sim_nodeid);
 +#endif
 +            }
 +#endif
 +            md_print_warn(cr, fplog,
 +                          "WARNING: %sversubscribing the available %d logical CPU cores%s with %d %s.\n"
 +                          "         This will cause considerable performance loss!",
 +                          sbuf2, nthreads_hw_avail, sbuf1, nppn*modth.gnth, sbuf);
 +        }
 +    }
 +}
 +
 +int gmx_omp_nthreads_get(int mod)
 +{
 +    if (mod < 0 || mod >= emntNR)
 +    {
 +        /* invalid module queried */
 +        return -1;
 +    }
 +    else
 +    {
 +        return modth.nth[mod];
 +    }
 +}
Simple merge
Simple merge
Simple merge
index 18d41f059a74b7286f2553428c8b1b35d683ce28,0000000000000000000000000000000000000000..519f5353251c4d2e003f76ada13a70171a4b4807
mode 100644,000000..100644
--- /dev/null
@@@ -1,1374 -1,0 +1,1376 @@@
-     tensor           *rmdr_th;   /* Thread local working data          */
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "confio.h"
 +#include "constr.h"
 +#include "copyrite.h"
 +#include "invblock.h"
 +#include "main.h"
 +#include "mdrun.h"
 +#include "nrnb.h"
 +#include "smalloc.h"
 +#include "vec.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "txtdump.h"
 +#include "domdec.h"
 +#include "pdbio.h"
 +#include "partdec.h"
 +#include "splitter.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +#include "macros.h"
 +#include "gmx_omp_nthreads.h"
 +
 +typedef struct gmx_constr {
 +  int              ncon_tot;     /* The total number of constraints    */
 +  int              nflexcon;     /* The number of flexible constraints */
 +  int              n_at2con_mt;  /* The size of at2con = #moltypes     */
 +  t_blocka         *at2con_mt;   /* A list of atoms to constraints     */
 +  int              n_at2settle_mt; /* The size of at2settle = #moltypes  */
 +  int              **at2settle_mt; /* A list of atoms to settles         */
 +  gmx_bool         bInterCGsettles;
 +  gmx_lincsdata_t  lincsd;       /* LINCS data                         */
 +  gmx_shakedata_t  shaked;       /* SHAKE data                         */
 +  gmx_settledata_t settled;      /* SETTLE data                        */
 +  int              nblocks;      /* The number of SHAKE blocks         */
 +  int              *sblock;      /* The SHAKE blocks                   */
 +  int              sblock_nalloc;/* The allocation size of sblock      */
 +  real             *lagr;        /* Lagrange multipliers for SHAKE     */
 +  int              lagr_nalloc;  /* The allocation size of lagr        */
 +  int              maxwarn;      /* The maximum number of warnings     */
 +  int              warncount_lincs;
 +  int              warncount_settle;
 +  gmx_edsam_t      ed;           /* The essential dynamics data        */
 +
-     tensor  rmdr;
++    tensor           *vir_r_m_dr_th;/* Thread local working data          */
 +    int              *settle_error; /* Thread local working data          */
 +
 +  gmx_mtop_t       *warn_mtop;   /* Only used for printing warnings    */
 +} t_gmx_constr;
 +
 +typedef struct {
 +  atom_id iatom[3];
 +  atom_id blocknr;
 +} t_sortblock;
 +
 +static void *init_vetavars(t_vetavars *vars,
 +                           gmx_bool constr_deriv,
 +                           real veta,real vetanew, t_inputrec *ir, gmx_ekindata_t *ekind, gmx_bool bPscal) 
 +{
 +    double g;
 +    int i;
 +
 +    /* first, set the alpha integrator variable */
 +    if ((ir->opts.nrdf[0] > 0) && bPscal) 
 +    {
 +        vars->alpha = 1.0 + DIM/((double)ir->opts.nrdf[0]);  
 +    } else {
 +        vars->alpha = 1.0;
 +    }
 +    g = 0.5*veta*ir->delta_t;
 +    vars->rscale = exp(g)*series_sinhx(g);
 +    g = -0.25*vars->alpha*veta*ir->delta_t;
 +    vars->vscale = exp(g)*series_sinhx(g);
 +    vars->rvscale = vars->vscale*vars->rscale;
 +    vars->veta = vetanew;
 +
 +    if (constr_deriv)
 +    {
 +        snew(vars->vscale_nhc,ir->opts.ngtc);
 +        if ((ekind==NULL) || (!bPscal))
 +        {
 +            for (i=0;i<ir->opts.ngtc;i++)
 +            {
 +                vars->vscale_nhc[i] = 1;
 +            }
 +        }
 +        else
 +        {
 +            for (i=0;i<ir->opts.ngtc;i++)
 +            {
 +                vars->vscale_nhc[i] = ekind->tcstat[i].vscale_nhc;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        vars->vscale_nhc = NULL;
 +    }
 +
 +    return vars;
 +}
 +
 +static void free_vetavars(t_vetavars *vars) 
 +{
 +    if (vars->vscale_nhc != NULL)
 +    {
 +        sfree(vars->vscale_nhc);
 +    }
 +}
 +
 +static int pcomp(const void *p1, const void *p2)
 +{
 +  int     db;
 +  atom_id min1,min2,max1,max2;
 +  t_sortblock *a1=(t_sortblock *)p1;
 +  t_sortblock *a2=(t_sortblock *)p2;
 +  
 +  db=a1->blocknr-a2->blocknr;
 +  
 +  if (db != 0)
 +    return db;
 +    
 +  min1=min(a1->iatom[1],a1->iatom[2]);
 +  max1=max(a1->iatom[1],a1->iatom[2]);
 +  min2=min(a2->iatom[1],a2->iatom[2]);
 +  max2=max(a2->iatom[1],a2->iatom[2]);
 +  
 +  if (min1 == min2)
 +    return max1-max2;
 +  else
 +    return min1-min2;
 +}
 +
 +int n_flexible_constraints(struct gmx_constr *constr)
 +{
 +  int nflexcon;
 +
 +  if (constr)
 +    nflexcon = constr->nflexcon;
 +  else
 +    nflexcon = 0;
 +
 +  return nflexcon;
 +}
 +
 +void too_many_constraint_warnings(int eConstrAlg,int warncount)
 +{
 +  const char *abort="- aborting to avoid logfile runaway.\n"
 +    "This normally happens when your system is not sufficiently equilibrated,"
 +    "or if you are changing lambda too fast in free energy simulations.\n";
 +  
 +  gmx_fatal(FARGS,
 +          "Too many %s warnings (%d)\n"
 +          "If you know what you are doing you can %s"
 +          "set the environment variable GMX_MAXCONSTRWARN to -1,\n"
 +          "but normally it is better to fix the problem",
 +          (eConstrAlg == econtLINCS) ? "LINCS" : "SETTLE",warncount,
 +          (eConstrAlg == econtLINCS) ?
 +          "adjust the lincs warning threshold in your mdp file\nor " : "\n");
 +}
 +
 +static void write_constr_pdb(const char *fn,const char *title,
 +                             gmx_mtop_t *mtop,
 +                             int start,int homenr,t_commrec *cr,
 +                             rvec x[],matrix box)
 +{
 +    char fname[STRLEN],format[STRLEN];
 +    FILE *out;
 +    int  dd_ac0=0,dd_ac1=0,i,ii,resnr;
 +    gmx_domdec_t *dd;
 +    char *anm,*resnm;
 +  
 +    dd = NULL;
 +    if (PAR(cr))
 +    {
 +        sprintf(fname,"%s_n%d.pdb",fn,cr->sim_nodeid);
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd = cr->dd;
 +            dd_get_constraint_range(dd,&dd_ac0,&dd_ac1);
 +            start = 0;
 +            homenr = dd_ac1;
 +        }
 +    }
 +    else
 +    {
 +        sprintf(fname,"%s.pdb",fn);
 +    }
 +    sprintf(format,"%s\n",get_pdbformat());
 +    
 +    out = gmx_fio_fopen(fname,"w");
 +    
 +    fprintf(out,"TITLE     %s\n",title);
 +    gmx_write_pdb_box(out,-1,box);
 +    for(i=start; i<start+homenr; i++)
 +    {
 +        if (dd != NULL)
 +        {
 +            if (i >= dd->nat_home && i < dd_ac0)
 +            {
 +                continue;
 +            }
 +            ii = dd->gatindex[i];
 +        }
 +        else
 +        {
 +            ii = i;
 +        }
 +        gmx_mtop_atominfo_global(mtop,ii,&anm,&resnr,&resnm);
 +        fprintf(out,format,"ATOM",(ii+1)%100000,
 +                anm,resnm,' ',resnr%10000,' ',
 +                10*x[i][XX],10*x[i][YY],10*x[i][ZZ]);
 +    }
 +    fprintf(out,"TER\n");
 +
 +    gmx_fio_fclose(out);
 +}
 +                           
 +static void dump_confs(FILE *fplog,gmx_large_int_t step,gmx_mtop_t *mtop,
 +                     int start,int homenr,t_commrec *cr,
 +                     rvec x[],rvec xprime[],matrix box)
 +{
 +  char buf[256],buf2[22];
 + 
 +  char *env=getenv("GMX_SUPPRESS_DUMP");
 +  if (env)
 +      return; 
 +  
 +  sprintf(buf,"step%sb",gmx_step_str(step,buf2));
 +  write_constr_pdb(buf,"initial coordinates",
 +                 mtop,start,homenr,cr,x,box);
 +  sprintf(buf,"step%sc",gmx_step_str(step,buf2));
 +  write_constr_pdb(buf,"coordinates after constraining",
 +                 mtop,start,homenr,cr,xprime,box);
 +  if (fplog)
 +  {
 +      fprintf(fplog,"Wrote pdb files with previous and current coordinates\n");
 +  }
 +  fprintf(stderr,"Wrote pdb files with previous and current coordinates\n");
 +}
 +
 +static void pr_sortblock(FILE *fp,const char *title,int nsb,t_sortblock sb[])
 +{
 +  int i;
 +  
 +  fprintf(fp,"%s\n",title);
 +  for(i=0; (i<nsb); i++)
 +    fprintf(fp,"i: %5d, iatom: (%5d %5d %5d), blocknr: %5d\n",
 +          i,sb[i].iatom[0],sb[i].iatom[1],sb[i].iatom[2],
 +          sb[i].blocknr);
 +}
 +
 +gmx_bool constrain(FILE *fplog,gmx_bool bLog,gmx_bool bEner,
 +                   struct gmx_constr *constr,
 +                   t_idef *idef,t_inputrec *ir,gmx_ekindata_t *ekind,
 +                   t_commrec *cr,
 +                   gmx_large_int_t step,int delta_step,
 +                   t_mdatoms *md,
 +                   rvec *x,rvec *xprime,rvec *min_proj,
 +                   gmx_bool bMolPBC,matrix box,
 +                   real lambda,real *dvdlambda,
 +                   rvec *v,tensor *vir,
 +                   t_nrnb *nrnb,int econq,gmx_bool bPscal,
 +                   real veta, real vetanew)
 +{
 +    gmx_bool    bOK,bDump;
 +    int     start,homenr,nrend;
 +    int     i,j,d;
 +    int     ncons,settle_error;
-         clear_mat(rmdr);
++    tensor  vir_r_m_dr;
 +    rvec    *vstor;
 +    real    invdt,vir_fac,t;
 +    t_ilist *settle;
 +    int     nsettle;
 +    t_pbc   pbc,*pbc_null;
 +    char    buf[22];
 +    t_vetavars vetavar;
 +    int     nth,th;
 +
 +    if (econq == econqForceDispl && !EI_ENERGY_MINIMIZATION(ir->eI))
 +    {
 +        gmx_incons("constrain called for forces displacements while not doing energy minimization, can not do this while the LINCS and SETTLE constraint connection matrices are mass weighted");
 +    }
 +    
 +    bOK   = TRUE;
 +    bDump = FALSE;
 +    
 +    start  = md->start;
 +    homenr = md->homenr;
 +    nrend = start+homenr;
 +
 +    /* set constants for pressure control integration */ 
 +    init_vetavars(&vetavar,econq!=econqCoord,
 +                  veta,vetanew,ir,ekind,bPscal);
 +
 +    if (ir->delta_t == 0)
 +    {
 +        invdt = 0;
 +    }
 +    else
 +    {
 +        invdt  = 1/ir->delta_t;
 +    }
 +
 +    if (ir->efep != efepNO && EI_DYNAMICS(ir->eI))
 +    {
 +        /* Set the constraint lengths for the step at which this configuration
 +         * is meant to be. The invmasses should not be changed.
 +         */
 +        lambda += delta_step*ir->fepvals->delta_lambda;
 +    }
 +    
 +    if (vir != NULL)
 +    {
-     if (nth > 1 && constr->rmdr_th == NULL)
++        clear_mat(vir_r_m_dr);
 +    }
 +    
 +    where();
 +
 +    settle  = &idef->il[F_SETTLE];
 +    nsettle = settle->nr/(1+NRAL(F_SETTLE));
 +
 +    if (nsettle > 0)
 +    {
 +        nth = gmx_omp_nthreads_get(emntSETTLE);
 +    }
 +    else
 +    {
 +        nth = 1;
 +    }
 +
-         snew(constr->rmdr_th,nth);
++    if (nth > 1 && constr->vir_r_m_dr_th == NULL)
 +    {
-                               invdt,v,vir!=NULL,rmdr,
++        snew(constr->vir_r_m_dr_th,nth);
 +        snew(constr->settle_error,nth);
 +    }
 +    
 +    settle_error = -1;
 +
 +    /* We do not need full pbc when constraints do not cross charge groups,
 +     * i.e. when dd->constraint_comm==NULL.
 +     * Note that PBC for constraints is different from PBC for bondeds.
 +     * For constraints there is both forward and backward communication.
 +     */
 +    if (ir->ePBC != epbcNONE &&
 +        (cr->dd || bMolPBC) && !(cr->dd && cr->dd->constraint_comm==NULL))
 +    {
 +        /* With pbc=screw the screw has been changed to a shift
 +         * by the constraint coordinate communication routine,
 +         * so that here we can use normal pbc.
 +         */
 +        pbc_null = set_pbc_dd(&pbc,ir->ePBC,cr->dd,FALSE,box);
 +    }
 +    else
 +    {
 +        pbc_null = NULL;
 +    }
 +
 +    /* Communicate the coordinates required for the non-local constraints
 +     * for LINCS and/or SETTLE.
 +     */
 +    if (cr->dd)
 +    {
 +        dd_move_x_constraints(cr->dd,box,x,xprime);
 +    }
 +      else if (PARTDECOMP(cr))
 +      {
 +              pd_move_x_constraints(cr,x,xprime);
 +      }       
 +
 +    if (constr->lincsd != NULL)
 +    {
 +        bOK = constrain_lincs(fplog,bLog,bEner,ir,step,constr->lincsd,md,cr,
 +                              x,xprime,min_proj,
 +                              box,pbc_null,lambda,dvdlambda,
-                           invdt,v,vir!=NULL,rmdr,constr->maxwarn>=0,econq,&vetavar);
++                              invdt,v,vir!=NULL,vir_r_m_dr,
 +                              econq,nrnb,
 +                              constr->maxwarn,&constr->warncount_lincs);
 +        if (!bOK && constr->maxwarn >= 0)
 +        {
 +            if (fplog != NULL)
 +            {
 +                fprintf(fplog,"Constraint error in algorithm %s at step %s\n",
 +                        econstr_names[econtLINCS],gmx_step_str(step,buf));
 +            }
 +            bDump = TRUE;
 +        }
 +    } 
 +    
 +    if (constr->nblocks > 0)
 +    {
 +        switch (econq) {
 +        case (econqCoord):
 +            bOK = bshakef(fplog,constr->shaked,
 +                          homenr,md->invmass,constr->nblocks,constr->sblock,
 +                          idef,ir,x,xprime,nrnb,
 +                          constr->lagr,lambda,dvdlambda,
-                           invdt,NULL,vir!=NULL,rmdr,constr->maxwarn>=0,econq,&vetavar);
++                          invdt,v,vir!=NULL,vir_r_m_dr,
++                          constr->maxwarn>=0,econq,&vetavar);
 +            break;
 +        case (econqVeloc):
 +            bOK = bshakef(fplog,constr->shaked,
 +                          homenr,md->invmass,constr->nblocks,constr->sblock,
 +                          idef,ir,x,min_proj,nrnb,
 +                          constr->lagr,lambda,dvdlambda,
-                     clear_mat(constr->rmdr_th[th]);
++                          invdt,NULL,vir!=NULL,vir_r_m_dr,
++                          constr->maxwarn>=0,econq,&vetavar);
 +            break;
 +        default:
 +            gmx_fatal(FARGS,"Internal error, SHAKE called for constraining something else than coordinates");
 +            break;
 +        }
 +        
 +        if (!bOK && constr->maxwarn >= 0)
 +        {
 +            if (fplog != NULL)
 +            {
 +                fprintf(fplog,"Constraint error in algorithm %s at step %s\n",
 +                        econstr_names[econtSHAKE],gmx_step_str(step,buf));
 +            }
 +            bDump = TRUE;
 +        }
 +    }
 +    
 +    if (nsettle > 0)
 +    {
 +        int calcvir_atom_end;
 +
 +        if (vir == NULL)
 +        {
 +            calcvir_atom_end = 0;
 +        }
 +        else
 +        {
 +            calcvir_atom_end = md->start + md->homenr;
 +        }
 +
 +        switch (econq)
 +        {
 +        case econqCoord:
 +#pragma omp parallel for num_threads(nth) schedule(static)
 +            for(th=0; th<nth; th++)
 +            {
 +                int start_th,end_th;
 +
 +                if (th > 0)
 +                {
-                             th == 0 ? rmdr : constr->rmdr_th[th],
++                    clear_mat(constr->vir_r_m_dr_th[th]);
 +                }
 +
 +                start_th = (nsettle* th   )/nth;
 +                end_th   = (nsettle*(th+1))/nth;
 +                if (start_th >= 0 && end_th - start_th > 0)
 +                {
 +                    csettle(constr->settled,
 +                            end_th-start_th,
 +                            settle->iatoms+start_th*(1+NRAL(F_SETTLE)),
 +                            pbc_null,
 +                            x[0],xprime[0],
 +                            invdt,v?v[0]:NULL,calcvir_atom_end,
-                     clear_mat(constr->rmdr_th[th]);
++                            th == 0 ? vir_r_m_dr : constr->vir_r_m_dr_th[th],
 +                            th == 0 ? &settle_error : &constr->settle_error[th],
 +                            &vetavar);
 +                }
 +            }
 +            inc_nrnb(nrnb,eNR_SETTLE,nsettle);
 +            if (v != NULL)
 +            {
 +                inc_nrnb(nrnb,eNR_CONSTR_V,nsettle*3);
 +            }
 +            if (vir != NULL)
 +            {
 +                inc_nrnb(nrnb,eNR_CONSTR_VIR,nsettle*3);
 +            }
 +            break;
 +        case econqVeloc:
 +        case econqDeriv:
 +        case econqForce:
 +        case econqForceDispl:
 +#pragma omp parallel for num_threads(nth) schedule(static)
 +            for(th=0; th<nth; th++)
 +            {
 +                int start_th,end_th;
 +
 +                if (th > 0)
 +                {
-                                 th == 0 ? rmdr : constr->rmdr_th[th],
++                    clear_mat(constr->vir_r_m_dr_th[th]);
 +                }
 +                
 +                start_th = (nsettle* th   )/nth;
 +                end_th   = (nsettle*(th+1))/nth;
 +
 +                if (start_th >= 0 && end_th - start_th > 0)
 +                {
 +                    settle_proj(fplog,constr->settled,econq,
 +                                end_th-start_th,
 +                                settle->iatoms+start_th*(1+NRAL(F_SETTLE)),
 +                                pbc_null,
 +                                x,
 +                                xprime,min_proj,calcvir_atom_end,
-             m_add(rmdr,constr->rmdr_th[i],rmdr);
++                                th == 0 ? vir_r_m_dr : constr->vir_r_m_dr_th[th],
 +                                &vetavar);
 +                }
 +            }
 +            /* This is an overestimate */
 +            inc_nrnb(nrnb,eNR_SETTLE,nsettle);
 +            break;
 +        case econqDeriv_FlexCon:
 +            /* Nothing to do, since there are no flexible constraints in settles */
 +            break;
 +        default:
 +            gmx_incons("Unknown constraint quantity for settle");
 +        }
 +    }
 +
 +    if (settle->nr > 0)
 +    {
 +        /* Combine virial and error info of the other threads */
 +        for(i=1; i<nth; i++)
 +        {
-                 (*vir)[i][j] = vir_fac*rmdr[i][j];
++            m_add(vir_r_m_dr,constr->vir_r_m_dr_th[i],vir_r_m_dr);
 +            settle_error = constr->settle_error[i];
 +        } 
 +
 +        if (econq == econqCoord && settle_error >= 0)
 +        {
 +            bOK = FALSE;
 +            if (constr->maxwarn >= 0)
 +            {
 +                char buf[256];
 +                sprintf(buf,
 +                        "\nstep " gmx_large_int_pfmt ": Water molecule starting at atom %d can not be "
 +                        "settled.\nCheck for bad contacts and/or reduce the timestep if appropriate.\n",
 +                        step,ddglatnr(cr->dd,settle->iatoms[settle_error*(1+NRAL(F_SETTLE))+1]));
 +                if (fplog)
 +                {
 +                    fprintf(fplog,"%s",buf);
 +                }
 +                fprintf(stderr,"%s",buf);
 +                constr->warncount_settle++;
 +                if (constr->warncount_settle > constr->maxwarn)
 +                {
 +                    too_many_constraint_warnings(-1,constr->warncount_settle);
 +                }
 +                bDump = TRUE;
 +            }
 +        }
 +    }
 +        
 +    free_vetavars(&vetavar);
 +    
 +    if (vir != NULL)
 +    {
 +        switch (econq)
 +        {
 +        case econqCoord:
 +            vir_fac = 0.5/(ir->delta_t*ir->delta_t);
 +            break;
 +        case econqVeloc:
 +            vir_fac = 0.5/ir->delta_t;
 +            break;
 +        case econqForce:
 +        case econqForceDispl:
 +            vir_fac = 0.5;
 +            break;
 +        default:
 +            vir_fac = 0;
 +            gmx_incons("Unsupported constraint quantity for virial");
 +        }
 +        
 +        if (EI_VV(ir->eI))
 +        {
 +            vir_fac *= 2;  /* only constraining over half the distance here */
 +        }
 +        for(i=0; i<DIM; i++)
 +        {
 +            for(j=0; j<DIM; j++)
 +            {
++                (*vir)[i][j] = vir_fac*vir_r_m_dr[i][j];
 +            }
 +        }
 +    }
 +    
 +    if (bDump)
 +    {
 +        dump_confs(fplog,step,constr->warn_mtop,start,homenr,cr,x,xprime,box);
 +    }
 +    
 +    if (econq == econqCoord)
 +    {
 +        if (ir->ePull == epullCONSTRAINT)
 +        {
 +            if (EI_DYNAMICS(ir->eI))
 +            {
 +                t = ir->init_t + (step + delta_step)*ir->delta_t;
 +            }
 +            else
 +            {
 +                t = ir->init_t;
 +            }
 +            set_pbc(&pbc,ir->ePBC,box);
 +            pull_constraint(ir->pull,md,&pbc,cr,ir->delta_t,t,x,xprime,v,*vir);
 +        }
 +        if (constr->ed && delta_step > 0)
 +        {
 +            /* apply the essential dynamics constraints here */
 +            do_edsam(ir,step,md,cr,xprime,v,box,constr->ed);
 +        }
 +    }
 +    
 +    return bOK;
 +}
 +
 +real *constr_rmsd_data(struct gmx_constr *constr)
 +{
 +  if (constr->lincsd)
 +    return lincs_rmsd_data(constr->lincsd);
 +  else
 +    return NULL;
 +}
 +
 +real constr_rmsd(struct gmx_constr *constr,gmx_bool bSD2)
 +{
 +  if (constr->lincsd)
 +    return lincs_rmsd(constr->lincsd,bSD2);
 +  else
 +    return 0;
 +}
 +
 +static void make_shake_sblock_pd(struct gmx_constr *constr,
 +                               t_idef *idef,t_mdatoms *md)
 +{
 +  int  i,j,m,ncons;
 +  int  bstart,bnr;
 +  t_blocka    sblocks;
 +  t_sortblock *sb;
 +  t_iatom     *iatom;
 +  atom_id     *inv_sblock;
 +
 +  /* Since we are processing the local topology,
 +   * the F_CONSTRNC ilist has been concatenated to the F_CONSTR ilist.
 +   */
 +  ncons = idef->il[F_CONSTR].nr/3;
 +
 +  init_blocka(&sblocks);
 +  gen_sblocks(NULL,md->start,md->start+md->homenr,idef,&sblocks,FALSE);
 +  
 +  /*
 +    bstart=(idef->nodeid > 0) ? blocks->multinr[idef->nodeid-1] : 0;
 +    nblocks=blocks->multinr[idef->nodeid] - bstart;
 +  */
 +  bstart  = 0;
 +  constr->nblocks = sblocks.nr;
 +  if (debug) 
 +    fprintf(debug,"ncons: %d, bstart: %d, nblocks: %d\n",
 +          ncons,bstart,constr->nblocks);
 +  
 +  /* Calculate block number for each atom */
 +  inv_sblock = make_invblocka(&sblocks,md->nr);
 +  
 +  done_blocka(&sblocks);
 +  
 +  /* Store the block number in temp array and
 +   * sort the constraints in order of the sblock number 
 +   * and the atom numbers, really sorting a segment of the array!
 +   */
 +#ifdef DEBUGIDEF 
 +  pr_idef(fplog,0,"Before Sort",idef);
 +#endif
 +  iatom=idef->il[F_CONSTR].iatoms;
 +  snew(sb,ncons);
 +  for(i=0; (i<ncons); i++,iatom+=3) {
 +    for(m=0; (m<3); m++)
 +      sb[i].iatom[m] = iatom[m];
 +    sb[i].blocknr = inv_sblock[iatom[1]];
 +  }
 +  
 +  /* Now sort the blocks */
 +  if (debug) {
 +    pr_sortblock(debug,"Before sorting",ncons,sb);
 +    fprintf(debug,"Going to sort constraints\n");
 +  }
 +  
 +  qsort(sb,ncons,(size_t)sizeof(*sb),pcomp);
 +  
 +  if (debug) {
 +    pr_sortblock(debug,"After sorting",ncons,sb);
 +  }
 +  
 +  iatom=idef->il[F_CONSTR].iatoms;
 +  for(i=0; (i<ncons); i++,iatom+=3) 
 +    for(m=0; (m<3); m++)
 +      iatom[m]=sb[i].iatom[m];
 +#ifdef DEBUGIDEF
 +  pr_idef(fplog,0,"After Sort",idef);
 +#endif
 +  
 +  j=0;
 +  snew(constr->sblock,constr->nblocks+1);
 +  bnr=-2;
 +  for(i=0; (i<ncons); i++) {
 +    if (sb[i].blocknr != bnr) {
 +      bnr=sb[i].blocknr;
 +      constr->sblock[j++]=3*i;
 +    }
 +  }
 +  /* Last block... */
 +  constr->sblock[j++] = 3*ncons;
 +  
 +  if (j != (constr->nblocks+1)) {
 +    fprintf(stderr,"bstart: %d\n",bstart);
 +    fprintf(stderr,"j: %d, nblocks: %d, ncons: %d\n",
 +          j,constr->nblocks,ncons);
 +    for(i=0; (i<ncons); i++)
 +      fprintf(stderr,"i: %5d  sb[i].blocknr: %5u\n",i,sb[i].blocknr);
 +    for(j=0; (j<=constr->nblocks); j++)
 +      fprintf(stderr,"sblock[%3d]=%5d\n",j,(int)constr->sblock[j]);
 +    gmx_fatal(FARGS,"DEATH HORROR: "
 +            "sblocks does not match idef->il[F_CONSTR]");
 +  }
 +  sfree(sb);
 +  sfree(inv_sblock);
 +}
 +
 +static void make_shake_sblock_dd(struct gmx_constr *constr,
 +                               t_ilist *ilcon,t_block *cgs,
 +                               gmx_domdec_t *dd)
 +{
 +  int ncons,c,cg;
 +  t_iatom *iatom;
 +
 +  if (dd->ncg_home+1 > constr->sblock_nalloc) {
 +    constr->sblock_nalloc = over_alloc_dd(dd->ncg_home+1);
 +    srenew(constr->sblock,constr->sblock_nalloc);
 +  }
 +  
 +  ncons = ilcon->nr/3;
 +  iatom = ilcon->iatoms;
 +  constr->nblocks = 0;
 +  cg = 0;
 +  for(c=0; c<ncons; c++) {
 +    if (c == 0 || iatom[1] >= cgs->index[cg+1]) {
 +      constr->sblock[constr->nblocks++] = 3*c;
 +      while (iatom[1] >= cgs->index[cg+1])
 +      cg++;
 +    }
 +    iatom += 3;
 +  }
 +  constr->sblock[constr->nblocks] = 3*ncons;
 +}
 +
 +t_blocka make_at2con(int start,int natoms,
 +                   t_ilist *ilist,t_iparams *iparams,
 +                   gmx_bool bDynamics,int *nflexiblecons)
 +{
 +  int *count,ncon,con,con_tot,nflexcon,ftype,i,a;
 +  t_iatom  *ia;
 +  t_blocka at2con;
 +  gmx_bool bFlexCon;
 +  
 +  snew(count,natoms);
 +  nflexcon = 0;
 +  for(ftype=F_CONSTR; ftype<=F_CONSTRNC; ftype++) {
 +    ncon = ilist[ftype].nr/3;
 +    ia   = ilist[ftype].iatoms;
 +    for(con=0; con<ncon; con++) {
 +      bFlexCon = (iparams[ia[0]].constr.dA == 0 &&
 +                iparams[ia[0]].constr.dB == 0);
 +      if (bFlexCon)
 +      nflexcon++;
 +      if (bDynamics || !bFlexCon) {
 +      for(i=1; i<3; i++) {
 +        a = ia[i] - start;
 +        count[a]++;
 +      }
 +      }
 +      ia += 3;
 +    }
 +  }
 +  *nflexiblecons = nflexcon;
 +
 +  at2con.nr = natoms;
 +  at2con.nalloc_index = at2con.nr+1;
 +  snew(at2con.index,at2con.nalloc_index);
 +  at2con.index[0] = 0;
 +  for(a=0; a<natoms; a++) {
 +    at2con.index[a+1] = at2con.index[a] + count[a];
 +    count[a] = 0;
 +  }
 +  at2con.nra = at2con.index[natoms];
 +  at2con.nalloc_a = at2con.nra;
 +  snew(at2con.a,at2con.nalloc_a);
 +
 +  /* The F_CONSTRNC constraints have constraint numbers
 +   * that continue after the last F_CONSTR constraint.
 +   */
 +  con_tot = 0;
 +  for(ftype=F_CONSTR; ftype<=F_CONSTRNC; ftype++) {
 +    ncon = ilist[ftype].nr/3;
 +    ia   = ilist[ftype].iatoms;
 +    for(con=0; con<ncon; con++) {
 +      bFlexCon = (iparams[ia[0]].constr.dA == 0 &&
 +                iparams[ia[0]].constr.dB == 0);
 +      if (bDynamics || !bFlexCon) {
 +      for(i=1; i<3; i++) {
 +        a = ia[i] - start;
 +        at2con.a[at2con.index[a]+count[a]++] = con_tot;
 +      }
 +      }
 +      con_tot++;
 +      ia += 3;
 +    }
 +  }
 +  
 +  sfree(count);
 +
 +  return at2con;
 +}
 +
 +static int *make_at2settle(int natoms,const t_ilist *ilist)
 +{
 +    int *at2s;
 +    int a,stride,s;
 +  
 +    snew(at2s,natoms);
 +    /* Set all to no settle */
 +    for(a=0; a<natoms; a++)
 +    {
 +        at2s[a] = -1;
 +    }
 +
 +    stride = 1 + NRAL(F_SETTLE);
 +
 +    for(s=0; s<ilist->nr; s+=stride)
 +    {
 +        at2s[ilist->iatoms[s+1]] = s/stride;
 +        at2s[ilist->iatoms[s+2]] = s/stride;
 +        at2s[ilist->iatoms[s+3]] = s/stride;
 +    }
 +
 +    return at2s;
 +}
 +
 +void set_constraints(struct gmx_constr *constr,
 +                     gmx_localtop_t *top,t_inputrec *ir,
 +                     t_mdatoms *md,t_commrec *cr)
 +{
 +    t_idef *idef;
 +    int    ncons;
 +    t_ilist *settle;
 +    int    iO,iH;
 +    
 +    idef = &top->idef;
 +       
 +    if (constr->ncon_tot > 0)
 +    {
 +        /* We are using the local topology,
 +         * so there are only F_CONSTR constraints.
 +         */
 +        ncons = idef->il[F_CONSTR].nr/3;
 +        
 +        /* With DD we might also need to call LINCS with ncons=0 for
 +         * communicating coordinates to other nodes that do have constraints.
 +         */
 +        if (ir->eConstrAlg == econtLINCS)
 +        {
 +            set_lincs(idef,md,EI_DYNAMICS(ir->eI),cr,constr->lincsd);
 +        }
 +        if (ir->eConstrAlg == econtSHAKE)
 +        {
 +            if (cr->dd)
 +            {
 +                make_shake_sblock_dd(constr,&idef->il[F_CONSTR],&top->cgs,cr->dd);
 +            }
 +            else
 +            {
 +                make_shake_sblock_pd(constr,idef,md);
 +            }
 +            if (ncons > constr->lagr_nalloc)
 +            {
 +                constr->lagr_nalloc = over_alloc_dd(ncons);
 +                srenew(constr->lagr,constr->lagr_nalloc);
 +            }
 +        }
 +    }
 +
 +    if (idef->il[F_SETTLE].nr > 0 && constr->settled == NULL)
 +    {
 +        settle = &idef->il[F_SETTLE];
 +        iO = settle->iatoms[1];
 +        iH = settle->iatoms[2];
 +        constr->settled =
 +            settle_init(md->massT[iO],md->massT[iH],
 +                        md->invmass[iO],md->invmass[iH],
 +                        idef->iparams[settle->iatoms[0]].settle.doh,
 +                        idef->iparams[settle->iatoms[0]].settle.dhh);
 +    }
 +    
 +    /* Make a selection of the local atoms for essential dynamics */
 +    if (constr->ed && cr->dd)
 +    {
 +        dd_make_local_ed_indices(cr->dd,constr->ed);
 +    }
 +}
 +
 +static void constr_recur(t_blocka *at2con,
 +                       t_ilist *ilist,t_iparams *iparams,gmx_bool bTopB,
 +                       int at,int depth,int nc,int *path,
 +                       real r0,real r1,real *r2max,
 +                       int *count)
 +{
 +  int  ncon1;
 +  t_iatom *ia1,*ia2;
 +  int  c,con,a1;
 +  gmx_bool bUse;
 +  t_iatom *ia;
 +  real len,rn0,rn1;
 +
 +  (*count)++;
 +
 +  ncon1 = ilist[F_CONSTR].nr/3;
 +  ia1   = ilist[F_CONSTR].iatoms;
 +  ia2   = ilist[F_CONSTRNC].iatoms;
 +
 +  /* Loop over all constraints connected to this atom */
 +  for(c=at2con->index[at]; c<at2con->index[at+1]; c++) {
 +    con = at2con->a[c];
 +    /* Do not walk over already used constraints */
 +    bUse = TRUE;
 +    for(a1=0; a1<depth; a1++) {
 +      if (con == path[a1])
 +      bUse = FALSE;
 +    }
 +    if (bUse) {
 +      ia = constr_iatomptr(ncon1,ia1,ia2,con);
 +      /* Flexible constraints currently have length 0, which is incorrect */
 +      if (!bTopB)
 +      len = iparams[ia[0]].constr.dA;
 +      else
 +      len = iparams[ia[0]].constr.dB;
 +      /* In the worst case the bond directions alternate */
 +      if (nc % 2 == 0) {
 +      rn0 = r0 + len;
 +      rn1 = r1;
 +      } else {
 +      rn0 = r0;
 +      rn1 = r1 + len;
 +      }
 +      /* Assume angles of 120 degrees between all bonds */
 +      if (rn0*rn0 + rn1*rn1 + rn0*rn1 > *r2max) {
 +      *r2max = rn0*rn0 + rn1*rn1 + r0*rn1;
 +      if (debug) {
 +        fprintf(debug,"Found longer constraint distance: r0 %5.3f r1 %5.3f rmax %5.3f\n", rn0,rn1,sqrt(*r2max));
 +        for(a1=0; a1<depth; a1++)
 +          fprintf(debug," %d %5.3f",
 +                  path[a1],
 +                  iparams[constr_iatomptr(ncon1,ia1,ia2,con)[0]].constr.dA);
 +        fprintf(debug," %d %5.3f\n",con,len);
 +      }
 +      }
 +      /* Limit the number of recursions to 1000*nc,
 +       * so a call does not take more than a second,
 +       * even for highly connected systems.
 +       */
 +      if (depth + 1 < nc && *count < 1000*nc) {
 +      if (ia[1] == at)
 +        a1 = ia[2];
 +      else
 +        a1 = ia[1];
 +      /* Recursion */
 +      path[depth] = con;
 +      constr_recur(at2con,ilist,iparams,
 +                   bTopB,a1,depth+1,nc,path,rn0,rn1,r2max,count);
 +      path[depth] = -1;
 +      }
 +    }
 +  }
 +}
 +
 +static real constr_r_max_moltype(FILE *fplog,
 +                               gmx_moltype_t *molt,t_iparams *iparams,
 +                               t_inputrec *ir)
 +{
 +  int natoms,nflexcon,*path,at,count;
 +
 +  t_blocka at2con;
 +  real r0,r1,r2maxA,r2maxB,rmax,lam0,lam1;
 +
 +  if (molt->ilist[F_CONSTR].nr   == 0 &&
 +      molt->ilist[F_CONSTRNC].nr == 0) {
 +    return 0;
 +  }
 +  
 +  natoms = molt->atoms.nr;
 +
 +  at2con = make_at2con(0,natoms,molt->ilist,iparams,
 +                     EI_DYNAMICS(ir->eI),&nflexcon);
 +  snew(path,1+ir->nProjOrder);
 +  for(at=0; at<1+ir->nProjOrder; at++)
 +    path[at] = -1;
 +
 +  r2maxA = 0;
 +  for(at=0; at<natoms; at++) {
 +    r0 = 0;
 +    r1 = 0;
 +
 +    count = 0;
 +    constr_recur(&at2con,molt->ilist,iparams,
 +               FALSE,at,0,1+ir->nProjOrder,path,r0,r1,&r2maxA,&count);
 +  }
 +  if (ir->efep == efepNO) {
 +    rmax = sqrt(r2maxA);
 +  } else {
 +    r2maxB = 0;
 +    for(at=0; at<natoms; at++) {
 +      r0 = 0;
 +      r1 = 0;
 +      count = 0;
 +      constr_recur(&at2con,molt->ilist,iparams,
 +                 TRUE,at,0,1+ir->nProjOrder,path,r0,r1,&r2maxB,&count);
 +    }
 +    lam0 = ir->fepvals->init_lambda;
 +    if (EI_DYNAMICS(ir->eI))
 +      lam0 += ir->init_step*ir->fepvals->delta_lambda;
 +    rmax = (1 - lam0)*sqrt(r2maxA) + lam0*sqrt(r2maxB);
 +    if (EI_DYNAMICS(ir->eI)) {
 +      lam1 = ir->fepvals->init_lambda + (ir->init_step + ir->nsteps)*ir->fepvals->delta_lambda;
 +      rmax = max(rmax,(1 - lam1)*sqrt(r2maxA) + lam1*sqrt(r2maxB));
 +    }
 +  }
 +
 +  done_blocka(&at2con);
 +  sfree(path);
 +
 +  return rmax;
 +}
 +
 +real constr_r_max(FILE *fplog,gmx_mtop_t *mtop,t_inputrec *ir)
 +{
 +  int mt;
 +  real rmax;
 +
 +  rmax = 0;
 +  for(mt=0; mt<mtop->nmoltype; mt++) {
 +    rmax = max(rmax,
 +             constr_r_max_moltype(fplog,&mtop->moltype[mt],
 +                                  mtop->ffparams.iparams,ir));
 +  }
 +  
 +  if (fplog)
 +    fprintf(fplog,"Maximum distance for %d constraints, at 120 deg. angles, all-trans: %.3f nm\n",1+ir->nProjOrder,rmax);
 +
 +  return rmax;
 +}
 +
 +gmx_constr_t init_constraints(FILE *fplog,
 +                              gmx_mtop_t *mtop,t_inputrec *ir,
 +                              gmx_edsam_t ed,t_state *state,
 +                              t_commrec *cr)
 +{
 +    int  ncon,nset,nmol,settle_type,i,natoms,mt,nflexcon;
 +    struct gmx_constr *constr;
 +    char *env;
 +    t_ilist *ilist;
 +    gmx_mtop_ilistloop_t iloop;
 +    
 +    ncon =
 +        gmx_mtop_ftype_count(mtop,F_CONSTR) +
 +        gmx_mtop_ftype_count(mtop,F_CONSTRNC);
 +    nset = gmx_mtop_ftype_count(mtop,F_SETTLE);
 +    
 +    if (ncon+nset == 0 && ir->ePull != epullCONSTRAINT && ed == NULL) 
 +    {
 +        return NULL;
 +    }
 +    
 +    snew(constr,1);
 +    
 +    constr->ncon_tot = ncon;
 +    constr->nflexcon = 0;
 +    if (ncon > 0) 
 +    {
 +        constr->n_at2con_mt = mtop->nmoltype;
 +        snew(constr->at2con_mt,constr->n_at2con_mt);
 +        for(mt=0; mt<mtop->nmoltype; mt++) 
 +        {
 +            constr->at2con_mt[mt] = make_at2con(0,mtop->moltype[mt].atoms.nr,
 +                                                mtop->moltype[mt].ilist,
 +                                                mtop->ffparams.iparams,
 +                                                EI_DYNAMICS(ir->eI),&nflexcon);
 +            for(i=0; i<mtop->nmolblock; i++) 
 +            {
 +                if (mtop->molblock[i].type == mt) 
 +                {
 +                    constr->nflexcon += mtop->molblock[i].nmol*nflexcon;
 +                }
 +            }
 +        }
 +        
 +        if (constr->nflexcon > 0) 
 +        {
 +            if (fplog) 
 +            {
 +                fprintf(fplog,"There are %d flexible constraints\n",
 +                        constr->nflexcon);
 +                if (ir->fc_stepsize == 0) 
 +                {
 +                    fprintf(fplog,"\n"
 +                            "WARNING: step size for flexible constraining = 0\n"
 +                            "         All flexible constraints will be rigid.\n"
 +                            "         Will try to keep all flexible constraints at their original length,\n"
 +                            "         but the lengths may exhibit some drift.\n\n");
 +                    constr->nflexcon = 0;
 +                }
 +            }
 +            if (constr->nflexcon > 0) 
 +            {
 +                please_cite(fplog,"Hess2002");
 +            }
 +        }
 +        
 +        if (ir->eConstrAlg == econtLINCS) 
 +        {
 +            constr->lincsd = init_lincs(fplog,mtop,
 +                                        constr->nflexcon,constr->at2con_mt,
 +                                        DOMAINDECOMP(cr) && cr->dd->bInterCGcons,
 +                                        ir->nLincsIter,ir->nProjOrder);
 +        }
 +        
 +        if (ir->eConstrAlg == econtSHAKE) {
 +            if (DOMAINDECOMP(cr) && cr->dd->bInterCGcons)
 +            {
 +                gmx_fatal(FARGS,"SHAKE is not supported with domain decomposition and constraint that cross charge group boundaries, use LINCS");
 +            }
 +            if (constr->nflexcon) 
 +            {
 +                gmx_fatal(FARGS,"For this system also velocities and/or forces need to be constrained, this can not be done with SHAKE, you should select LINCS");
 +            }
 +            please_cite(fplog,"Ryckaert77a");
 +            if (ir->bShakeSOR) 
 +            {
 +                please_cite(fplog,"Barth95a");
 +            }
 +
 +            constr->shaked = shake_init();
 +        }
 +    }
 +  
 +    if (nset > 0) {
 +        please_cite(fplog,"Miyamoto92a");
 +
 +        constr->bInterCGsettles = inter_charge_group_settles(mtop);
 +
 +        /* Check that we have only one settle type */
 +        settle_type = -1;
 +        iloop = gmx_mtop_ilistloop_init(mtop);
 +        while (gmx_mtop_ilistloop_next(iloop,&ilist,&nmol)) 
 +        {
 +            for (i=0; i<ilist[F_SETTLE].nr; i+=4) 
 +            {
 +                if (settle_type == -1) 
 +                {
 +                    settle_type = ilist[F_SETTLE].iatoms[i];
 +                } 
 +                else if (ilist[F_SETTLE].iatoms[i] != settle_type) 
 +                {
 +                    gmx_fatal(FARGS,
 +                              "The [molecules] section of your topology specifies more than one block of\n"
 +                              "a [moleculetype] with a [settles] block. Only one such is allowed. If you\n"
 +                              "are trying to partition your solvent into different *groups* (e.g. for\n"
 +                              "freezing, T-coupling, etc.) then you are using the wrong approach. Index\n"
 +                              "files specify groups. Otherwise, you may wish to change the least-used\n"
 +                              "block of molecules with SETTLE constraints into 3 normal constraints.");
 +                }
 +            }
 +        }
 +
 +        constr->n_at2settle_mt = mtop->nmoltype;
 +        snew(constr->at2settle_mt,constr->n_at2settle_mt);
 +        for(mt=0; mt<mtop->nmoltype; mt++) 
 +        {
 +            constr->at2settle_mt[mt] =
 +                make_at2settle(mtop->moltype[mt].atoms.nr,
 +                               &mtop->moltype[mt].ilist[F_SETTLE]);
 +        }
 +    }
 +    
 +    constr->maxwarn = 999;
 +    env = getenv("GMX_MAXCONSTRWARN");
 +    if (env) 
 +    {
 +        constr->maxwarn = 0;
 +        sscanf(env,"%d",&constr->maxwarn);
 +        if (fplog) 
 +        {
 +            fprintf(fplog,
 +                    "Setting the maximum number of constraint warnings to %d\n",
 +                    constr->maxwarn);
 +        }
 +        if (MASTER(cr)) 
 +        {
 +            fprintf(stderr,
 +                    "Setting the maximum number of constraint warnings to %d\n",
 +                    constr->maxwarn);
 +        }
 +    }
 +    if (constr->maxwarn < 0 && fplog) 
 +    {
 +        fprintf(fplog,"maxwarn < 0, will not stop on constraint errors\n");
 +    }
 +    constr->warncount_lincs  = 0;
 +    constr->warncount_settle = 0;
 +    
 +    /* Initialize the essential dynamics sampling.
 +     * Put the pointer to the ED struct in constr */
 +    constr->ed = ed;
 +    if (ed != NULL) 
 +    {
 +        init_edsam(mtop,ir,cr,ed,state->x,state->box);
 +    }
 +    
 +    constr->warn_mtop = mtop;
 +    
 +    return constr;
 +}
 +
 +const t_blocka *atom2constraints_moltype(gmx_constr_t constr)
 +{
 +  return constr->at2con_mt;
 +}
 +
 +const int **atom2settle_moltype(gmx_constr_t constr)
 +{
 +    return (const int **)constr->at2settle_mt;
 +}
 +
 +
 +gmx_bool inter_charge_group_constraints(const gmx_mtop_t *mtop)
 +{
 +    const gmx_moltype_t *molt;
 +    const t_block *cgs;
 +    const t_ilist *il;
 +    int  mb;
 +    int  nat,*at2cg,cg,a,ftype,i;
 +    gmx_bool bInterCG;
 +
 +    bInterCG = FALSE;
 +    for(mb=0; mb<mtop->nmolblock && !bInterCG; mb++)
 +    {
 +        molt = &mtop->moltype[mtop->molblock[mb].type];
 +
 +        if (molt->ilist[F_CONSTR].nr   > 0 ||
 +            molt->ilist[F_CONSTRNC].nr > 0 ||
 +            molt->ilist[F_SETTLE].nr > 0)
 +        {
 +            cgs  = &molt->cgs;
 +            snew(at2cg,molt->atoms.nr);
 +            for(cg=0; cg<cgs->nr; cg++)
 +            {
 +                for(a=cgs->index[cg]; a<cgs->index[cg+1]; a++)
 +                    at2cg[a] = cg;
 +            }
 +
 +            for(ftype=F_CONSTR; ftype<=F_CONSTRNC; ftype++)
 +            {
 +                il = &molt->ilist[ftype];
 +                for(i=0; i<il->nr && !bInterCG; i+=1+NRAL(ftype))
 +                {
 +                    if (at2cg[il->iatoms[i+1]] != at2cg[il->iatoms[i+2]])
 +                    {
 +                        bInterCG = TRUE;
 +                    }
 +                }
 +            }
 +            
 +            sfree(at2cg);
 +        }
 +    }
 +
 +    return bInterCG;
 +}
 +
 +gmx_bool inter_charge_group_settles(const gmx_mtop_t *mtop)
 +{
 +    const gmx_moltype_t *molt;
 +    const t_block *cgs;
 +    const t_ilist *il;
 +    int  mb;
 +    int  nat,*at2cg,cg,a,ftype,i;
 +    gmx_bool bInterCG;
 +
 +    bInterCG = FALSE;
 +    for(mb=0; mb<mtop->nmolblock && !bInterCG; mb++)
 +    {
 +        molt = &mtop->moltype[mtop->molblock[mb].type];
 +
 +        if (molt->ilist[F_SETTLE].nr > 0)
 +        {
 +            cgs  = &molt->cgs;
 +            snew(at2cg,molt->atoms.nr);
 +            for(cg=0; cg<cgs->nr; cg++)
 +            {
 +                for(a=cgs->index[cg]; a<cgs->index[cg+1]; a++)
 +                    at2cg[a] = cg;
 +            }
 +
 +            for(ftype=F_SETTLE; ftype<=F_SETTLE; ftype++)
 +            {
 +                il = &molt->ilist[ftype];
 +                for(i=0; i<il->nr && !bInterCG; i+=1+NRAL(F_SETTLE))
 +                {
 +                    if (at2cg[il->iatoms[i+1]] != at2cg[il->iatoms[i+2]] ||
 +                        at2cg[il->iatoms[i+1]] != at2cg[il->iatoms[i+3]])
 +                    {
 +                        bInterCG = TRUE;
 +                    }
 +                }
 +            }       
 +            
 +            sfree(at2cg);
 +        }
 +    }
 +
 +    return bInterCG;
 +}
 +
 +/* helper functions for andersen temperature control, because the
 + * gmx_constr construct is only defined in constr.c. Return the list
 + * of blocks (get_sblock) and the number of blocks (get_nblocks).  */
 +
 +extern int *get_sblock(struct gmx_constr *constr)
 +{
 +    return constr->sblock;
 +}
 +
 +extern int get_nblocks(struct gmx_constr *constr)
 +{
 +    return constr->nblocks;
 +}
Simple merge
index 80c4b5c14c29e24244462c4da999dc461e2585e6,0000000000000000000000000000000000000000..d81dd4afbb72bdd539dfcb67e9fbd06b0966dbe7
mode 100644,000000..100644
--- /dev/null
@@@ -1,9544 -1,0 +1,9550 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of Gromacs        Copyright (c) 1991-2008
 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <time.h>
 +#include <math.h>
 +#include <string.h>
 +#include <stdlib.h>
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "vec.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "nrnb.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "constr.h"
 +#include "mdatoms.h"
 +#include "names.h"
 +#include "pdbio.h"
 +#include "futil.h"
 +#include "force.h"
 +#include "pme.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "gmx_wallcycle.h"
 +#include "mdrun.h"
 +#include "nsgrid.h"
 +#include "shellfc.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +#include "gmx_ga2la.h"
 +#include "gmx_sort.h"
 +#include "macros.h"
 +#include "nbnxn_search.h"
 +#include "bondf.h"
 +#include "gmx_omp_nthreads.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#define DDRANK(dd,rank)    (rank)
 +#define DDMASTERRANK(dd)   (dd->masterrank)
 +
 +typedef struct gmx_domdec_master
 +{
 +    /* The cell boundaries */
 +    real **cell_x;
 +    /* The global charge group division */
 +    int  *ncg;     /* Number of home charge groups for each node */
 +    int  *index;   /* Index of nnodes+1 into cg */
 +    int  *cg;      /* Global charge group index */
 +    int  *nat;     /* Number of home atoms for each node. */
 +    int  *ibuf;    /* Buffer for communication */
 +    rvec *vbuf;    /* Buffer for state scattering and gathering */
 +} gmx_domdec_master_t;
 +
 +typedef struct
 +{
 +    /* The numbers of charge groups to send and receive for each cell
 +     * that requires communication, the last entry contains the total
 +     * number of atoms that needs to be communicated.
 +     */
 +    int nsend[DD_MAXIZONE+2];
 +    int nrecv[DD_MAXIZONE+2];
 +    /* The charge groups to send */
 +    int *index;
 +    int nalloc;
 +    /* The atom range for non-in-place communication */
 +    int cell2at0[DD_MAXIZONE];
 +    int cell2at1[DD_MAXIZONE];
 +} gmx_domdec_ind_t;
 +
 +typedef struct
 +{
 +    int  np;                   /* Number of grid pulses in this dimension */
 +    int  np_dlb;               /* For dlb, for use with edlbAUTO          */
 +    gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
 +    int  np_nalloc;
 +    gmx_bool bInPlace;             /* Can we communicate in place?            */
 +} gmx_domdec_comm_dim_t;
 +
 +typedef struct
 +{
 +    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 +    real *cell_f;      /* State var.: cell boundaries, box relative      */
 +    real *old_cell_f;  /* Temp. var.: old cell size                      */
 +    real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 +    real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 +    real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 +    real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 +    gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
 +    real *buf_ncd;     /* Temp. var.                                     */
 +} gmx_domdec_root_t;
 +
 +#define DD_NLOAD_MAX 9
 +
 +/* Here floats are accurate enough, since these variables
 + * only influence the load balancing, not the actual MD results.
 + */
 +typedef struct
 +{
 +    int  nload;
 +    float *load;
 +    float sum;
 +    float max;
 +    float sum_m;
 +    float cvol_min;
 +    float mdf;
 +    float pme;
 +    int   flags;
 +} gmx_domdec_load_t;
 +
 +typedef struct
 +{
 +    int  nsc;
 +    int  ind_gl;
 +    int  ind;
 +} gmx_cgsort_t;
 +
 +typedef struct
 +{
 +    gmx_cgsort_t *sort;
 +    gmx_cgsort_t *sort2;
 +    int  sort_nalloc;
 +    gmx_cgsort_t *sort_new;
 +    int  sort_new_nalloc;
 +    int  *ibuf;
 +    int  ibuf_nalloc;
 +} gmx_domdec_sort_t;
 +
 +typedef struct
 +{
 +    rvec *v;
 +    int  nalloc;
 +} vec_rvec_t;
 +
 +/* This enum determines the order of the coordinates.
 + * ddnatHOME and ddnatZONE should be first and second,
 + * the others can be ordered as wanted.
 + */
 +enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
 +
 +enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
 +const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 +
 +typedef struct
 +{
 +    int  dim;      /* The dimension                                          */
 +    gmx_bool dim_match;/* Tells if DD and PME dims match                         */
 +    int  nslab;    /* The number of PME slabs in this dimension              */
 +    real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 +    int  *pp_min;  /* The minimum pp node location, size nslab               */
 +    int  *pp_max;  /* The maximum pp node location,size nslab                */
 +    int  maxshift; /* The maximum shift for coordinate redistribution in PME */
 +} gmx_ddpme_t;
 +
 +typedef struct
 +{
 +    real min0;    /* The minimum bottom of this zone                        */
 +    real max1;    /* The maximum top of this zone                           */
 +    real min1;    /* The minimum top of this zone                           */
 +    real mch0;    /* The maximum bottom communicaton height for this zone   */
 +    real mch1;    /* The maximum top communicaton height for this zone      */
 +    real p1_0;    /* The bottom value of the first cell in this zone        */
 +    real p1_1;    /* The top value of the first cell in this zone           */
 +} gmx_ddzone_t;
 +
 +typedef struct
 +{
 +    gmx_domdec_ind_t ind;
 +    int *ibuf;
 +    int ibuf_nalloc;
 +    vec_rvec_t vbuf;
 +    int nsend;
 +    int nat;
 +    int nsend_zone;
 +} dd_comm_setup_work_t;
 +
 +typedef struct gmx_domdec_comm
 +{
 +    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 +     * unless stated otherwise.
 +     */
 +
 +    /* The number of decomposition dimensions for PME, 0: no PME */
 +    int  npmedecompdim;
 +    /* The number of nodes doing PME (PP/PME or only PME) */
 +    int  npmenodes;
 +    int  npmenodes_x;
 +    int  npmenodes_y;
 +    /* The communication setup including the PME only nodes */
 +    gmx_bool bCartesianPP_PME;
 +    ivec ntot;
 +    int  cartpmedim;
 +    int  *pmenodes;          /* size npmenodes                         */
 +    int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 +                              * but with bCartesianPP_PME              */
 +    gmx_ddpme_t ddpme[2];
 +    
 +    /* The DD particle-particle nodes only */
 +    gmx_bool bCartesianPP;
 +    int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 +    
 +    /* The global charge groups */
 +    t_block cgs_gl;
 +
 +    /* Should we sort the cgs */
 +    int  nstSortCG;
 +    gmx_domdec_sort_t *sort;
 +    
 +    /* Are there charge groups? */
 +    gmx_bool bCGs;
 +
 +    /* Are there bonded and multi-body interactions between charge groups? */
 +    gmx_bool bInterCGBondeds;
 +    gmx_bool bInterCGMultiBody;
 +
 +    /* Data for the optional bonded interaction atom communication range */
 +    gmx_bool bBondComm;
 +    t_blocka *cglink;
 +    char *bLocalCG;
 +
 +    /* The DLB option */
 +    int  eDLB;
 +    /* Are we actually using DLB? */
 +    gmx_bool bDynLoadBal;
 +
 +    /* Cell sizes for static load balancing, first index cartesian */
 +    real **slb_frac;
 +    
 +    /* The width of the communicated boundaries */
 +    real cutoff_mbody;
 +    real cutoff;
 +    /* The minimum cell size (including triclinic correction) */
 +    rvec cellsize_min;
 +    /* For dlb, for use with edlbAUTO */
 +    rvec cellsize_min_dlb;
 +    /* The lower limit for the DD cell size with DLB */
 +    real cellsize_limit;
 +    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 +    gmx_bool bVacDLBNoLimit;
 +
 +    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 +    ivec tric_dir;
 +    /* box0 and box_size are required with dim's without pbc and -gcom */
 +    rvec box0;
 +    rvec box_size;
 +    
 +    /* The cell boundaries */
 +    rvec cell_x0;
 +    rvec cell_x1;
 +
 +    /* The old location of the cell boundaries, to check cg displacements */
 +    rvec old_cell_x0;
 +    rvec old_cell_x1;
 +
 +    /* The communication setup and charge group boundaries for the zones */
 +    gmx_domdec_zones_t zones;
 +    
 +    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 +     * cell boundaries of neighboring cells for dynamic load balancing.
 +     */
 +    gmx_ddzone_t zone_d1[2];
 +    gmx_ddzone_t zone_d2[2][2];
 +    
 +    /* The coordinate/force communication setup and indices */
 +    gmx_domdec_comm_dim_t cd[DIM];
 +    /* The maximum number of cells to communicate with in one dimension */
 +    int  maxpulse;
 +    
 +    /* Which cg distribution is stored on the master node */
 +    int master_cg_ddp_count;
 +    
 +    /* The number of cg's received from the direct neighbors */
 +    int  zone_ncg1[DD_MAXZONE];
 +    
 +    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 +    int  nat[ddnatNR];
 +
 +    /* Array for signalling if atoms have moved to another domain */
 +    int  *moved;
 +    int  moved_nalloc;
 +    
 +    /* Communication buffer for general use */
 +    int  *buf_int;
 +    int  nalloc_int;
 +
 +    /* Communication buffer for general use */
 +    vec_rvec_t vbuf;
 +
 +    /* Temporary storage for thread parallel communication setup */
 +    int nth;
 +    dd_comm_setup_work_t *dth;
 +
 +    /* Communication buffers only used with multiple grid pulses */
 +    int  *buf_int2;
 +    int  nalloc_int2;
 +    vec_rvec_t vbuf2;
 +    
 +    /* Communication buffers for local redistribution */
 +    int  **cggl_flag;
 +    int  cggl_flag_nalloc[DIM*2];
 +    rvec **cgcm_state;
 +    int  cgcm_state_nalloc[DIM*2];
 +    
 +    /* Cell sizes for dynamic load balancing */
 +    gmx_domdec_root_t **root;
 +    real *cell_f_row;
 +    real cell_f0[DIM];
 +    real cell_f1[DIM];
 +    real cell_f_max0[DIM];
 +    real cell_f_min1[DIM];
 +    
 +    /* Stuff for load communication */
 +    gmx_bool bRecordLoad;
 +    gmx_domdec_load_t *load;
 +#ifdef GMX_MPI
 +    MPI_Comm *mpi_comm_load;
 +#endif
 +
 +    /* Maximum DLB scaling per load balancing step in percent */
 +    int dlb_scale_lim;
 +
 +    /* Cycle counters */
 +    float cycl[ddCyclNr];
 +    int   cycl_n[ddCyclNr];
 +    float cycl_max[ddCyclNr];
 +    /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 +    int eFlop;
 +    double flop;
 +    int    flop_n;
 +    /* Have often have did we have load measurements */
 +    int    n_load_have;
 +    /* Have often have we collected the load measurements */
 +    int    n_load_collect;
 +    
 +    /* Statistics */
 +    double sum_nat[ddnatNR-ddnatZONE];
 +    int    ndecomp;
 +    int    nload;
 +    double load_step;
 +    double load_sum;
 +    double load_max;
 +    ivec   load_lim;
 +    double load_mdf;
 +    double load_pme;
 +
 +    /* The last partition step */
 +    gmx_large_int_t partition_step;
 +
 +    /* Debugging */
 +    int  nstDDDump;
 +    int  nstDDDumpGrid;
 +    int  DD_debug;
 +} gmx_domdec_comm_t;
 +
 +/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_CGIBS 2
 +
 +/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_FLAG_NRCG  65535
 +#define DD_FLAG_FW(d) (1<<(16+(d)*2))
 +#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 +
 +/* Zone permutation required to obtain consecutive charge groups
 + * for neighbor searching.
 + */
 +static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
 +
 +/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 + * components see only j zones with that component 0.
 + */
 +
 +/* The DD zone order */
 +static const ivec dd_zo[DD_MAXZONE] =
 +  {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
 +
 +/* The 3D setup */
 +#define dd_z3n  8
 +#define dd_zp3n 4
 +static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
 +
 +/* The 2D setup */
 +#define dd_z2n  4
 +#define dd_zp2n 2
 +static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
 +
 +/* The 1D setup */
 +#define dd_z1n  2
 +#define dd_zp1n 1
 +static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
 +
 +/* Factors used to avoid problems due to rounding issues */
 +#define DD_CELL_MARGIN       1.0001
 +#define DD_CELL_MARGIN2      1.00005
 +/* Factor to account for pressure scaling during nstlist steps */
 +#define DD_PRES_SCALE_MARGIN 1.02
 +
 +/* Allowed performance loss before we DLB or warn */
 +#define DD_PERF_LOSS 0.05
 +
 +#define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 +
 +/* Use separate MPI send and receive commands
 + * when nnodes <= GMX_DD_NNODES_SENDRECV.
 + * This saves memory (and some copying for small nnodes).
 + * For high parallelization scatter and gather calls are used.
 + */
 +#define GMX_DD_NNODES_SENDRECV 4
 +
 +
 +/*
 +#define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 +
 +static void index2xyz(ivec nc,int ind,ivec xyz)
 +{
 +  xyz[XX] = ind % nc[XX];
 +  xyz[YY] = (ind / nc[XX]) % nc[YY];
 +  xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 +}
 +*/
 +
 +/* This order is required to minimize the coordinate communication in PME
 + * which uses decomposition in the x direction.
 + */
 +#define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 +
 +static void ddindex2xyz(ivec nc,int ind,ivec xyz)
 +{
 +    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 +    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 +    xyz[ZZ] = ind % nc[ZZ];
 +}
 +
 +static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
 +{
 +    int ddindex;
 +    int ddnodeid=-1;
 +    
 +    ddindex = dd_index(dd->nc,c);
 +    if (dd->comm->bCartesianPP_PME)
 +    {
 +        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 +    }
 +    else if (dd->comm->bCartesianPP)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddnodeid = ddindex;
 +    }
 +    
 +    return ddnodeid;
 +}
 +
 +static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
 +{
 +    return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 +}
 +
 +int ddglatnr(gmx_domdec_t *dd,int i)
 +{
 +    int atnr;
 +    
 +    if (dd == NULL)
 +    {
 +        atnr = i + 1;
 +    }
 +    else
 +    {
 +        if (i >= dd->comm->nat[ddnatNR-1])
 +        {
 +            gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
 +        }
 +        atnr = dd->gatindex[i] + 1;
 +    }
 +    
 +    return atnr;
 +}
 +
 +t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->cgs_gl;
 +}
 +
 +static void vec_rvec_init(vec_rvec_t *v)
 +{
 +    v->nalloc = 0;
 +    v->v      = NULL;
 +}
 +
 +static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
 +{
 +    if (n > v->nalloc)
 +    {
 +        v->nalloc = over_alloc_dd(n);
 +        srenew(v->v,v->nalloc);
 +    }
 +}
 +
 +void dd_store_state(gmx_domdec_t *dd,t_state *state)
 +{
 +    int i;
 +    
 +    if (state->ddp_count != dd->ddp_count)
 +    {
 +        gmx_incons("The state does not the domain decomposition state");
 +    }
 +    
 +    state->ncg_gl = dd->ncg_home;
 +    if (state->ncg_gl > state->cg_gl_nalloc)
 +    {
 +        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 +        srenew(state->cg_gl,state->cg_gl_nalloc);
 +    }
 +    for(i=0; i<state->ncg_gl; i++)
 +    {
 +        state->cg_gl[i] = dd->index_gl[i];
 +    }
 +    
 +    state->ddp_count_cg_gl = dd->ddp_count;
 +}
 +
 +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->zones;
 +}
 +
 +void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
 +                      int *jcg0,int *jcg1,ivec shift0,ivec shift1)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int izone,d,dim;
 +
 +    zones = &dd->comm->zones;
 +
 +    izone = 0;
 +    while (icg >= zones->izone[izone].cg1)
 +    {
 +        izone++;
 +    }
 +    
 +    if (izone == 0)
 +    {
 +        *jcg0 = icg;
 +    }
 +    else if (izone < zones->nizone)
 +    {
 +        *jcg0 = zones->izone[izone].jcg0;
 +    }
 +    else
 +    {
 +        gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
 +                  icg,izone,zones->nizone);
 +    }
 +        
 +    *jcg1 = zones->izone[izone].jcg1;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        shift0[dim] = zones->izone[izone].shift0[dim];
 +        shift1[dim] = zones->izone[izone].shift1[dim];
 +        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 +        {
 +            /* A conservative approach, this can be optimized */
 +            shift0[dim] -= 1;
 +            shift1[dim] += 1;
 +        }
 +    }
 +}
 +
 +int dd_natoms_vsite(gmx_domdec_t *dd)
 +{
 +    return dd->comm->nat[ddnatVSITE];
 +}
 +
 +void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
 +{
 +    *at_start = dd->comm->nat[ddnatCON-1];
 +    *at_end   = dd->comm->nat[ddnatCON];
 +}
 +
 +void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    rvec shift={0,0,0},*buf,*rbuf;
 +    gmx_bool bPBC,bScrew;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +    
 +    buf = comm->vbuf.v;
 +
 +    nzone = 1;
 +    nat_tot = dd->nat_home;
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (bPBC)
 +        {
 +            copy_rvec(box[dd->dim[d]],shift);
 +        }
 +        cd = &comm->cd[d];
 +        for(p=0; p<cd->np; p++)
 +        {
 +            ind = &cd->ind[p];
 +            index = ind->index;
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        copy_rvec(x[j],buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        /* We need to shift the coordinates */
 +                        rvec_add(x[j],shift,buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        /* Shift x */
 +                        buf[n][XX] = x[j][XX] + shift[XX];
 +                        /* Rotate y and z.
 +                         * This operation requires a special shift force
 +                         * treatment, which is performed in calc_vir.
 +                         */
 +                        buf[n][YY] = box[YY][YY] - x[j][YY];
 +                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 +                        n++;
 +                    }
 +                }
 +            }
 +            
 +            if (cd->bInPlace)
 +            {
 +                rbuf = x + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = comm->vbuf2.v;
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_rvec(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(rbuf[j],x[i]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    rvec *buf,*sbuf;
 +    ivec vis;
 +    int  is;
 +    gmx_bool bPBC,bScrew;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    n = 0;
 +    nzone = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for(d=dd->ndim-1; d>=0; d--)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (fshift == NULL && !bScrew)
 +        {
 +            bPBC = FALSE;
 +        }
 +        /* Determine which shift vector we need */
 +        clear_ivec(vis);
 +        vis[dd->dim[d]] = 1;
 +        is = IVEC2IS(vis);
 +        
 +        cd = &comm->cd[d];
 +        for(p=cd->np-1; p>=0; p--) {
 +            ind = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = f + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = comm->vbuf2.v;
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(f[i],sbuf[j]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        rvec_inc(f[j],buf[n]);
 +                        n++;
 +                    }
 +                } 
 +            }
 +            else if (!bScrew)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        rvec_inc(f[j],buf[n]);
 +                        /* Add this force to the shift force */
 +                        rvec_inc(fshift[is],buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        /* Rotate the force */
 +                        f[j][XX] += buf[n][XX];
 +                        f[j][YY] -= buf[n][YY];
 +                        f[j][ZZ] -= buf[n][ZZ];
 +                        if (fshift)
 +                        {
 +                            /* Add this force to the shift force */
 +                            rvec_inc(fshift[is],buf[n]);
 +                        }
 +                        n++;
 +                    }
 +                }
 +            }
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    real *buf,*rbuf;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +    
 +    buf = &comm->vbuf.v[0][0];
 +
 +    nzone = 1;
 +    nat_tot = dd->nat_home;
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        cd = &comm->cd[d];
 +        for(p=0; p<cd->np; p++)
 +        {
 +            ind = &cd->ind[p];
 +            index = ind->index;
 +            n = 0;
 +            for(i=0; i<ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for(j=at0; j<at1; j++)
 +                {
 +                    buf[n] = v[j];
 +                    n++;
 +                }
 +            }
 +            
 +            if (cd->bInPlace)
 +            {
 +                rbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = &comm->vbuf2.v[0][0];
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_real(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        v[i] = rbuf[j];
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +/* Sum the per-atom real array v over the domain decomposition:
 + * the entries for communicated (non-home) atoms are sent back along
 + * each DD dimension, in the reverse order of the spreading pass, and
 + * added into the corresponding home-atom entries.  This is the scalar
 + * analogue of the rvec force summation.
 + */
 +void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    real *buf,*sbuf;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +
 +    buf = &comm->vbuf.v[0][0];
 +
 +    n = 0;
 +    /* Walk the dimensions and pulses backward, mirroring the forward
 +     * spreading communication.
 +     */
 +    nzone = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for(d=dd->ndim-1; d>=0; d--)
 +    {
 +        cd = &comm->cd[d];
 +        for(p=cd->np-1; p>=0; p--) {
 +            ind = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                /* Received atoms are contiguous at the end of v */
 +                sbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                /* Gather the scattered zone entries into a contiguous
 +                 * send buffer first.
 +                 */
 +                sbuf = &comm->vbuf2.v[0][0];
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        sbuf[j] = v[i];
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_real(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            for(i=0; i<ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for(j=at0; j<at1; j++)
 +                {
 +                    v[j] += buf[n];
 +                    n++;
 +                }
 +            } 
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
 +{
 +    fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 +            d,i,j,
 +            zone->min0,zone->max1,
 +            zone->mch0,zone->mch0,
 +            zone->p1_0,zone->p1_1);
 +}
 +
 +
 +/* Maximum number of zone structs per message and number of rvecs
 + * used to encode one gmx_ddzone_t in the communication buffers.
 + */
 +#define DDZONECOMM_MAXZONE  5
 +#define DDZONECOMM_BUFSIZE  3
 +
 +/* Pack n_s gmx_ddzone_t structs into rvec triplets, exchange them
 + * along DD dimension ddimind in the given direction, and unpack the
 + * n_r received structs into buf_r.  The unused third components of
 + * the second and third rvec are sent as 0 and ignored on receive.
 + */
 +static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 +                               int ddimind,int direction,
 +                               gmx_ddzone_t *buf_s,int n_s,
 +                               gmx_ddzone_t *buf_r,int n_r)
 +{
 +#define ZBS  DDZONECOMM_BUFSIZE
 +    rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 +    rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 +    int i;
 +
 +    for(i=0; i<n_s; i++)
 +    {
 +        vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
 +        vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
 +        vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
 +        vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
 +        vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
 +        vbuf_s[i*ZBS+1][2] = 0;
 +        vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
 +        vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
 +        vbuf_s[i*ZBS+2][2] = 0;
 +    }
 +
 +    dd_sendrecv_rvec(dd, ddimind, direction,
 +                     vbuf_s, n_s*ZBS,
 +                     vbuf_r, n_r*ZBS);
 +
 +    for(i=0; i<n_r; i++)
 +    {
 +        buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
 +        buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
 +        buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
 +        buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
 +        buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
 +        buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
 +        buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
 +    }
 +
 +#undef ZBS
 +}
 +
 +/* Communicate the cell boundary extremes and maximum communication
 + * heights between neighboring DD cells along the minor dimensions.
 + * Needed with dynamic load balancing, where cell boundaries along
 + * different dimensions are staggered.  On return cell_ns_x0/x1 hold
 + * the (possibly extended) bounds to use for neighbor searching, and
 + * comm->cell_f_max0/cell_f_min1 hold the accumulated cell fraction
 + * extremes per dimension.
 + */
 +static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 +                          rvec cell_ns_x0,rvec cell_ns_x1)
 +{
 +    int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
 +    gmx_ddzone_t *zp;
 +    gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
 +    rvec extr_s[2],extr_r[2];
 +    rvec dh;
 +    real dist_d,c=0,det;
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool bPBC,bUse;
 +
 +    comm = dd->comm;
 +
 +    /* Initialize the zone data for this cell from the local bounds */
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 +        zp->min0 = cell_ns_x0[dim];
 +        zp->max1 = cell_ns_x1[dim];
 +        zp->min1 = cell_ns_x1[dim];
 +        zp->mch0 = cell_ns_x0[dim];
 +        zp->mch1 = cell_ns_x1[dim];
 +        zp->p1_0 = cell_ns_x0[dim];
 +        zp->p1_1 = cell_ns_x1[dim];
 +    }
 +    
 +    for(d=dd->ndim-2; d>=0; d--)
 +    {
 +        dim  = dd->dim[d];
 +        bPBC = (dim < ddbox->npbcdim);
 +
 +        /* Use an rvec to store two reals */
 +        extr_s[d][0] = comm->cell_f0[d+1];
 +        extr_s[d][1] = comm->cell_f1[d+1];
 +        extr_s[d][2] = comm->cell_f1[d+1];
 +
 +        pos = 0;
 +        /* Store the extremes in the backward sending buffer,
 +         * so the get updated separately from the forward communication.
 +         */
 +        for(d1=d; d1<dd->ndim-1; d1++)
 +        {
 +            /* We invert the order to be able to use the same loop for buf_e */
 +            buf_s[pos].min0 = extr_s[d1][1];
 +            buf_s[pos].max1 = extr_s[d1][0];
 +            buf_s[pos].min1 = extr_s[d1][2];
 +            buf_s[pos].mch0 = 0;
 +            buf_s[pos].mch1 = 0;
 +            /* Store the cell corner of the dimension we communicate along */
 +            buf_s[pos].p1_0 = comm->cell_x0[dim];
 +            buf_s[pos].p1_1 = 0;
 +            pos++;
 +        }
 +
 +        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 +        pos++;
 +
 +        if (dd->ndim == 3 && d == 0)
 +        {
 +            buf_s[pos] = comm->zone_d2[0][1];
 +            pos++;
 +            buf_s[pos] = comm->zone_d1[0];
 +            pos++;
 +        }
 +
 +        /* We only need to communicate the extremes
 +         * in the forward direction
 +         */
 +        npulse = comm->cd[d].np;
 +        if (bPBC)
 +        {
 +            /* Take the minimum to avoid double communication */
 +            npulse_min = min(npulse,dd->nc[dim]-1-npulse);
 +        }
 +        else
 +        {
 +            /* Without PBC we should really not communicate over
 +             * the boundaries, but implementing that complicates
 +             * the communication setup and therefore we simply
 +             * do all communication, but ignore some data.
 +             */
 +            npulse_min = npulse;
 +        }
 +        for(p=0; p<npulse_min; p++)
 +        {
 +            /* Communicate the extremes forward */
 +            bUse = (bPBC || dd->ci[dim] > 0);
 +
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             extr_s+d, dd->ndim-d-1,
 +                             extr_r+d, dd->ndim-d-1);
 +
 +            if (bUse)
 +            {
 +                for(d1=d; d1<dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
 +                    extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
 +                    extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
 +                }
 +            }
 +        }
 +
 +        buf_size = pos;
 +        for(p=0; p<npulse; p++)
 +        {
 +            /* Communicate all the zone information backward */
 +            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
 +
 +            dd_sendrecv_ddzone(dd, d, dddirBackward,
 +                               buf_s, buf_size,
 +                               buf_r, buf_size);
 +
 +            clear_rvec(dh);
 +            if (p > 0)
 +            {
 +                for(d1=d+1; d1<dd->ndim; d1++)
 +                {
 +                    /* Determine the decrease of maximum required
 +                     * communication height along d1 due to the distance along d,
 +                     * this avoids a lot of useless atom communication.
 +                     */
 +                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 +
 +                    if (ddbox->tric_dir[dim])
 +                    {
 +                        /* c is the off-diagonal coupling between the cell planes
 +                         * along directions d and d1.
 +                         */
 +                        c = ddbox->v[dim][dd->dim[d1]][dim];
 +                    }
 +                    else
 +                    {
 +                        c = 0;
 +                    }
 +                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
 +                    if (det > 0)
 +                    {
 +                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
 +                    }
 +                    else
 +                    {
 +                        /* A negative value signals out of range */
 +                        dh[d1] = -1;
 +                    }
 +                }
 +            }
 +
 +            /* Accumulate the extremes over all pulses */
 +            for(i=0; i<buf_size; i++)
 +            {
 +                if (p == 0)
 +                {
 +                    buf_e[i] = buf_r[i];
 +                }
 +                else
 +                {
 +                    if (bUse)
 +                    {
 +                        buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
 +                        buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
 +                        buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
 +                    }
 +
 +                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
 +                    {
 +                        d1 = 1;
 +                    }
 +                    else
 +                    {
 +                        d1 = d + 1;
 +                    }
 +                    if (bUse && dh[d1] >= 0)
 +                    {
 +                        buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
 +                        buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
 +                    }
 +                }
 +                /* Copy the received buffer to the send buffer,
 +                 * to pass the data through with the next pulse.
 +                 */
 +                buf_s[i] = buf_r[i];
 +            }
 +            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
 +                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
 +            {
 +                /* Store the extremes */ 
 +                pos = 0;
 +
 +                for(d1=d; d1<dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
 +                    extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
 +                    extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
 +                    pos++;
 +                }
 +
 +                if (d == 1 || (d == 0 && dd->ndim == 3))
 +                {
 +                    for(i=d; i<2; i++)
 +                    {
 +                        comm->zone_d2[1-d][i] = buf_e[pos];
 +                        pos++;
 +                    }
 +                }
 +                if (d == 0)
 +                {
 +                    comm->zone_d1[1] = buf_e[pos];
 +                    pos++;
 +                }
 +            }
 +        }
 +    }
 +    
 +    /* Extend the neighbor search bounds with the communicated extremes */
 +    if (dd->ndim >= 2)
 +    {
 +        dim = dd->dim[1];
 +        for(i=0; i<2; i++)
 +        {
 +            if (debug)
 +            {
 +                print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
 +            }
 +            cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
 +            cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        dim = dd->dim[2];
 +        for(i=0; i<2; i++)
 +        {
 +            for(j=0; j<2; j++)
 +            {
 +                if (debug)
 +                {
 +                    print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
 +                }
 +                cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
 +                cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
 +            }
 +        }
 +    }
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        comm->cell_f_max0[d] = extr_s[d-1][0];
 +        comm->cell_f_min1[d] = extr_s[d-1][1];
 +        if (debug)
 +        {
 +            fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
 +                    d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
 +        }
 +    }
 +}
 +
 +/* Collect the global charge group indices and per-rank cg/atom counts
 + * of all DD ranks on the master, as required before gathering state
 + * vectors.  Returns immediately when the master's copy already matches
 + * state_local->ddp_count; fatal error when neither the DD nor the
 + * state's own cg distribution matches the state's ddp_count.
 + */
 +static void dd_collect_cg(gmx_domdec_t *dd,
 +                          t_state *state_local)
 +{
 +    gmx_domdec_master_t *ma=NULL;
 +    int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
 +    t_block *cgs_gl;
 +
 +    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
 +    {
 +        /* The master has the correct distribution */
 +        return;
 +    }
 +    
 +    if (state_local->ddp_count == dd->ddp_count)
 +    {
 +        /* The local state matches the current DD distribution */
 +        ncg_home = dd->ncg_home;
 +        cg       = dd->index_gl;
 +        nat_home = dd->nat_home;
 +    } 
 +    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
 +    {
 +        /* The state carries its own cg index list; recount the atoms */
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        ncg_home = state_local->ncg_gl;
 +        cg       = state_local->cg_gl;
 +        nat_home = 0;
 +        for(i=0; i<ncg_home; i++)
 +        {
 +            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
 +        }
 +    }
 +    else
 +    {
 +        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
 +    }
 +    
 +    buf2[0] = dd->ncg_home;
 +    buf2[1] = dd->nat_home;
 +    if (DDMASTER(dd))
 +    {
 +        ma = dd->ma;
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    /* Collect the charge group and atom counts on the master */
 +    dd_gather(dd,2*sizeof(int),buf2,ibuf);
 +    
 +    if (DDMASTER(dd))
 +    {
 +        ma->index[0] = 0;
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ncg[i] = ma->ibuf[2*i];
 +            ma->nat[i] = ma->ibuf[2*i+1];
 +            ma->index[i+1] = ma->index[i] + ma->ncg[i];
 +            
 +        }
 +        /* Make byte counts and indices */
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ibuf[i] = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"Initial charge group distribution: ");
 +            for(i=0; i<dd->nnodes; i++)
 +                fprintf(debug," %d",ma->ncg[i]);
 +            fprintf(debug,"\n");
 +        }
 +    }
 +    
 +    /* Collect the charge group indices on the master */
 +    dd_gatherv(dd,
 +               dd->ncg_home*sizeof(int),dd->index_gl,
 +               DDMASTER(dd) ? ma->ibuf : NULL,
 +               DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +               DDMASTER(dd) ? ma->cg : NULL);
 +    
 +    dd->comm->master_cg_ddp_count = state_local->ddp_count;
 +}
 +
 +/* Collect the distributed rvec array lv into the global array v on
 + * the master rank using point-to-point MPI send/recv.  Intended for
 + * small rank counts; dd_collect_vec_gatherv is used otherwise.
 + * Non-master ranks send their home atoms; the master copies its own
 + * data and receives from every other rank in turn, scattering the
 + * atoms to their global positions via the master cg index.
 + */
 +static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
 +                                    rvec *lv,rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    t_block *cgs_gl;
 +
 +    ma = dd->ma;
 +    
 +    if (!DDMASTER(dd))
 +    {
 +#ifdef GMX_MPI
 +        MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
 +                 dd->rank,dd->mpi_comm_all);
 +#endif
 +    } else {
 +        /* Copy the master coordinates to the global array */
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +        {
 +            for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(lv[a++],v[c]);
 +            }
 +        }
 +        
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                /* Grow the receive buffer when needed */
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf,nalloc);
 +                }
 +#ifdef GMX_MPI
 +                MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
 +                         n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
 +#endif
 +                a = 0;
 +                for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +                {
 +                    for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(buf[a++],v[c]);
 +                    }
 +                }
 +            }
 +        }
 +        sfree(buf);
 +    }
 +}
 +
 +static void get_commbuffer_counts(gmx_domdec_t *dd,
 +                                  int **counts,int **disps)
 +{
 +    gmx_domdec_master_t *ma;
 +    int n;
 +
 +    ma = dd->ma;
 +    
 +    /* Make the rvec count and displacment arrays */
 +    *counts  = ma->ibuf;
 +    *disps   = ma->ibuf + dd->nnodes;
 +    for(n=0; n<dd->nnodes; n++)
 +    {
 +        (*counts)[n] = ma->nat[n]*sizeof(rvec);
 +        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
 +    }
 +}
 +
 +/* Collect the distributed rvec array lv into the global array v on
 + * the master rank with a single MPI gatherv, then scatter the
 + * gathered atoms to their global positions via the master cg index.
 + * Used for larger rank counts (> GMX_DD_NNODES_SENDRECV).
 + */
 +static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
 +                                   rvec *lv,rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  *rcounts=NULL,*disps=NULL;
 +    int  n,i,c,a;
 +    rvec *buf=NULL;
 +    t_block *cgs_gl;
 +    
 +    ma = dd->ma;
 +    
 +    if (DDMASTER(dd))
 +    {
 +        get_commbuffer_counts(dd,&rcounts,&disps);
 +
 +        buf = ma->vbuf;
 +    }
 +    
 +    /* Counts/displacements/buffer are only significant on the master */
 +    dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        a = 0;
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +            {
 +                for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(buf[a++],v[c]);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +void dd_collect_vec(gmx_domdec_t *dd,
 +                    t_state *state_local,rvec *lv,rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    
 +    dd_collect_cg(dd,state_local);
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_collect_vec_sendrecv(dd,lv,v);
 +    }
 +    else
 +    {
 +        dd_collect_vec_gatherv(dd,lv,v);
 +    }
 +}
 +
 +
 +/* Collect the full t_state from all DD ranks into state on the master.
 + * Scalar entries (lambdas, veta, box matrices, thermostat/barostat
 + * chains) are copied from the master's local state; distributed
 + * per-atom entries (x, v, sd_X, cg_p) are gathered with
 + * dd_collect_vec; RNG state is copied or gathered depending on
 + * state->nrngi.
 + */
 +void dd_collect_state(gmx_domdec_t *dd,
 +                      t_state *state_local,t_state *state)
 +{
 +    int est,i,j,nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i=0;i<efptNR;i++) {
 +            state->lambda[i] = state_local->lambda[i];
 +        }
 +        state->fep_state = state_local->fep_state;
 +        state->veta = state_local->veta;
 +        state->vol0 = state_local->vol0;
 +        copy_mat(state_local->box,state->box);
 +        copy_mat(state_local->boxv,state->boxv);
 +        copy_mat(state_local->svir_prev,state->svir_prev);
 +        copy_mat(state_local->fvir_prev,state->fvir_prev);
 +        copy_mat(state_local->pres_prev,state->pres_prev);
 +
 +
 +        /* Nose-Hoover chains are stored flattened: nh entries per group */
 +        for(i=0; i<state_local->ngtc; i++)
 +        {
 +            for(j=0; j<nh; j++) {
 +                state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
 +                state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
 +            }
 +            state->therm_integral[i] = state_local->therm_integral[i];            
 +        }
 +        for(i=0; i<state_local->nnhpres; i++) 
 +        {
 +            for(j=0; j<nh; j++) {
 +                state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
 +                state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    /* Gather the distributed per-atom state entries */
 +    for(est=0; est<estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
 +        {
 +            switch (est) {
 +            case estX:
 +                dd_collect_vec(dd,state_local,state_local->x,state->x);
 +                break;
 +            case estV:
 +                dd_collect_vec(dd,state_local,state_local->v,state->v);
 +                break;
 +            case estSDX:
 +                dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
 +                break;
 +            case estCGP:
 +                dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
 +                break;
 +            case estLD_RNG:
 +                if (state->nrngi == 1)
 +                {
 +                    if (DDMASTER(dd))
 +                    {
 +                        for(i=0; i<state_local->nrng; i++)
 +                        {
 +                            state->ld_rng[i] = state_local->ld_rng[i];
 +                        }
 +                    }
 +                }
 +                else
 +                {
 +                    dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
 +                              state_local->ld_rng,state->ld_rng);
 +                }
 +                break;
 +            case estLD_RNGI:
 +                if (state->nrngi == 1)
 +                {
 +                   if (DDMASTER(dd))
 +                    {
 +                        state->ld_rngi[0] = state_local->ld_rngi[0];
 +                    } 
 +                }
 +                else
 +                {
 +                    dd_gather(dd,sizeof(state->ld_rngi[0]),
 +                              state_local->ld_rngi,state->ld_rngi);
 +                }
 +                break;
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_collect_state");
 +            }
 +        }
 +    }
 +}
 +
 +/* Reallocate the distributed per-atom arrays of state (and the force
 + * array *f when f != NULL) to hold at least nalloc atoms, with
 + * over-allocation via over_alloc_dd to limit the number of future
 + * reallocations.
 + */
 +static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
 +{
 +    int est;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
 +    }
 +
 +    state->nalloc = over_alloc_dd(nalloc);
 +    
 +    for(est=0; est<estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch(est) {
 +            case estX:
 +                srenew(state->x,state->nalloc);
 +                break;
 +            case estV:
 +                srenew(state->v,state->nalloc);
 +                break;
 +            case estSDX:
 +                srenew(state->sd_X,state->nalloc);
 +                break;
 +            case estCGP:
 +                srenew(state->cg_p,state->nalloc);
 +                break;
 +            case estLD_RNG:
 +            case estLD_RNGI:
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* No reallocation required */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_realloc_state");            
 +            }
 +        }
 +    }
 +    
 +    if (f != NULL)
 +    {
 +        srenew(*f,state->nalloc);
 +    }
 +}
 +
 +/* Make sure the forcerec charge group arrays (and, with the Verlet
 + * cut-off scheme, the state arrays) can hold at least nalloc entries,
 + * reallocating with over-allocation when needed.
 + */
 +static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
 +                               int nalloc)
 +{
 +    if (nalloc > fr->cg_nalloc)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
 +        }
 +        fr->cg_nalloc = over_alloc_dd(nalloc);
 +        srenew(fr->cginfo,fr->cg_nalloc);
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            /* Charge group centers are only used with the group scheme */
 +            srenew(fr->cg_cm,fr->cg_nalloc);
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
 +    {
 +        /* We don't use charge groups, we use x in state to set up
 +         * the atom communication.
 +         */
 +        dd_realloc_state(state,f,nalloc);
 +    }
 +}
 +
 +/* Distribute the global rvec array v over the DD ranks into lv using
 + * point-to-point MPI send/recv.  The master packs each rank's atoms
 + * (in master cg-index order) into a temporary buffer and sends it;
 + * its own atoms are copied directly into lv.  Non-master ranks just
 + * receive their home atoms.
 + */
 +static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
 +                                       rvec *v,rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +        
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                /* Grow the send buffer when needed */
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf,nalloc);
 +                }
 +                /* Use lv as a temporary buffer */
 +                a = 0;
 +                for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +                {
 +                    for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(v[c],buf[a++]);
 +                    }
 +                }
 +                if (a != ma->nat[n])
 +                {
 +                    gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
 +                              a,ma->nat[n]);
 +                }
 +                
 +#ifdef GMX_MPI
 +                MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
 +                         DDRANK(dd,n),n,dd->mpi_comm_all);
 +#endif
 +            }
 +        }
 +        sfree(buf);
 +        /* Copy the master's own atoms directly */
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +        {
 +            for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(v[c],lv[a++]);
 +            }
 +        }
 +    }
 +    else
 +    {
 +#ifdef GMX_MPI
 +        MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
 +                 MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
 +#endif
 +    }
 +}
 +
 +/* Distribute the global rvec array v over the DD ranks into lv with a
 + * single MPI scatterv.  The master packs all atoms, ordered per rank
 + * by the master cg index, into its vbuf work space first.  Used for
 + * larger rank counts (> GMX_DD_NNODES_SENDRECV).
 + */
 +static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
 +                                       rvec *v,rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  *scounts=NULL,*disps=NULL;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +     
 +        get_commbuffer_counts(dd,&scounts,&disps);
 +
 +        buf = ma->vbuf;
 +        a = 0;
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +            {
 +                for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(v[c],buf[a++]);
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Counts/displacements/buffer are only significant on the master */
 +    dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
 +}
 +
 +static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
 +{
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_distribute_vec_sendrecv(dd,cgs,v,lv);
 +    }
 +    else
 +    {
 +        dd_distribute_vec_scatterv(dd,cgs,v,lv);
 +    }
 +}
 +
 +/* Distribute the full t_state from the master over all DD ranks into
 + * state_local.  Scalar entries are copied on the master and then
 + * broadcast; distributed per-atom entries (x, v, sd_X, cg_p) are
 + * scattered with dd_distribute_vec; RNG state is broadcast or
 + * scattered depending on state->nrngi.  Reallocates state_local (and
 + * *f) when the home atom count exceeds its current allocation.
 + */
 +static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
 +                                t_state *state,t_state *state_local,
 +                                rvec **f)
 +{
 +    int  i,j,nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for(i=0;i<efptNR;i++)
 +        {
 +            state_local->lambda[i] = state->lambda[i];
 +        }
 +        state_local->fep_state = state->fep_state;
 +        state_local->veta   = state->veta;
 +        state_local->vol0   = state->vol0;
 +        copy_mat(state->box,state_local->box);
 +        copy_mat(state->box_rel,state_local->box_rel);
 +        copy_mat(state->boxv,state_local->boxv);
 +        copy_mat(state->svir_prev,state_local->svir_prev);
 +        copy_mat(state->fvir_prev,state_local->fvir_prev);
 +        /* Nose-Hoover chains are stored flattened: nh entries per group */
 +        for(i=0; i<state_local->ngtc; i++)
 +        {
 +            for(j=0; j<nh; j++) {
 +                state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
 +                state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
 +            }
 +            state_local->therm_integral[i] = state->therm_integral[i];
 +        }
 +        for(i=0; i<state_local->nnhpres; i++)
 +        {
 +            for(j=0; j<nh; j++) {
 +                state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
 +                state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    /* Broadcast the scalar entries from the master to all ranks */
 +    dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
 +    dd_bcast(dd,sizeof(int),&state_local->fep_state);
 +    dd_bcast(dd,sizeof(real),&state_local->veta);
 +    dd_bcast(dd,sizeof(real),&state_local->vol0);
 +    dd_bcast(dd,sizeof(state_local->box),state_local->box);
 +    dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
 +    dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
 +    dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
 +    dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
 +    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
 +    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
 +    dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
 +    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
 +    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
 +
 +    if (dd->nat_home > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local,f,dd->nat_home);
 +    }
 +    /* Distribute the per-atom state entries */
 +    for(i=0; i<estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
 +        {
 +            switch (i) {
 +            case estX:
 +                dd_distribute_vec(dd,cgs,state->x,state_local->x);
 +                break;
 +            case estV:
 +                dd_distribute_vec(dd,cgs,state->v,state_local->v);
 +                break;
 +            case estSDX:
 +                dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
 +                break;
 +            case estCGP:
 +                dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
 +                break;
 +            case estLD_RNG:
 +                if (state->nrngi == 1)
 +                {
 +                    dd_bcastc(dd,
 +                              state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                              state->ld_rng,state_local->ld_rng);
 +                }
 +                else
 +                {
 +                    dd_scatter(dd,
 +                               state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                               state->ld_rng,state_local->ld_rng);
 +                }
 +                break;
 +            case estLD_RNGI:
 +                if (state->nrngi == 1)
 +                {
 +                    dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
 +                              state->ld_rngi,state_local->ld_rngi);
 +                }
 +                else
 +                {
 +                     dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
 +                               state->ld_rngi,state_local->ld_rngi);
 +                }   
 +                break;
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* Not implemented yet */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_distribute_state");
 +            }
 +        }
 +    }
 +}
 +
 +static char dim2char(int dim)
 +{
 +    char c='?';
 +    
 +    switch (dim)
 +    {
 +    case XX: c = 'X'; break;
 +    case YY: c = 'Y'; break;
 +    case ZZ: c = 'Z'; break;
 +    default: gmx_fatal(FARGS,"Unknown dim %d",dim);
 +    }
 +    
 +    return c;
 +}
 +
/* Write the domain decomposition grid as a PDB file for visualization.
 * The cell corners of all DD nodes are gathered on the DD master, which
 * writes 8 "CA" atoms per cell (corner positions, converted to triclinic
 * coordinates and scaled by 10) plus CONECT records for the 12 cell edges.
 * The B-factor column carries the cell volume relative to a uniform grid.
 * Only the DD master writes; other nodes just participate in the gather.
 */
static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
                              gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
{
    rvec grid_s[2],*grid_r=NULL,cx,r;
    char fname[STRLEN],format[STRLEN],buf[22];
    FILE *out;
    int  a,i,d,z,y,x;
    matrix tric;
    real vol;

    /* grid_s holds this node's cell lower and upper corners */
    copy_rvec(dd->comm->cell_x0,grid_s[0]);
    copy_rvec(dd->comm->cell_x1,grid_s[1]);
    
    if (DDMASTER(dd))
    {
        snew(grid_r,2*dd->nnodes);
    }
    
    /* Collect the corner pairs of all nodes on the master */
    dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
    
    if (DDMASTER(dd))
    {
        /* tric maps rectangular cell fractions to triclinic coordinates;
         * off-diagonal terms only apply along decomposed periodic dims.
         */
        for(d=0; d<DIM; d++)
        {
            for(i=0; i<DIM; i++)
            {
                if (d == i)
                {
                    tric[d][i] = 1;
                }
                else
                {
                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
                    {
                        tric[d][i] = box[i][d]/box[i][i];
                    }
                    else
                    {
                        tric[d][i] = 0;
                    }
                }
            }
        }
        sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
        sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
        out = gmx_fio_fopen(fname,"w");
        gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
        a = 1;
        for(i=0; i<dd->nnodes; i++)
        {
            /* Volume of cell i relative to a uniform decomposition */
            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
            for(d=0; d<DIM; d++)
            {
                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
            }
            /* Emit the 8 corners of cell i as atoms */
            for(z=0; z<2; z++)
            {
                for(y=0; y<2; y++)
                {
                    for(x=0; x<2; x++)
                    {
                        cx[XX] = grid_r[i*2+x][XX];
                        cx[YY] = grid_r[i*2+y][YY];
                        cx[ZZ] = grid_r[i*2+z][ZZ];
                        mvmul(tric,cx,r);
                        fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
                                10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
                    }
                }
            }
            /* Connect the corners into the 12 cell edges; d covers
             * 0..DIM-1 only, so the switch handles all cases.
             */
            for(d=0; d<DIM; d++)
            {
                for(x=0; x<4; x++)
                {
                    switch(d)
                    {
                    case 0: y = 1 + i*8 + 2*x; break;
                    case 1: y = 1 + i*8 + 2*x - (x % 2); break;
                    case 2: y = 1 + i*8 + x; break;
                    }
                    fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
                }
            }
        }
        gmx_fio_fclose(out);
        sfree(grid_r);
    }
}
 +
/* Dump the first natoms local atoms of this DD node to a PDB file named
 * "<fn>_<step>_n<nodeid>.pdb"; natoms==-1 selects all atoms up to and
 * including the vsite-communicated ones.  The B-factor column encodes
 * the zone each atom belongs to (zones.n for vsite atoms, zones.n+1 for
 * atoms beyond that, e.g. constraint-communicated ones).
 */
void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
                  gmx_mtop_t *mtop,t_commrec *cr,
                  int natoms,rvec x[],matrix box)
{
    char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
    FILE *out;
    int  i,ii,resnr,c;
    char *atomname,*resname;
    real b;
    gmx_domdec_t *dd;
    
    dd = cr->dd;
    if (natoms == -1)
    {
        natoms = dd->comm->nat[ddnatVSITE];
    }
    
    sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
    
    /* format4 is used for 4-character atom names, which occupy one
     * extra column in the PDB atom-name field.
     */
    sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
    sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
    
    out = gmx_fio_fopen(fname,"w");
    
    fprintf(out,"TITLE     %s\n",title);
    gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
    for(i=0; i<natoms; i++)
    {
        /* Look up the global atom index and its name/residue info */
        ii = dd->gatindex[i];
        gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
        if (i < dd->comm->nat[ddnatZONE])
        {
            /* Find the zone this atom's charge group belongs to */
            c = 0;
            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
            {
                c++;
            }
            b = c;
        }
        else if (i < dd->comm->nat[ddnatVSITE])
        {
            b = dd->comm->zones.n;
        }
        else
        {
            b = dd->comm->zones.n + 1;
        }
        /* Wrap indices to fit the fixed-width PDB columns */
        fprintf(out,strlen(atomname)<4 ? format : format4,
                "ATOM",(ii+1)%100000,
                atomname,resname,' ',resnr%10000,' ',
                10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
    }
    fprintf(out,"TER\n");
    
    gmx_fio_fclose(out);
}
 +
 +real dd_cutoff_mbody(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  di;
 +    real r;
 +
 +    comm = dd->comm;
 +
 +    r = -1;
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm->cutoff_mbody > 0)
 +        {
 +            r = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            /* cutoff_mbody=0 means we do not have DLB */
 +            r = comm->cellsize_min[dd->dim[0]];
 +            for(di=1; di<dd->ndim; di++)
 +            {
 +                r = min(r,comm->cellsize_min[dd->dim[di]]);
 +            }
 +            if (comm->bBondComm)
 +            {
 +                r = max(r,comm->cutoff_mbody);
 +            }
 +            else
 +            {
 +                r = min(r,comm->cutoff);
 +            }
 +        }
 +    }
 +
 +    return r;
 +}
 +
 +real dd_cutoff_twobody(gmx_domdec_t *dd)
 +{
 +    real r_mb;
 +
 +    r_mb = dd_cutoff_mbody(dd);
 +
 +    return max(dd->comm->cutoff,r_mb);
 +}
 +
 +
 +static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
 +{
 +    int nc,ntot;
 +    
 +    nc   = dd->nc[dd->comm->cartpmedim];
 +    ntot = dd->comm->ntot[dd->comm->cartpmedim];
 +    copy_ivec(coord,coord_pme);
 +    coord_pme[dd->comm->cartpmedim] =
 +        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 +}
 +
/* Map DD node index ddindex (0..ndd-1) to a PME node index (0..npme-1).
 * Assumes the major index of both decompositions is x.  The npme/2
 * offset centers the assignment for an even distribution.
 */
static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
{
    int centered;

    centered = ddindex*npme + npme/2;

    return centered/ndd;
}
 +
 +static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
 +{
 +    return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
 +}
 +
 +static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
 +{
 +    return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
 +}
 +
 +static int *dd_pmenodes(t_commrec *cr)
 +{
 +    int *pmenodes;
 +    int n,i,p0,p1;
 +    
 +    snew(pmenodes,cr->npmenodes);
 +    n = 0;
 +    for(i=0; i<cr->dd->nnodes; i++) {
 +        p0 = cr_ddindex2pmeindex(cr,i);
 +        p1 = cr_ddindex2pmeindex(cr,i+1);
 +        if (i+1 == cr->dd->nnodes || p1 > p0) {
 +            if (debug)
 +                fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
 +            pmenodes[n] = i + 1 + n;
 +            n++;
 +        }
 +    }
 +
 +    return pmenodes;
 +}
 +
 +static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
 +{
 +    gmx_domdec_t *dd;
 +    ivec coords,coords_pme,nc;
 +    int  slab;
 +    
 +    dd = cr->dd;
 +    /*
 +      if (dd->comm->bCartesian) {
 +      gmx_ddindex2xyz(dd->nc,ddindex,coords);
 +      dd_coords2pmecoords(dd,coords,coords_pme);
 +      copy_ivec(dd->ntot,nc);
 +      nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 +      coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 +      
 +      slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 +      } else {
 +      slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 +      }
 +    */
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
 +    
 +    return slab;
 +}
 +
 +static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
 +{
 +    gmx_domdec_comm_t *comm;
 +    ivec coords;
 +    int  ddindex,nodeid=-1;
 +    
 +    comm = cr->dd->comm;
 +    
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddindex = dd_index(cr->dd->nc,coords);
 +        if (comm->bCartesianPP)
 +        {
 +            nodeid = comm->ddindex2simnodeid[ddindex];
 +        }
 +        else
 +        {
 +            if (comm->pmenodes)
 +            {
 +                nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
 +            }
 +            else
 +            {
 +                nodeid = ddindex;
 +            }
 +        }
 +    }
 +  
 +    return nodeid;
 +}
 +
/* Return the simulation nodeid of the PME node that does PME for PP
 * node sim_nodeid, or -1 when sim_nodeid is itself a PME-only node.
 */
static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    ivec coord,coord_pme;
    int  i;
    int  pmenode=-1;
    
    dd = cr->dd;
    comm = dd->comm;
    
    /* This assumes a uniform x domain decomposition grid cell size */
    if (comm->bCartesianPP_PME)
    {
#ifdef GMX_MPI
        MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
        {
            /* This is a PP node */
            dd_cart_coord2pmecoord(dd,coord,coord_pme);
            MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
        }
#endif
    }
    else if (comm->bCartesianPP)
    {
        /* PP nodes occupy ranks 0..nnodes-1, PME nodes come after */
        if (sim_nodeid < dd->nnodes)
        {
            pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
        }
    }
    else
    {
        /* This assumes DD cells with identical x coordinates
         * are numbered sequentially.
         */
        if (dd->comm->pmenodes == NULL)
        {
            if (sim_nodeid < dd->nnodes)
            {
                /* The DD index equals the nodeid */
                pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
            }
        }
        else
        {
            /* Interleaved layout: find the first PME rank at or after
             * sim_nodeid in the sorted pmenodes list; if sim_nodeid
             * equals a PME rank, pmenode stays -1 (PME-only node).
             */
            i = 0;
            while (sim_nodeid > dd->comm->pmenodes[i])
            {
                i++;
            }
            if (sim_nodeid < dd->comm->pmenodes[i])
            {
                pmenode = dd->comm->pmenodes[i];
            }
        }
    }
    
    return pmenode;
}
 +
 +gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
 +{
 +    gmx_bool bPMEOnlyNode;
 +    
 +    if (DOMAINDECOMP(cr))
 +    {
 +        bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
 +    }
 +    else
 +    {
 +        bPMEOnlyNode = FALSE;
 +    }
 +    
 +    return bPMEOnlyNode;
 +}
 +
/* Determine, for PME node pmenodeid, the list of PP nodes it receives
 * coordinates from.  On return *my_ddnodes (allocated here, owned by
 * the caller) holds *nmy_ddnodes PP sim nodeids and *node_peer is the
 * last of them, used as the peer for receiving virial/energy.
 */
void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
                     int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
{
    gmx_domdec_t *dd;
    int x,y,z;
    ivec coord,coord_pme;
    
    dd = cr->dd;
    
    /* Upper bound on the number of PP nodes per PME node */
    snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
    
    *nmy_ddnodes = 0;
    /* Scan all DD cells and collect those assigned to this PME node */
    for(x=0; x<dd->nc[XX]; x++)
    {
        for(y=0; y<dd->nc[YY]; y++)
        {
            for(z=0; z<dd->nc[ZZ]; z++)
            {
                if (dd->comm->bCartesianPP_PME)
                {
                    coord[XX] = x;
                    coord[YY] = y;
                    coord[ZZ] = z;
                    dd_cart_coord2pmecoord(dd,coord,coord_pme);
                    /* NOTE(review): this compares against dd->ci, i.e.
                     * assumes this is called on the PME node itself in
                     * the Cartesian case — confirm against callers.
                     */
                    if (dd->ci[XX] == coord_pme[XX] &&
                        dd->ci[YY] == coord_pme[YY] &&
                        dd->ci[ZZ] == coord_pme[ZZ])
                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
                }
                else
                {
                    /* The slab corresponds to the nodeid in the PME group */
                    if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
                    {
                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
                    }
                }
            }
        }
    }
    
    /* The last PP-only node is the peer node */
    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
    
    if (debug)
    {
        fprintf(debug,"Receive coordinates from PP nodes:");
        for(x=0; x<*nmy_ddnodes; x++)
        {
            fprintf(debug," %d",(*my_ddnodes)[x]);
        }
        fprintf(debug,"\n");
    }
}
 +
/* Return TRUE when this PP node should receive the virial and energy
 * from its PME node.  With fewer PME than PP nodes, several PP nodes
 * share one PME node and only the last of them receives; without
 * dedicated PME nodes every node receives.
 */
static gmx_bool receive_vir_ener(t_commrec *cr)
{
    gmx_domdec_comm_t *comm;
    int  pmenode,coords[DIM],rank;
    gmx_bool bReceive;
    
    bReceive = TRUE;
    if (cr->npmenodes < cr->dd->nnodes)
    {
        comm = cr->dd->comm;
        if (comm->bCartesianPP_PME)
        {
            pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
#ifdef GMX_MPI
            /* Check whether the next PP node along cartpmedim maps to
             * the same PME node; if so, we are not the last one.
             */
            MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
            coords[comm->cartpmedim]++;
            if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
            {
                MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
                if (dd_simnode2pmenode(cr,rank) == pmenode)
                {
                    /* This is not the last PP node for pmenode */
                    bReceive = FALSE;
                }
            }
#endif  
        }
        else
        {
            /* Sequential numbering: the next rank sharing our PME node
             * means we are not the last PP node for it.
             */
            pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
            if (cr->sim_nodeid+1 < cr->nnodes &&
                dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
            {
                /* This is not the last PP node for pmenode */
                bReceive = FALSE;
            }
        }
    }
    
    return bReceive;
}
 +
 +static void set_zones_ncg_home(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int i;
 +
 +    zones = &dd->comm->zones;
 +
 +    zones->cg_range[0] = 0;
 +    for(i=1; i<zones->n+1; i++)
 +    {
 +        zones->cg_range[i] = dd->ncg_home;
 +    }
 +}
 +
 +static void rebuild_cgindex(gmx_domdec_t *dd,
 +                            const int *gcgs_index,t_state *state)
 +{
 +    int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
 +    
 +    ind = state->cg_gl;
 +    dd_cg_gl = dd->index_gl;
 +    cgindex  = dd->cgindex;
 +    nat = 0;
 +    cgindex[0] = nat;
 +    for(i=0; i<state->ncg_gl; i++)
 +    {
 +        cgindex[i] = nat;
 +        cg_gl = ind[i];
 +        dd_cg_gl[i] = cg_gl;
 +        nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
 +    }
 +    cgindex[i] = nat;
 +    
 +    dd->ncg_home = state->ncg_gl;
 +    dd->nat_home = nat;
 +
 +    set_zones_ncg_home(dd);
 +}
 +
 +static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
 +{
 +    while (cg >= cginfo_mb->cg_end)
 +    {
 +        cginfo_mb++;
 +    }
 +
 +    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
 +}
 +
 +static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
 +                          t_forcerec *fr,char *bLocalCG)
 +{
 +    cginfo_mb_t *cginfo_mb;
 +    int *cginfo;
 +    int cg;
 +
 +    if (fr != NULL)
 +    {
 +        cginfo_mb = fr->cginfo_mb;
 +        cginfo    = fr->cginfo;
 +
 +        for(cg=cg0; cg<cg1; cg++)
 +        {
 +            cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
 +        }
 +    }
 +
 +    if (bLocalCG != NULL)
 +    {
 +        for(cg=cg0; cg<cg1; cg++)
 +        {
 +            bLocalCG[index_gl[cg]] = TRUE;
 +        }
 +    }
 +}
 +
/* Build the local-to-global (gatindex) and global-to-local (ga2la) atom
 * index mappings for all zones, starting at charge group cg_start.
 * Charge groups that are more than one communication pulse away get
 * nzone added to their zone index when registered in ga2la.
 */
static void make_dd_indices(gmx_domdec_t *dd,
                            const int *gcgs_index,int cg_start)
{
    int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
    int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
    gmx_ga2la_t *ga2la;
    char *bLocalCG;
    gmx_bool bCGs;

    bLocalCG = dd->comm->bLocalCG;

    /* Grow the local-to-global table if needed */
    if (dd->nat_tot > dd->gatindex_nalloc)
    {
        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
        srenew(dd->gatindex,dd->gatindex_nalloc);
    }

    nzone      = dd->comm->zones.n;
    zone2cg    = dd->comm->zones.cg_range;
    zone_ncg1  = dd->comm->zone_ncg1;
    index_gl   = dd->index_gl;
    gatindex   = dd->gatindex;
    bCGs       = dd->comm->bCGs;

    if (zone2cg[1] != dd->ncg_home)
    {
        gmx_incons("dd->ncg_zone is not up to date");
    }
    
    /* Make the local to global and global to local atom index */
    a = dd->cgindex[cg_start];
    for(zone=0; zone<nzone; zone++)
    {
        /* In zone 0 we may start past the zone begin at cg_start */
        if (zone == 0)
        {
            cg0 = cg_start;
        }
        else
        {
            cg0 = zone2cg[zone];
        }
        cg1    = zone2cg[zone+1];
        /* cg1_p1 marks the end of the first communication pulse */
        cg1_p1 = cg0 + zone_ncg1[zone];

        for(cg=cg0; cg<cg1; cg++)
        {
            zone1 = zone;
            if (cg >= cg1_p1)
            {
                /* Signal that this cg is from more than one pulse away */
                zone1 += nzone;
            }
            cg_gl = index_gl[cg];
            if (bCGs)
            {
                /* Register every atom of the charge group */
                for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
                {
                    gatindex[a] = a_gl;
                    ga2la_set(dd->ga2la,a_gl,a,zone1);
                    a++;
                }
            }
            else
            {
                /* One atom per charge group: cg index == atom index */
                gatindex[a] = cg_gl;
                ga2la_set(dd->ga2la,cg_gl,a,zone1);
                a++;
            }
        }
    }
}
 +
 +static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
 +                          const char *where)
 +{
 +    int ncg,i,ngl,nerr;
 +
 +    nerr = 0;
 +    if (bLocalCG == NULL)
 +    {
 +        return nerr;
 +    }
 +    for(i=0; i<dd->ncg_tot; i++)
 +    {
 +        if (!bLocalCG[dd->index_gl[i]])
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
 +            nerr++;
 +        }
 +    }
 +    ngl = 0;
 +    for(i=0; i<ncg_sys; i++)
 +    {
 +        if (bLocalCG[i])
 +        {
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->ncg_tot)
 +    {
 +        fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
 +        nerr++;
 +    }
 +
 +    return nerr;
 +}
 +
/* Debug check: verify that the local-to-global (gatindex) and
 * global-to-local (ga2la) atom mappings and the bLocalCG flags are
 * mutually consistent.  Issues a fatal error when any counted
 * inconsistency is found.
 */
static void check_index_consistency(gmx_domdec_t *dd,
                                    int natoms_sys,int ncg_sys,
                                    const char *where)
{
    int  nerr,ngl,i,a,cell;
    int  *have;

    nerr = 0;

    if (dd->comm->DD_debug > 1)
    {
        /* Extra check: no global atom may appear twice locally.
         * NOTE(review): duplicates are reported but not added to nerr,
         * so they alone do not trigger the fatal error — confirm this
         * is intentional.
         */
        snew(have,natoms_sys);
        for(a=0; a<dd->nat_tot; a++)
        {
            if (have[dd->gatindex[a]] > 0)
            {
                fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
            }
            else
            {
                have[dd->gatindex[a]] = a + 1;
            }
        }
        sfree(have);
    }

    /* have[a] records whether local atom a was found via ga2la */
    snew(have,dd->nat_tot);

    ngl  = 0;
    for(i=0; i<natoms_sys; i++)
    {
        if (ga2la_get(dd->ga2la,i,&a,&cell))
        {
            if (a >= dd->nat_tot)
            {
                fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
                nerr++;
            }
            else
            {
                have[a] = 1;
                /* ga2la and gatindex must be each other's inverse */
                if (dd->gatindex[a] != i)
                {
                    fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
                    nerr++;
                }
            }
            ngl++;
        }
    }
    if (ngl != dd->nat_tot)
    {
        fprintf(stderr,
                "DD node %d, %s: %d global atom indices, %d local atoms\n",
                dd->rank,where,ngl,dd->nat_tot);
    }
    for(a=0; a<dd->nat_tot; a++)
    {
        if (have[a] == 0)
        {
            fprintf(stderr,
                    "DD node %d, %s: local atom %d, global %d has no global index\n",
                    dd->rank,where,a+1,dd->gatindex[a]+1);
        }
    }
    sfree(have);

    nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);

    if (nerr > 0) {
        gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
                  dd->rank,where,nerr);
    }
}
 +
/* Clear local index data: the global-to-local atom mapping from atom
 * a_start onward, the bLocalCG marks from charge group cg_start onward,
 * and the local vsite and (if present) constraint indices.
 */
static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
{
    int  i;
    char *bLocalCG;

    if (a_start == 0)
    {
        /* Clear the whole list without searching */
        ga2la_clear(dd->ga2la);
    }
    else
    {
        /* Remove only the entries for atoms past a_start */
        for(i=a_start; i<dd->nat_tot; i++)
        {
            ga2la_del(dd->ga2la,dd->gatindex[i]);
        }
    }

    /* Un-mark the charge groups past cg_start */
    bLocalCG = dd->comm->bLocalCG;
    if (bLocalCG)
    {
        for(i=cg_start; i<dd->ncg_tot; i++)
        {
            bLocalCG[dd->index_gl[i]] = FALSE;
        }
    }

    dd_clear_local_vsite_indices(dd);
    
    if (dd->constraints)
    {
        dd_clear_local_constraint_indices(dd);
    }
}
 +
 +static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
 +                            int dim_ind)
 +{
 +    real grid_jump_limit;
 +
 +    /* The distance between the boundaries of cells at distance
 +     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
 +     * and by the fact that cells should not be shifted by more than
 +     * half their size, such that cg's only shift by one cell
 +     * at redecomposition.
 +     */
 +    grid_jump_limit = comm->cellsize_limit;
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        grid_jump_limit = max(grid_jump_limit,
 +                              cutoff/comm->cd[dim_ind].np);
 +    }
 +
 +    return grid_jump_limit;
 +}
 +
/* Check whether any DD cell boundary in the decomposed dimensions
 * beyond the first has shifted more than the allowed jump limit
 * relative to its neighbors.  Returns TRUE when the grid is invalid;
 * with bFatal set, an invalid grid aborts the run instead.
 */
static gmx_bool check_grid_jump(gmx_large_int_t step,
                                gmx_domdec_t *dd,
                                real cutoff,
                                gmx_ddbox_t *ddbox,
                                gmx_bool bFatal)
{
    gmx_domdec_comm_t *comm;
    int  d,dim;
    real limit,bfac;
    gmx_bool bInvalid;

    bInvalid = FALSE;

    comm = dd->comm;
    
    /* Dimension 0 has no diagonal neighbors, so start at d=1 */
    for(d=1; d<dd->ndim; d++)
    {
        dim = dd->dim[d];
        limit = grid_jump_limit(comm,cutoff,d);
        /* bfac converts boundary fractions to real-space distances */
        bfac = ddbox->box_size[dim];
        if (ddbox->tric_dir[dim])
        {
            bfac *= ddbox->skew_fac[dim];
        }
        /* Compare our boundaries against the extreme neighbor values */
        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
            (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
        {
            bInvalid = TRUE;

            if (bFatal)
            {
                char buf[22];

                /* This error should never be triggered under normal
                 * circumstances, but you never know ...
                 */
                gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
                          gmx_step_str(step,buf),
                          dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
            }
        }
    }

    return bInvalid;
}
 +
 +static int dd_load_count(gmx_domdec_comm_t *comm)
 +{
 +    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
 +}
 +
 +static float dd_force_load(gmx_domdec_comm_t *comm)
 +{
 +    float load;
 +    
 +    if (comm->eFlop)
 +    {
 +        load = comm->flop;
 +        if (comm->eFlop > 1)
 +        {
 +            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
 +        }
 +    } 
 +    else
 +    {
 +        load = comm->cycl[ddCyclF];
 +        if (comm->cycl_n[ddCyclF] > 1)
 +        {
 +            /* Subtract the maximum of the last n cycle counts
 +             * to get rid of possible high counts due to other soures,
 +             * for instance system activity, that would otherwise
 +             * affect the dynamic load balancing.
 +             */
 +            load -= comm->cycl_max[ddCyclF];
 +        }
 +    }
 +    
 +    return load;
 +}
 +
 +static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int i;
 +    
 +    comm = dd->comm;
 +    
 +    snew(*dim_f,dd->nc[dim]+1);
 +    (*dim_f)[0] = 0;
 +    for(i=1; i<dd->nc[dim]; i++)
 +    {
 +        if (comm->slb_frac[dim])
 +        {
 +            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
 +        }
 +        else
 +        {
 +            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
 +        }
 +    }
 +    (*dim_f)[dd->nc[dim]] = 1;
 +}
 +
/* Initialize the PME slab data for decomposition dimension dimind:
 * choose the PME slab dimension and slab count, and determine for each
 * PME slab the range of PP cell coordinates (pp_min..pp_max) it covers,
 * plus the static boundary fractions.  Returns early when there is at
 * most one slab.
 */
static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
{
    int        pmeindex,slab,nso,i;
    ivec xyz;
    
    /* Special case: DD decomposes y only while PME has a single x row */
    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
    {
        ddpme->dim = YY;
    }
    else
    {
        ddpme->dim = dimind;
    }
    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
    
    ddpme->nslab = (ddpme->dim == 0 ?
                    dd->comm->npmenodes_x :
                    dd->comm->npmenodes_y);

    if (ddpme->nslab <= 1)
    {
        return;
    }

    /* Number of PME nodes per slab along this dimension */
    nso = dd->comm->npmenodes/ddpme->nslab;
    /* Determine for each PME slab the PP location range for dimension dim */
    snew(ddpme->pp_min,ddpme->nslab);
    snew(ddpme->pp_max,ddpme->nslab);
    for(slab=0; slab<ddpme->nslab; slab++) {
        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
        ddpme->pp_max[slab] = 0;
    }
    for(i=0; i<dd->nnodes; i++) {
        ddindex2xyz(dd->nc,i,xyz);
        /* For y only use our y/z slab.
         * This assumes that the PME x grid size matches the DD grid size.
         */
        if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
            pmeindex = ddindex2pmeindex(dd,i);
            if (dimind == 0) {
                slab = pmeindex/nso;
            } else {
                slab = pmeindex % ddpme->nslab;
            }
            ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
            ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
        }
    }

    set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
}
 +
 +int dd_pme_maxshift_x(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == XX)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +int dd_pme_maxshift_y(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == YY)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
 +    {
 +        return dd->comm->ddpme[1].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
/* Determine ddpme->maxshift: the maximum number of PME slabs, on either
 * side, that any cell may need to communicate its atoms to, given the
 * current cell boundary fractions cell_f.  A larger shift means more
 * PME redistribution communication.
 */
static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
                             gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
{
    gmx_domdec_comm_t *comm;
    int  nc,ns,s;
    int  *xmin,*xmax;
    real range,pme_boundary;
    int  sh;
    
    comm = dd->comm;
    nc  = dd->nc[ddpme->dim];
    ns  = ddpme->nslab;
    
    if (!ddpme->dim_match)
    {
        /* PP decomposition is not along dim: the worst situation */
        sh = ns/2;
    }
    else if (ns <= 3 || (bUniform && ns == nc))
    {
        /* The optimal situation */
        sh = 1;
    }
    else
    {
        /* We need to check for all pme nodes which nodes they
         * could possibly need to communicate with.
         */
        xmin = ddpme->pp_min;
        xmax = ddpme->pp_max;
        /* Allow for atoms to be maximally 2/3 times the cut-off
         * out of their DD cell. This is a reasonable balance between
         * between performance and support for most charge-group/cut-off
         * combinations.
         */
        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
        /* Avoid extra communication when we are exactly at a boundary */
        range *= 0.999;
        
        sh = 1;
        for(s=0; s<ns; s++)
        {
            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
            pme_boundary = (real)s/ns;
            /* Grow sh while a cell sh+1 slabs below could still reach
             * past this slab's lower boundary; the +ns/-ns terms
             * handle periodic wrapping of the slab index.
             */
            while (sh+1 < ns &&
                   ((s-(sh+1) >= 0 &&
                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
                    (s-(sh+1) <  0 &&
                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
            {
                sh++;
            }
            pme_boundary = (real)(s+1)/ns;
            /* Same check for cells sh+1 slabs above the upper boundary */
            while (sh+1 < ns &&
                   ((s+(sh+1) <  ns &&
                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
                    (s+(sh+1) >= ns &&
                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
            {
                sh++;
            }
        }
    }
    
    ddpme->maxshift = sh;
    
    if (debug)
    {
        fprintf(debug,"PME slab communication range for dim %d is %d\n",
                ddpme->dim,ddpme->maxshift);
    }
}
 +
/* Check that along every decomposed, bounded dimension the (skewed) box
 * is large enough for the requested number of DD cells given the
 * minimum allowed cell size with safety margin DD_CELL_MARGIN; issues a
 * fatal error otherwise.
 */
static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
{
    int d,dim;
    
    for(d=0; d<dd->ndim; d++)
    {
        dim = dd->dim[d];
        if (dim < ddbox->nboundeddim &&
            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
        {
            gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
                      dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
                      dd->nc[dim],dd->comm->cellsize_limit);
        }
    }
}
 +
 +/* Set the cell boundaries for the static (non-DLB) decomposition,
 + * either uniform or statically load balanced via comm->slb_frac,
 + * and determine the number of communication pulses npulse[d] needed
 + * to cover the cut-off in each dimension.  With bMaster all cell
 + * boundaries along each dimension are stored in dd->ma->cell_x,
 + * otherwise only the local cell_x0/cell_x1 are set.
 + */
 +static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 +                                  gmx_bool bMaster,ivec npulse)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d,j;
 +    rvec cellsize_min;
 +    real *cell_x,cell_dx,cellsize;
 +    
 +    comm = dd->comm;
 +    
 +    for(d=0; d<DIM; d++)
 +    {
 +        cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
 +        npulse[d] = 1;
 +        if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
 +        {
 +            /* Uniform grid */
 +            cell_dx = ddbox->box_size[d]/dd->nc[d];
 +            if (bMaster)
 +            {
 +                for(j=0; j<dd->nc[d]+1; j++)
 +                {
 +                    dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
 +                }
 +            }
 +            else
 +            {
 +                comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
 +                comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
 +            }
 +            /* Increase the pulse count until npulse cells cover the cut-off */
 +            cellsize = cell_dx*ddbox->skew_fac[d];
 +            while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
 +            {
 +                npulse[d]++;
 +            }
 +            cellsize_min[d] = cellsize;
 +        }
 +        else
 +        {
 +            /* Statically load balanced grid */
 +            /* Also when we are not doing a master distribution we determine
 +             * all cell borders in a loop to obtain identical values
 +             * to the master distribution case and to determine npulse.
 +             */
 +            if (bMaster)
 +            {
 +                cell_x = dd->ma->cell_x[d];
 +            }
 +            else
 +            {
 +                snew(cell_x,dd->nc[d]+1);
 +            }
 +            cell_x[0] = ddbox->box0[d];
 +            for(j=0; j<dd->nc[d]; j++)
 +            {
 +                cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
 +                cell_x[j+1] = cell_x[j] + cell_dx;
 +                cellsize = cell_dx*ddbox->skew_fac[d];
 +                while (cellsize*npulse[d] < comm->cutoff &&
 +                       npulse[d] < dd->nc[d]-1)
 +                {
 +                    npulse[d]++;
 +                }
 +                cellsize_min[d] = min(cellsize_min[d],cellsize);
 +            }
 +            if (!bMaster)
 +            {
 +                /* Only the local boundaries are kept; free the scratch array */
 +                comm->cell_x0[d] = cell_x[dd->ci[d]];
 +                comm->cell_x1[d] = cell_x[dd->ci[d]+1];
 +                sfree(cell_x);
 +            }
 +        }
 +        /* The following limitation is to avoid that a cell would receive
 +         * some of its own home charge groups back over the periodic boundary.
 +         * Double charge groups cause trouble with the global indices.
 +         */
 +        if (d < ddbox->npbcdim &&
 +            dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
 +        {
 +            gmx_fatal_collective(FARGS,NULL,dd,
 +                                 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
 +                                 dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
 +                                 comm->cutoff,
 +                                 dd->nc[d],dd->nc[d],
 +                                 dd->nnodes > dd->nc[d] ? "cells" : "processors");
 +        }
 +    }
 +    
 +    if (!comm->bDynLoadBal)
 +    {
 +        copy_rvec(cellsize_min,comm->cellsize_min);
 +    }
 +   
 +    for(d=0; d<comm->npmedecompdim; d++)
 +    {
 +        set_pme_maxshift(dd,&comm->ddpme[d],
 +                         comm->slb_frac[dd->dim[d]]==NULL,ddbox,
 +                         comm->ddpme[d].slb_dim_f);
 +    }
 +}
 +
 +
 +/* Enforce the minimum cell size (cellsize_limit_f, in relative units)
 + * and, for staggered dimensions (d > 0), the staggering bounds
 + * root->bound_min/bound_max on the relative cell boundaries
 + * root->cell_f over the half-open index range [range[0],range[1]).
 + * Recurses on sub-ranges when the staggering limits conflict.
 + */
 +static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
 +                                       int d,int dim,gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox,
 +                                       gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  ncd,i,j,nmin,nmin_old;
 +    gmx_bool bLimLo,bLimHi;
 +    real *cell_size;
 +    real fac,halfway,cellsize_limit_f_i,region_size;
 +    gmx_bool bPBC,bLastHi=FALSE;
 +    int nrange[]={range[0],range[1]};
 +
 +    region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
 +
 +    comm = dd->comm;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    if (debug) 
 +    {
 +        fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
 +    }
 +
 +    /* First we need to check if the scaling does not make cells
 +     * smaller than the smallest allowed size.
 +     * We need to do this iteratively, since if a cell is too small,
 +     * it needs to be enlarged, which makes all the other cells smaller,
 +     * which could in turn make another cell smaller than allowed.
 +     */
 +    for(i=range[0]; i<range[1]; i++)
 +    {
 +        root->bCellMin[i] = FALSE;
 +    }
 +    nmin = 0;
 +    do
 +    {
 +        nmin_old = nmin;
 +        /* We need the total for normalization */
 +        fac = 0;
 +        for(i=range[0]; i<range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                fac += cell_size[i];
 +            }
 +        }
 +        fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
 +        /* Determine the cell boundaries */
 +        for(i=range[0]; i<range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                cell_size[i] *= fac;
 +                /* Without PBC the outermost cells may shrink to zero */
 +                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
 +                {
 +                    cellsize_limit_f_i = 0;
 +                }
 +                else
 +                {
 +                    cellsize_limit_f_i = cellsize_limit_f;
 +                }
 +                if (cell_size[i] < cellsize_limit_f_i)
 +                {
 +                    root->bCellMin[i] = TRUE;
 +                    cell_size[i] = cellsize_limit_f_i;
 +                    nmin++;
 +                }
 +            }
 +            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
 +        }
 +    }
 +    while (nmin > nmin_old);
 +    
 +    i=range[1]-1;
 +    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
 +    /* For this check we should not use DD_CELL_MARGIN,
 +     * but a slightly smaller factor,
 +     * since rounding could get use below the limit.
 +     */
 +    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
 +    {
 +        char buf[22];
 +        gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
 +                  gmx_step_str(step,buf),
 +                  dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
 +                  ncd,comm->cellsize_min[dim]);
 +    }
 +    
 +    root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
 +    
 +    if (!bUniform)
 +    {
 +        /* Check if the boundary did not displace more than halfway
 +         * each of the cells it bounds, as this could cause problems,
 +         * especially when the differences between cell sizes are large.
 +         * If changes are applied, they will not make cells smaller
 +         * than the cut-off, as we check all the boundaries which
 +         * might be affected by a change and if the old state was ok,
 +         * the cells will at most be shrunk back to their old size.
 +         */
 +        for(i=range[0]+1; i<range[1]; i++)
 +        {
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
 +            if (root->cell_f[i] < halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for(j=i+1; j<range[1]; j++)
 +                {
 +                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
 +                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
 +                }
 +            }
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
 +            if (root->cell_f[i] > halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for(j=i-1; j>=range[0]+1; j--)
 +                {
 +                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
 +                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
 +                }
 +            }
 +        }
 +    }
 +    
 +    /* nrange is defined as [lower, upper) range for new call to enforce_limits */
 +    /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
 +     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
 +     * for a and b nrange is used */
 +    if (d > 0)
 +    {
 +        /* Take care of the staggering of the cell boundaries */
 +        if (bUniform)
 +        {
 +            for(i=range[0]; i<range[1]; i++)
 +            {
 +                root->cell_f_max0[i] = root->cell_f[i];
 +                root->cell_f_min1[i] = root->cell_f[i+1];
 +            }
 +        }
 +        else
 +        {
 +            for(i=range[0]+1; i<range[1]; i++)
 +            {
 +                bLimLo = (root->cell_f[i] < root->bound_min[i]);
 +                bLimHi = (root->cell_f[i] > root->bound_max[i]);
 +                if (bLimLo && bLimHi)
 +                {
 +                    /* Both limits violated, try the best we can */
 +                    /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
 +                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
 +                    nrange[0]=range[0];
 +                    nrange[1]=i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    nrange[0]=i;
 +                    nrange[1]=range[1];
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    return;
 +                }
 +                else if (bLimLo)
 +                {
 +                    /* root->cell_f[i] = root->bound_min[i]; */
 +                    nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
 +                    bLastHi=FALSE;
 +                }
 +                else if (bLimHi && !bLastHi)
 +                {
 +                    bLastHi=TRUE;
 +                    if (nrange[1] < range[1])   /* found a LimLo before */
 +                    {
 +                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                        nrange[0]=nrange[1];
 +                    }
 +                    root->cell_f[i] = root->bound_max[i];
 +                    nrange[1]=i; 
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                    nrange[0]=i;
 +                    nrange[1]=range[1];
 +                }
 +            }
 +            if (nrange[1] < range[1])   /* found last a LimLo */
 +            {
 +                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                nrange[0]=nrange[1];
 +                nrange[1]=range[1];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            } 
 +            else if (nrange[0] > range[0]) /* found at least one LimHi */
 +            {
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +        }
 +    }
 +}
 +
 +
 +/* Row-root routine for dynamic load balancing along dimension dim
 + * (decomposition index d): rescale the relative cell sizes according
 + * to the measured load imbalance, enforce the cell size and staggering
 + * limits, and append the lower-dimension boundaries and the PME
 + * maximum shifts to root->cell_f for broadcasting to the cell row.
 + */
 +static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
 +                                       int d,int dim,gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                                       gmx_bool bUniform,gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  ncd,d1,i,j,pos;
 +    real *cell_size;
 +    real load_aver,load_i,imbalance,change,change_max,sc;
 +    real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
 +    real change_limit;
 +    real relax = 0.5;
 +    gmx_bool bPBC;
 +    int range[] = { 0, 0 };
 +
 +    comm = dd->comm;
 +
 +    /* Convert the maximum change from the input percentage to a fraction */
 +    change_limit = comm->dlb_scale_lim*0.01;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    /* Store the original boundaries */
 +    for(i=0; i<ncd+1; i++)
 +    {
 +        root->old_cell_f[i] = root->cell_f[i];
 +    }
 +    if (bUniform) {
 +        for(i=0; i<ncd; i++)
 +        {
 +            cell_size[i] = 1.0/ncd;
 +        }
 +    }
 +    else if (dd_load_count(comm))
 +    {
 +        load_aver = comm->load[d].sum_m/ncd;
 +        change_max = 0;
 +        for(i=0; i<ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change = -relax*imbalance;
 +            change_max = max(change_max,max(change,-change));
 +        }
 +        /* Limit the amount of scaling.
 +         * We need to use the same rescaling for all cells in one row,
 +         * otherwise the load balancing might not converge.
 +         */
 +        sc = relax;
 +        if (change_max > change_limit)
 +        {
 +            sc *= change_limit/change_max;
 +        }
 +        for(i=0; i<ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change = -sc*imbalance;
 +            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
 +        }
 +    }
 +    
 +    /* Convert the size limits from absolute to relative (box) units */
 +    cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
 +    cellsize_limit_f *= DD_CELL_MARGIN;
 +    dist_min_f_hard   = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
 +    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
 +    if (ddbox->tric_dir[dim])
 +    {
 +        cellsize_limit_f /= ddbox->skew_fac[dim];
 +        dist_min_f       /= ddbox->skew_fac[dim];
 +    }
 +    if (bDynamicBox && d > 0)
 +    {
 +        dist_min_f *= DD_PRES_SCALE_MARGIN;
 +    }
 +    if (d > 0 && !bUniform)
 +    {
 +        /* Make sure that the grid is not shifted too much */
 +        for(i=1; i<ncd; i++) {
 +            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
 +            {
 +                gmx_incons("Inconsistent DD boundary staggering limits!");
 +            }
 +            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
 +            space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
 +            if (space > 0) {
 +                root->bound_min[i] += 0.5*space;
 +            }
 +            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
 +            space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
 +            if (space < 0) {
 +                root->bound_max[i] += 0.5*space;
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug,
 +                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
 +                        d,i,
 +                        root->cell_f_max0[i-1] + dist_min_f,
 +                        root->bound_min[i],root->cell_f[i],root->bound_max[i],
 +                        root->cell_f_min1[i] - dist_min_f);
 +            }
 +        }
 +    }
 +    range[1]=ncd;
 +    root->cell_f[0] = 0;
 +    root->cell_f[ncd] = 1;
 +    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
 +
 +
 +    /* After the checks above, the cells should obey the cut-off
 +     * restrictions, but it does not hurt to check.
 +     */
 +    for(i=0; i<ncd; i++)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
 +                    dim,i,root->cell_f[i],root->cell_f[i+1]);
 +        }
 +
 +        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
 +            root->cell_f[i+1] - root->cell_f[i] <
 +            cellsize_limit_f/DD_CELL_MARGIN)
 +        {
 +            char buf[22];
 +            fprintf(stderr,
 +                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
 +                    gmx_step_str(step,buf),dim2char(dim),i,
 +                    (root->cell_f[i+1] - root->cell_f[i])
 +                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
 +        }
 +    }
 +    
 +    pos = ncd + 1;
 +    /* Store the cell boundaries of the lower dimensions at the end */
 +    for(d1=0; d1<d; d1++)
 +    {
 +        root->cell_f[pos++] = comm->cell_f0[d1];
 +        root->cell_f[pos++] = comm->cell_f1[d1];
 +    }
 +    
 +    if (d < comm->npmedecompdim)
 +    {
 +        /* The master determines the maximum shift for
 +         * the coordinate communication between separate PME nodes.
 +         */
 +        set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
 +    }
 +    root->cell_f[pos++] = comm->ddpme[0].maxshift;
 +    if (d >= 1)
 +    {
 +        root->cell_f[pos++] = comm->ddpme[1].maxshift;
 +    }
 +}    
 +
 +static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
 +                                             gmx_ddbox_t *ddbox,int dimind)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int dim;
 +
 +    comm = dd->comm;
 +
 +    /* Set the cell dimensions */
 +    dim = dd->dim[dimind];
 +    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
 +    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
 +    if (dim >= ddbox->nboundeddim)
 +    {
 +        comm->cell_x0[dim] += ddbox->box0[dim];
 +        comm->cell_x1[dim] += ddbox->box0[dim];
 +    }
 +}
 +
 +/* Receive (via row broadcast) the relative cell boundaries for
 + * decomposition index d, extract the local fractions, the boundaries
 + * of the lower dimensions and the PME maximum shifts from the buffer,
 + * and update the absolute cell bounds.  The buffer layout must match
 + * what set_dd_cell_sizes_dlb_root stores in root->cell_f.
 + */
 +static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                         int d,int dim,real *cell_f_row,
 +                                         gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int d1,dim1,pos;
 +
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    /* Each node would only need to know two fractions,
 +     * but it is probably cheaper to broadcast the whole array.
 +     */
 +    MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
 +              0,comm->mpi_comm_load[d]);
 +#endif
 +    /* Copy the fractions for this dimension from the buffer */
 +    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
 +    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
 +    /* The whole array was communicated, so set the buffer position */
 +    pos = dd->nc[dim] + 1;
 +    for(d1=0; d1<=d; d1++)
 +    {
 +        if (d1 < d)
 +        {
 +            /* Copy the cell fractions of the lower dimensions */
 +            comm->cell_f0[d1] = cell_f_row[pos++];
 +            comm->cell_f1[d1] = cell_f_row[pos++];
 +        }
 +        relative_to_absolute_cell_bounds(dd,ddbox,d1);
 +    }
 +    /* Convert the communicated shift from float to int */
 +    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    if (d >= 1)
 +    {
 +        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    }
 +}
 +
 +/* Recompute the DLB cell sizes: for each decomposition dimension the
 + * root of the cell row determines the new relative boundaries, which
 + * are then distributed to the members of that row.
 + */
 +static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
 +                                         gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                                         gmx_bool bUniform,gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int d,dim,d1;
 +    gmx_bool bRowMember,bRowRoot;
 +    real *cell_f_row;
 +    
 +    comm = dd->comm;
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        /* A node is a member of the row for dimension d when its cell
 +         * index is zero in all higher decomposition dimensions, and it
 +         * is the row root when additionally its own index is zero.
 +         */
 +        bRowMember = TRUE;
 +        bRowRoot = TRUE;
 +        for(d1=d; d1<dd->ndim; d1++)
 +        {
 +            if (dd->ci[dd->dim[d1]] > 0)
 +            {
 +                if (d1 > d)
 +                {
 +                    bRowMember = FALSE;
 +                }
 +                bRowRoot = FALSE;
 +            }
 +        }
 +        if (bRowMember)
 +        {
 +            if (bRowRoot)
 +            {
 +                set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
 +                                           ddbox,bDynamicBox,bUniform,step);
 +                cell_f_row = comm->root[d]->cell_f;
 +            }
 +            else
 +            {
 +                cell_f_row = comm->cell_f_row;
 +            }
 +            distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
 +        }
 +    }
 +}    
 +
 +static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
 +{
 +    int d;
 +
 +    /* This function assumes the box is static and should therefore
 +     * not be called when the box has changed since the last
 +     * call to dd_partition_system.
 +     */
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        relative_to_absolute_cell_bounds(dd,ddbox,d); 
 +    }
 +}
 +
 +
 +
 +static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                                  gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
 +                                  gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int dim;
 +
 +    comm = dd->comm;
 +    
 +    if (bDoDLB)
 +    {
 +        wallcycle_start(wcycle,ewcDDCOMMBOUND);
 +        set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
 +        wallcycle_stop(wcycle,ewcDDCOMMBOUND);
 +    }
 +    else if (bDynamicBox)
 +    {
 +        set_dd_cell_sizes_dlb_nochange(dd,ddbox);
 +    }
 +    
 +    /* Set the dimensions for which no DD is used */
 +    for(dim=0; dim<DIM; dim++) {
 +        if (dd->nc[dim] == 1) {
 +            comm->cell_x0[dim] = 0;
 +            comm->cell_x1[dim] = ddbox->box_size[dim];
 +            if (dim >= ddbox->nboundeddim)
 +            {
 +                comm->cell_x0[dim] += ddbox->box0[dim];
 +                comm->cell_x1[dim] += ddbox->box0[dim];
 +            }
 +        }
 +    }
 +}
 +
 +static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
 +{
 +    int d,np,i;
 +    gmx_domdec_comm_dim_t *cd;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        cd = &dd->comm->cd[d];
 +        np = npulse[dd->dim[d]];
 +        if (np > cd->np_nalloc)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
 +                        dim2char(dd->dim[d]),np);
 +            }
 +            if (DDMASTER(dd) && cd->np_nalloc > 0)
 +            {
 +                fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
 +            }
 +            srenew(cd->ind,np);
 +            for(i=cd->np_nalloc; i<np; i++)
 +            {
 +                cd->ind[i].index  = NULL;
 +                cd->ind[i].nalloc = 0;
 +            }
 +            cd->np_nalloc = np;
 +        }
 +        cd->np = np;
 +    }
 +}
 +
 +
 +/* Set the local cell boundaries for this step, using either dynamic
 + * load balancing or the static distribution.  The previous boundaries
 + * are saved first for the charge group displacement check.
 + */
 +static void set_dd_cell_sizes(gmx_domdec_t *dd,
 +                              gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                              gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
 +                              gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d;
 +    ivec npulse;
 +    
 +    comm = dd->comm;
 +
 +    /* Copy the old cell boundaries for the cg displacement check */
 +    copy_rvec(comm->cell_x0,comm->old_cell_x0);
 +    copy_rvec(comm->cell_x1,comm->old_cell_x1);
 +    
 +    if (comm->bDynLoadBal)
 +    {
 +        if (DDMASTER(dd))
 +        {
 +            check_box_size(dd,ddbox);
 +        }
 +        set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
 +    }
 +    else
 +    {
 +        /* npulse is filled in by set_dd_cell_sizes_slb */
 +        set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
 +        realloc_comm_ind(dd,npulse);
 +    }
 +    
 +    if (debug)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
 +                    d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
 +        }
 +    }
 +}
 +
 +/* Check that with DLB no local cell became smaller than the allowed
 + * minimum, then communicate the cell boundaries required for neighbor
 + * searching, returning them in cell_ns_x0/cell_ns_x1.
 + */
 +static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,
 +                                  rvec cell_ns_x0,rvec cell_ns_x1,
 +                                  gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int dim_ind,dim;
 +    
 +    comm = dd->comm;
 +
 +    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        
 +        /* Without PBC we don't have restrictions on the outer cells */
 +        if (!(dim >= ddbox->npbcdim && 
 +              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
 +            comm->bDynLoadBal &&
 +            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
 +            comm->cellsize_min[dim])
 +        {
 +            char buf[22];
 +            gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
 +                      gmx_step_str(step,buf),dim2char(dim),
 +                      comm->cell_x1[dim] - comm->cell_x0[dim],
 +                      ddbox->skew_fac[dim],
 +                      dd->comm->cellsize_min[dim],
 +                      dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +        }
 +    }
 +    
 +    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 +    {
 +        /* Communicate the boundaries and update cell_ns_x0/1 */
 +        dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
 +        if (dd->bGridJump && dd->ndim > 1)
 +        {
 +            check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
 +        }
 +    }
 +}
 +
 +static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
 +{
 +    if (YY < npbcdim)
 +    {
 +        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
 +    }
 +    else
 +    {
 +        tcm[YY][XX] = 0;
 +    }
 +    if (ZZ < npbcdim)
 +    {
 +        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
 +        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
 +    }
 +    else
 +    {
 +        tcm[ZZ][XX] = 0;
 +        tcm[ZZ][YY] = 0;
 +    }
 +}
 +
 +/* Verify that the box shape is compatible with screw PBC;
 + * issues a fatal error when an unsupported off-diagonal
 + * component is present.
 + */
 +static void check_screw_box(matrix box)
 +{
 +    /* Mathematical limitation */
 +    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
 +    {
 +        gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
 +    }
 +    
 +    /* Limitation due to the asymmetry of the eighth shell method */
 +    if (box[ZZ][YY] != 0)
 +    {
 +        gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
 +    }
 +}
 +
 +/* Master-only: assign every charge group to a DD cell based on the
 + * center of geometry (putting it in the box first, with screw PBC
 + * handling), and store per-node charge group lists and counts in
 + * dd->ma.  Note that pos[] may be shifted into the box as a side
 + * effect.
 + */
 +static void distribute_cg(FILE *fplog,gmx_large_int_t step,
 +                          matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
 +                          gmx_domdec_t *dd)
 +{
 +    gmx_domdec_master_t *ma;
 +    int **tmp_ind=NULL,*tmp_nalloc=NULL;
 +    int  i,icg,j,k,k0,k1,d,npbcdim;
 +    matrix tcm;
 +    rvec box_size,cg_cm;
 +    ivec ind;
 +    real nrcg,inv_ncg,pos_d;
 +    atom_id *cgindex;
 +    gmx_bool bUnbounded,bScrew;
 +
 +    ma = dd->ma;
 +    
 +    /* NOTE(review): tmp_ind is a local initialized to NULL just above,
 +     * so this condition is always true; it looks vestigial.
 +     */
 +    if (tmp_ind == NULL)
 +    {
 +        snew(tmp_nalloc,dd->nnodes);
 +        snew(tmp_ind,dd->nnodes);
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
 +            snew(tmp_ind[i],tmp_nalloc[i]);
 +        }
 +    }
 +    
 +    /* Clear the count */
 +    for(i=0; i<dd->nnodes; i++)
 +    {
 +        ma->ncg[i] = 0;
 +        ma->nat[i] = 0;
 +    }
 +    
 +    make_tric_corr_matrix(dd->npbcdim,box,tcm);
 +    
 +    cgindex = cgs->index;
 +    
 +    /* Compute the center of geometry for all charge groups */
 +    for(icg=0; icg<cgs->nr; icg++)
 +    {
 +        k0      = cgindex[icg];
 +        k1      = cgindex[icg+1];
 +        nrcg    = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(pos[k0],cg_cm);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +            
 +            clear_rvec(cg_cm);
 +            for(k=k0; (k<k1); k++)
 +            {
 +                rvec_inc(cg_cm,pos[k]);
 +            }
 +            for(d=0; (d<DIM); d++)
 +            {
 +                cg_cm[d] *= inv_ncg;
 +            }
 +        }
 +        /* Put the charge group in the box and determine the cell index */
 +        for(d=DIM-1; d>=0; d--) {
 +            pos_d = cg_cm[d];
 +            if (d < dd->npbcdim)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                if (tric_dir[d] && dd->nc[d] > 1)
 +                {
 +                    /* Use triclinic coordintates for this dimension */
 +                    for(j=d+1; j<DIM; j++)
 +                    {
 +                        pos_d += cg_cm[j]*tcm[j][d];
 +                    }
 +                }
 +                /* Shift the cg and its atoms down into the box;
 +                 * with screw PBC the y/z coordinates are mirrored.
 +                 */
 +                while(pos_d >= box[d][d])
 +                {
 +                    pos_d -= box[d][d];
 +                    rvec_dec(cg_cm,box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_dec(pos[k],box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +                while(pos_d < 0)
 +                {
 +                    pos_d += box[d][d];
 +                    rvec_inc(cg_cm,box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_inc(pos[k],box[d]);
 +                        if (bScrew) {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +            }
 +            /* This could be done more efficiently */
 +            ind[d] = 0;
 +            while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
 +            {
 +                ind[d]++;
 +            }
 +        }
 +        i = dd_index(dd->nc,ind);
 +        if (ma->ncg[i] == tmp_nalloc[i])
 +        {
 +            tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
 +            srenew(tmp_ind[i],tmp_nalloc[i]);
 +        }
 +        tmp_ind[i][ma->ncg[i]] = icg;
 +        ma->ncg[i]++;
 +        ma->nat[i] += cgindex[icg+1] - cgindex[icg];
 +    }
 +    
 +    /* Concatenate the per-node lists into ma->cg with offsets in ma->index */
 +    k1 = 0;
 +    for(i=0; i<dd->nnodes; i++)
 +    {
 +        ma->index[i] = k1;
 +        for(k=0; k<ma->ncg[i]; k++)
 +        {
 +            ma->cg[k1++] = tmp_ind[i][k];
 +        }
 +    }
 +    ma->index[dd->nnodes] = k1;
 +    
 +    for(i=0; i<dd->nnodes; i++)
 +    {
 +        sfree(tmp_ind[i]);
 +    }
 +    sfree(tmp_ind);
 +    sfree(tmp_nalloc);
 +    
 +    if (fplog)
 +    {
 +        char buf[22];
 +        fprintf(fplog,"Charge group distribution at step %s:",
 +                gmx_step_str(step,buf));
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            fprintf(fplog," %d",ma->ncg[i]);
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +}
 +
 +/* Distribute the charge groups over the DD cells: the master assigns
 + * all charge groups and scatters the counts and the global charge
 + * group indices; every node then builds its local cgindex array.
 + */
 +static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
 +                                t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
 +                                rvec pos[])
 +{
 +    gmx_domdec_master_t *ma=NULL;
 +    ivec npulse;
 +    int  i,cg_gl;
 +    int  *ibuf,buf2[2] = { 0, 0 };
 +    gmx_bool bMaster = DDMASTER(dd);
 +    if (bMaster)
 +    {
 +        ma = dd->ma;
 +        
 +        if (dd->bScrewPBC)
 +        {
 +            check_screw_box(box);
 +        }
 +    
 +        set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
 +    
 +        distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
 +        /* Pack the cg and atom counts per node for the scatter below */
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ibuf[2*i]   = ma->ncg[i];
 +            ma->ibuf[2*i+1] = ma->nat[i];
 +        }
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    dd_scatter(dd,2*sizeof(int),ibuf,buf2);
 +    
 +    dd->ncg_home = buf2[0];
 +    dd->nat_home = buf2[1];
 +    dd->ncg_tot  = dd->ncg_home;
 +    dd->nat_tot  = dd->nat_home;
 +    if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
 +    {
 +        dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(dd->index_gl,dd->cg_nalloc);
 +        srenew(dd->cgindex,dd->cg_nalloc+1);
 +    }
 +    if (bMaster)
 +    {
 +        /* Byte counts and byte displacements for the scatterv */
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ibuf[i] = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +    }
 +    
 +    dd_scatterv(dd,
 +                DDMASTER(dd) ? ma->ibuf : NULL,
 +                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +                DDMASTER(dd) ? ma->cg : NULL,
 +                dd->ncg_home*sizeof(int),dd->index_gl);
 +    
 +    /* Determine the home charge group sizes */
 +    dd->cgindex[0] = 0;
 +    for(i=0; i<dd->ncg_home; i++)
 +    {
 +        cg_gl = dd->index_gl[i];
 +        dd->cgindex[i+1] =
 +            dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
 +    }
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Home charge groups:\n");
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            fprintf(debug," %d",dd->index_gl[i]);
 +            if (i % 10 == 9) 
 +                fprintf(debug,"\n");
 +        }
 +        fprintf(debug,"\n");
 +    }
 +}
 +
+/* For one state vector (number vec out of nvec per atom):
+ * compact the entries of charge groups that stay home (move[icg] == -1)
+ * in place in src when bCompact, and copy the entries of moved cgs into
+ * the per-direction buffers comm->cgcm_state[m].
+ * Each cg occupies 1 + nvec*nrcg rvecs in the buffer (cg center first,
+ * then the nvec vectors); pos_vec[m] tracks the write position.
+ * Returns the number of home atoms after compaction.
+ */
+static int compact_and_copy_vec_at(int ncg,int *move,
+                                   int *cgindex,
+                                   int nvec,int vec,
+                                   rvec *src,gmx_domdec_comm_t *comm,
+                                   gmx_bool bCompact)
+{
+    int m,icg,i,i0,i1,nrcg;
+    int home_pos;
+    int pos_vec[DIM*2];
+    
+    home_pos = 0;
+
+    for(m=0; m<DIM*2; m++)
+    {
+        pos_vec[m] = 0;
+    }
+    
+    i0 = 0;
+    for(icg=0; icg<ncg; icg++)
+    {
+        i1 = cgindex[icg+1];
+        m = move[icg];
+        if (m == -1)
+        {
+            if (bCompact)
+            {
+                /* Compact the home array in place */
+                for(i=i0; i<i1; i++)
+                {
+                    copy_rvec(src[i],src[home_pos++]);
+                }
+            }
+        }
+        else
+        {
+            /* Copy to the communication buffer */
+            nrcg = i1 - i0;
+            /* Skip the leading cg-center entry and the vec earlier vectors */
+            pos_vec[m] += 1 + vec*nrcg;
+            for(i=i0; i<i1; i++)
+            {
+                copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
+            }
+            /* Skip the remaining vectors of this cg */
+            pos_vec[m] += (nvec - vec - 1)*nrcg;
+        }
+        if (!bCompact)
+        {
+            home_pos += i1 - i0;
+        }
+        i0 = i1;
+    }
+    
+    return home_pos;
+}
 +
+/* Compact the per-cg vector src (one rvec per charge group, e.g. cg_cm)
+ * in place for home cgs when bCompact, and copy the value of each moved
+ * cg to the start of its 1 + nvec*nrcg rvec slot in comm->cgcm_state[m].
+ * Returns the number of home charge groups after compaction
+ * (or ncg unchanged when not compacting).
+ */
+static int compact_and_copy_vec_cg(int ncg,int *move,
+                                   int *cgindex,
+                                   int nvec,rvec *src,gmx_domdec_comm_t *comm,
+                                   gmx_bool bCompact)
+{
+    int m,icg,i0,i1,nrcg;
+    int home_pos;
+    int pos_vec[DIM*2];
+    
+    home_pos = 0;
+    
+    for(m=0; m<DIM*2; m++)
+    {
+        pos_vec[m] = 0;
+    }
+    
+    i0 = 0;
+    for(icg=0; icg<ncg; icg++)
+    {
+        i1 = cgindex[icg+1];
+        m = move[icg];
+        if (m == -1)
+        {
+            if (bCompact)
+            {
+                /* Compact the home array in place */
+                copy_rvec(src[icg],src[home_pos++]);
+            }
+        }
+        else
+        {
+            nrcg = i1 - i0;
+            /* Copy to the communication buffer */
+            copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
+            /* Advance past this cg's full slot: center + nvec state vectors */
+            pos_vec[m] += 1 + nrcg*nvec;
+        }
+        i0 = i1;
+    }
+    if (!bCompact)
+    {
+        home_pos = ncg;
+    }
+    
+    return home_pos;
+}
 +
+/* Compact the home charge group index arrays, keeping only cgs with
+ * move[cg] == -1.  For kept atoms the global-to-local lookup (ga2la)
+ * is updated to the new local index; for moved atoms the ga2la entries
+ * are deleted and bLocalCG (when present) is cleared.
+ * Returns the new number of home charge groups.
+ */
+static int compact_ind(int ncg,int *move,
+                       int *index_gl,int *cgindex,
+                       int *gatindex,
+                       gmx_ga2la_t ga2la,char *bLocalCG,
+                       int *cginfo)
+{
+    int cg,nat,a0,a1,a,a_gl;
+    int home_pos;
+
+    home_pos = 0;
+    nat = 0;
+    for(cg=0; cg<ncg; cg++)
+    {
+        a0 = cgindex[cg];
+        a1 = cgindex[cg+1];
+        if (move[cg] == -1)
+        {
+            /* Compact the home arrays in place.
+             * Anything that can be done here avoids access to global arrays.
+             */
+            cgindex[home_pos] = nat;
+            for(a=a0; a<a1; a++)
+            {
+                a_gl = gatindex[a];
+                gatindex[nat] = a_gl;
+                /* The cell number stays 0, so we don't need to set it */
+                ga2la_change_la(ga2la,a_gl,nat);
+                nat++;
+            }
+            index_gl[home_pos] = index_gl[cg];
+            cginfo[home_pos]   = cginfo[cg];
+            /* The charge group remains local, so bLocalCG does not change */
+            home_pos++;
+        }
+        else
+        {
+            /* Clear the global indices */
+            for(a=a0; a<a1; a++)
+            {
+                ga2la_del(ga2la,gatindex[a]);
+            }
+            if (bLocalCG)
+            {
+                bLocalCG[index_gl[cg]] = FALSE;
+            }
+        }
+    }
+    /* Close the cg size index for the compacted set */
+    cgindex[home_pos] = nat;
+    
+    return home_pos;
+}
 +
+/* For every charge group that moves away (move[cg] >= 0): delete the
+ * ga2la entries of its atoms, clear its bLocalCG flag (when present)
+ * and mark it as moved by setting cell_index[cg] = -1.
+ * Used instead of compact_ind when the home arrays are not compacted.
+ */
+static void clear_and_mark_ind(int ncg,int *move,
+                               int *index_gl,int *cgindex,int *gatindex,
+                               gmx_ga2la_t ga2la,char *bLocalCG,
+                               int *cell_index)
+{
+    int cg,a0,a1,a;
+    
+    for(cg=0; cg<ncg; cg++)
+    {
+        if (move[cg] >= 0)
+        {
+            a0 = cgindex[cg];
+            a1 = cgindex[cg+1];
+            /* Clear the global indices */
+            for(a=a0; a<a1; a++)
+            {
+                ga2la_del(ga2la,gatindex[a]);
+            }
+            if (bLocalCG)
+            {
+                bLocalCG[index_gl[cg]] = FALSE;
+            }
+            /* Signal that this cg has moved using the ns cell index.
+             * Here we set it to -1. fill_grid will change it
+             * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
+             */
+            cell_index[cg] = -1;
+        }
+    }
+}
 +
+/* Write diagnostics for charge group cg that moved too far in
+ * dimension dim (dir == 1: forward, else backward) to fplog.
+ * With bHaveLimitdAndCMOld the allowed distance limitd and the old
+ * center of mass cm_old are also printed.
+ */
+static void print_cg_move(FILE *fplog,
+                          gmx_domdec_t *dd,
+                          gmx_large_int_t step,int cg,int dim,int dir,
+                          gmx_bool bHaveLimitdAndCMOld,real limitd,
+                          rvec cm_old,rvec cm_new,real pos_d)
+{
+    gmx_domdec_comm_t *comm;
+    char buf[22];
+
+    comm = dd->comm;
+
+    fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
+    if (bHaveLimitdAndCMOld)
+    {
+        fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
+                ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
+    }
+    else
+    {
+        /* Bug fix: message was missing the word "more" */
+        fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition in direction %c\n",
+                ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
+    }
+    fprintf(fplog,"distance out of cell %f\n",
+            dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
+    if (bHaveLimitdAndCMOld)
+    {
+        fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
+                cm_old[XX],cm_old[YY],cm_old[ZZ]);
+    }
+    fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
+            cm_new[XX],cm_new[YY],cm_new[ZZ]);
+    fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
+            dim2char(dim),
+            comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
+    fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
+            dim2char(dim),
+            comm->cell_x0[dim],comm->cell_x1[dim]);
+}
 +
+/* Report a charge group that moved too far between two domain
+ * decomposition steps: print diagnostics to the log (when open) and
+ * stderr via print_cg_move, then terminate with gmx_fatal.
+ */
+static void cg_move_error(FILE *fplog,
+                          gmx_domdec_t *dd,
+                          gmx_large_int_t step,int cg,int dim,int dir,
+                          gmx_bool bHaveLimitdAndCMOld,real limitd,
+                          rvec cm_old,rvec cm_new,real pos_d)
+{
+    if (fplog)
+    {
+        print_cg_move(fplog, dd,step,cg,dim,dir,
+                      bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
+    }
+    print_cg_move(stderr,dd,step,cg,dim,dir,
+                  bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
+    gmx_fatal(FARGS,
+              "A charge group moved too far between two domain decomposition steps\n"
+              "This usually means that your system is not well equilibrated");
+}
 +
+/* Apply the screw-pbc transformation to atom a for all distributed
+ * state vectors present in state->flags: positions are mirrored in the
+ * box (y -> box_yy - y, z -> box_zz - z, rectangular box only), while
+ * velocity-like vectors have their y and z components negated.
+ * Distance-valued state entries are unaffected.
+ */
+static void rotate_state_atom(t_state *state,int a)
+{
+    int est;
+
+    for(est=0; est<estNR; est++)
+    {
+        if (EST_DISTR(est) && (state->flags & (1<<est))) {
+            switch (est) {
+            case estX:
+                /* Rotate the complete state; for a rectangular box only */
+                state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
+                state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
+                break;
+            case estV:
+                state->v[a][YY] = -state->v[a][YY];
+                state->v[a][ZZ] = -state->v[a][ZZ];
+                break;
+            case estSDX:
+                state->sd_X[a][YY] = -state->sd_X[a][YY];
+                state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
+                break;
+            case estCGP:
+                state->cg_p[a][YY] = -state->cg_p[a][YY];
+                state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
+                break;
+            case estDISRE_INITF:
+            case estDISRE_RM3TAV:
+            case estORIRE_INITF:
+            case estORIRE_DTAV:
+                /* These are distances, so not affected by rotation */
+                break;
+            default:
+                gmx_incons("Unknown state entry encountered in rotate_state_atom");            
+            }
+        }
+    }
+}
 +
+/* Return comm->moved, reallocated (contents preserved by srenew) when
+ * it cannot hold natoms entries.
+ */
+static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
+{
+    if (natoms > comm->moved_nalloc)
+    {
+        /* Contents should be preserved here */
+        comm->moved_nalloc = over_alloc_dd(natoms);
+        srenew(comm->moved,comm->moved_nalloc);
+    }
+
+    return comm->moved;
+}
 +
+/* For the home charge groups cg_start up to cg_end: compute the center
+ * of geometry, put it (and the atom coordinates in state->x) back in
+ * the unit cell, store the new center in cg_cm[cg] and encode in
+ * move[cg] the target communication direction index plus the DD_FLAG
+ * bits, or -1 when the cg stays in this cell.  Calls cg_move_error
+ * when a cg moved further than the allowed limit.
+ * The range [cg_start,cg_end) allows this to run per OpenMP thread.
+ */
+static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
+                         gmx_domdec_t *dd,
+                         t_state *state,
+                         ivec tric_dir,matrix tcm,
+                         rvec cell_x0,rvec cell_x1,
+                         rvec limitd,rvec limit0,rvec limit1,
+                         const int *cgindex,
+                         int cg_start,int cg_end,
+                         rvec *cg_cm,
+                         int *move)
+{
+    int  npbcdim;
+    int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
+    int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
+    int  flag;
+    gmx_bool bScrew;
+    ivec dev;
+    real inv_ncg,pos_d;
+    rvec cm_new;
+
+    npbcdim = dd->npbcdim;
+
+    for(cg=cg_start; cg<cg_end; cg++)
+    {
+        /* Compute the center of geometry of this charge group */
+        k0   = cgindex[cg];
+        k1   = cgindex[cg+1];
+        nrcg = k1 - k0;
+        if (nrcg == 1)
+        {
+            copy_rvec(state->x[k0],cm_new);
+        }
+        else
+        {
+            inv_ncg = 1.0/nrcg;
+            
+            clear_rvec(cm_new);
+            for(k=k0; (k<k1); k++)
+            {
+                rvec_inc(cm_new,state->x[k]);
+            }
+            for(d=0; (d<DIM); d++)
+            {
+                cm_new[d] = inv_ncg*cm_new[d];
+            }
+        }
+        
+        clear_ivec(dev);
+        /* Do pbc and check DD cell boundary crossings */
+        for(d=DIM-1; d>=0; d--)
+        {
+            if (dd->nc[d] > 1)
+            {
+                bScrew = (dd->bScrewPBC && d == XX);
+                /* Determine the location of this cg in lattice coordinates */
+                pos_d = cm_new[d];
+                if (tric_dir[d])
+                {
+                    for(d2=d+1; d2<DIM; d2++)
+                    {
+                        pos_d += cm_new[d2]*tcm[d2][d];
+                    }
+                }
+                /* Put the charge group in the triclinic unit-cell */
+                if (pos_d >= cell_x1[d])
+                {
+                    /* Moved out forward; error out when past the allowed limit */
+                    if (pos_d >= limit1[d])
+                    {
+                        cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
+                                      cg_cm[cg],cm_new,pos_d);
+                    }
+                    dev[d] = 1;
+                    if (dd->ci[d] == dd->nc[d] - 1)
+                    {
+                        rvec_dec(cm_new,state->box[d]);
+                        if (bScrew)
+                        {
+                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
+                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
+                        }
+                        for(k=k0; (k<k1); k++)
+                        {
+                            rvec_dec(state->x[k],state->box[d]);
+                            if (bScrew)
+                            {
+                                rotate_state_atom(state,k);
+                            }
+                        }
+                    }
+                }
+                else if (pos_d < cell_x0[d])
+                {
+                    /* Moved out backward; error out when past the allowed limit */
+                    if (pos_d < limit0[d])
+                    {
+                        cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
+                                      cg_cm[cg],cm_new,pos_d);
+                    }
+                    dev[d] = -1;
+                    if (dd->ci[d] == 0)
+                    {
+                        rvec_inc(cm_new,state->box[d]);
+                        if (bScrew)
+                        {
+                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
+                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
+                        }
+                        for(k=k0; (k<k1); k++)
+                        {
+                            rvec_inc(state->x[k],state->box[d]);
+                            if (bScrew)
+                            {
+                                rotate_state_atom(state,k);
+                            }
+                        }
+                    }
+                }
+            }
+            else if (d < npbcdim)
+            {
+                /* Put the charge group in the rectangular unit-cell */
+                while (cm_new[d] >= state->box[d][d])
+                {
+                    rvec_dec(cm_new,state->box[d]);
+                    for(k=k0; (k<k1); k++)
+                    {
+                        rvec_dec(state->x[k],state->box[d]);
+                    }
+                }
+                while (cm_new[d] < 0)
+                {
+                    rvec_inc(cm_new,state->box[d]);
+                    for(k=k0; (k<k1); k++)
+                    {
+                        rvec_inc(state->x[k],state->box[d]);
+                    }
+                }
+            }
+        }
+    
+        copy_rvec(cm_new,cg_cm[cg]);
+        
+        /* Determine where this cg should go */
+        flag = 0;
+        mc = -1;
+        for(d=0; d<dd->ndim; d++)
+        {
+            dim = dd->dim[d];
+            if (dev[dim] == 1)
+            {
+                flag |= DD_FLAG_FW(d);
+                if (mc == -1)
+                {
+                    mc = d*2;
+                }
+            }
+            else if (dev[dim] == -1)
+            {
+                flag |= DD_FLAG_BW(d);
+                if (mc == -1) {
+                    /* With only 2 cells backward and forward are the same */
+                    if (dd->nc[dim] > 2)
+                    {
+                        mc = d*2 + 1;
+                    }
+                    else
+                    {
+                        mc = d*2;
+                    }
+                }
+            }
+        }
+        /* Temporarily store the flag in move */
+        move[cg] = mc + flag;
+    }
+}
 +
 +static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
 +                               gmx_domdec_t *dd,ivec tric_dir,
 +                               t_state *state,rvec **f,
 +                               t_forcerec *fr,t_mdatoms *md,
 +                               gmx_bool bCompact,
 +                               t_nrnb *nrnb,
 +                               int *ncg_stay_home,
 +                               int *ncg_moved)
 +{
 +    int  *move;
 +    int  npbcdim;
 +    int  ncg[DIM*2],nat[DIM*2];
 +    int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
 +    int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
 +    int  sbuf[2],rbuf[2];
 +    int  home_pos_cg,home_pos_at,buf_pos;
 +    int  flag;
 +    gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
 +    gmx_bool bScrew;
 +    ivec dev;
 +    real inv_ncg,pos_d;
 +    matrix tcm;
 +    rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
 +    atom_id *cgindex;
 +    cginfo_mb_t *cginfo_mb;
 +    gmx_domdec_comm_t *comm;
 +    int  *moved;
 +    int  nthread,thread;
 +    
 +    if (dd->bScrewPBC)
 +    {
 +        check_screw_box(state->box);
 +    }
 +    
 +    comm  = dd->comm;
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        cg_cm = fr->cg_cm;
 +    }
 +    
 +    for(i=0; i<estNR; i++)
 +    {
 +        if (EST_DISTR(i))
 +        {
 +            switch (i)
 +            {
 +            case estX:   /* Always present */            break;
 +            case estV:   bV   = (state->flags & (1<<i)); break;
 +            case estSDX: bSDX = (state->flags & (1<<i)); break;
 +            case estCGP: bCGP = (state->flags & (1<<i)); break;
 +            case estLD_RNG:
 +            case estLD_RNGI:
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* No processing required */
 +                break;
 +            default:
 +            gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
 +            }
 +        }
 +    }
 +    
 +    if (dd->ncg_tot > comm->nalloc_int)
 +    {
 +        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
 +        srenew(comm->buf_int,comm->nalloc_int);
 +    }
 +    move = comm->buf_int;
 +    
 +    /* Clear the count */
 +    for(c=0; c<dd->ndim*2; c++)
 +    {
 +        ncg[c] = 0;
 +        nat[c] = 0;
 +    }
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for(d=0; (d<DIM); d++)
 +    {
 +        limitd[d] = dd->comm->cellsize_min[d];
 +        if (d >= npbcdim && dd->ci[d] == 0)
 +        {
 +            cell_x0[d] = -GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x0[d] = comm->cell_x0[d];
 +        }
 +        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
 +        {
 +            cell_x1[d] = GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x1[d] = comm->cell_x1[d];
 +        }
 +        if (d < npbcdim)
 +        {
 +            limit0[d] = comm->old_cell_x0[d] - limitd[d];
 +            limit1[d] = comm->old_cell_x1[d] + limitd[d];
 +        }
 +        else
 +        {
 +            /* We check after communication if a charge group moved
 +             * more than one cell. Set the pre-comm check limit to float_max.
 +             */
 +            limit0[d] = -GMX_FLOAT_MAX;
 +            limit1[d] =  GMX_FLOAT_MAX;
 +        }
 +    }
 +    
 +    make_tric_corr_matrix(npbcdim,state->box,tcm);
 +    
 +    cgindex = dd->cgindex;
 +
 +    nthread = gmx_omp_nthreads_get(emntDomdec);
 +
 +    /* Compute the center of geometry for all home charge groups
 +     * and put them in the box and determine where they should go.
 +     */
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
 +                     cell_x0,cell_x1,limitd,limit0,limit1,
 +                     cgindex,
 +                     ( thread   *dd->ncg_home)/nthread,
 +                     ((thread+1)*dd->ncg_home)/nthread,
 +                     fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
 +                     move);
 +    }
 +
 +    for(cg=0; cg<dd->ncg_home; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            mc = move[cg];
 +            flag     = mc & ~DD_FLAG_NRCG;
 +            mc       = mc & DD_FLAG_NRCG;
 +            move[cg] = mc;
 +
 +            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +            {
 +                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +            }
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
 +            /* We store the cg size in the lower 16 bits
 +             * and the place where the charge group should go
 +             * in the next 6 bits. This saves some communication volume.
 +             */
 +            nrcg = cgindex[cg+1] - cgindex[cg];
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
 +            ncg[mc] += 1;
 +            nat[mc] += nrcg;
 +        }
 +    }
 +    
 +    inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 +    inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
 +
 +    *ncg_moved = 0;
 +    for(i=0; i<dd->ndim*2; i++)
 +    {
 +        *ncg_moved += ncg[i];
 +    }
 +    
 +    nvec = 1;
 +    if (bV)
 +    {
 +        nvec++;
 +    }
 +    if (bSDX)
 +    {
 +        nvec++;
 +    }
 +    if (bCGP)
 +    {
 +        nvec++;
 +    }
 +    
 +    /* Make sure the communication buffers are large enough */
 +    for(mc=0; mc<dd->ndim*2; mc++)
 +    {
 +        nvr = ncg[mc] + nat[mc]*nvec;
 +        if (nvr > comm->cgcm_state_nalloc[mc])
 +        {
 +            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
 +            srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
 +        }
 +    }
 +    
 +    switch (fr->cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        /* Recalculating cg_cm might be cheaper than communicating,
 +         * but that could give rise to rounding issues.
 +         */
 +        home_pos_cg =
 +            compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
 +                                    nvec,cg_cm,comm,bCompact);
 +    break;
 +    case ecutsVERLET:
 +        /* Without charge groups we send the moved atom coordinates
 +         * over twice. This is so the code below can be used without
 +         * many conditionals for both for with and without charge groups.
 +         */
 +        home_pos_cg =
 +            compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
 +                                    nvec,state->x,comm,FALSE);
 +        if (bCompact)
 +        {
 +            home_pos_cg -= *ncg_moved;
 +        }
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +        home_pos_cg = 0;
 +    }
 +    
 +    vec = 0;
 +    home_pos_at =
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->x,comm,bCompact);
 +    if (bV)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->v,comm,bCompact);
 +    }
 +    if (bSDX)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->sd_X,comm,bCompact);
 +    }
 +    if (bCGP)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->cg_p,comm,bCompact);
 +    }
 +    
 +    if (bCompact)
 +    {
 +        compact_ind(dd->ncg_home,move,
 +                    dd->index_gl,dd->cgindex,dd->gatindex,
 +                    dd->ga2la,comm->bLocalCG,
 +                    fr->cginfo);
 +    }
 +    else
 +    {
 +        if (fr->cutoff_scheme == ecutsVERLET)
 +        {
 +            moved = get_moved(comm,dd->ncg_home);
 +
 +            for(k=0; k<dd->ncg_home; k++)
 +            {
 +                moved[k] = 0;
 +            }
 +        }
 +        else
 +        {
 +            moved = fr->ns.grid->cell_index;
 +        }
 +
 +        clear_and_mark_ind(dd->ncg_home,move,
 +                           dd->index_gl,dd->cgindex,dd->gatindex,
 +                           dd->ga2la,comm->bLocalCG,
 +                           moved);
 +    }
 +    
 +    cginfo_mb = fr->cginfo_mb;
 +
 +    *ncg_stay_home = home_pos_cg;
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        ncg_recv = 0;
 +        nat_recv = 0;
 +        nvr      = 0;
 +        for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
 +        {
 +            cdd = d*2 + dir;
 +            /* Communicate the cg and atom counts */
 +            sbuf[0] = ncg[cdd];
 +            sbuf[1] = nat[cdd];
 +            if (debug)
 +            {
 +                fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
 +                        d,dir,sbuf[0],sbuf[1]);
 +            }
 +            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
 +            
 +            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
 +            {
 +                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
 +                srenew(comm->buf_int,comm->nalloc_int);
 +            }
 +            
 +            /* Communicate the charge group indices, sizes and flags */
 +            dd_sendrecv_int(dd, d, dir,
 +                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
 +                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
 +            
 +            nvs = ncg[cdd] + nat[cdd]*nvec;
 +            i   = rbuf[0]  + rbuf[1] *nvec;
 +            vec_rvec_check_alloc(&comm->vbuf,nvr+i);
 +            
 +            /* Communicate cgcm and state */
 +            dd_sendrecv_rvec(dd, d, dir,
 +                             comm->cgcm_state[cdd], nvs,
 +                             comm->vbuf.v+nvr, i);
 +            ncg_recv += rbuf[0];
 +            nat_recv += rbuf[1];
 +            nvr      += i;
 +        }
 +        
 +        /* Process the received charge groups */
 +        buf_pos = 0;
 +        for(cg=0; cg<ncg_recv; cg++)
 +        {
 +            flag = comm->buf_int[cg*DD_CGIBS+1];
 +
 +            if (dim >= npbcdim && dd->nc[dim] > 2)
 +            {
 +                /* No pbc in this dim and more than one domain boundary.
 +                 * We do a separate check if a charge group didn't move too far.
 +                 */
 +                if (((flag & DD_FLAG_FW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
 +                    ((flag & DD_FLAG_BW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
 +                {
 +                    cg_move_error(fplog,dd,step,cg,dim,
 +                                  (flag & DD_FLAG_FW(d)) ? 1 : 0,
 +                                   FALSE,0,
 +                                   comm->vbuf.v[buf_pos],
 +                                   comm->vbuf.v[buf_pos],
 +                                   comm->vbuf.v[buf_pos][dim]);
 +                }
 +            }
 +
 +            mc = -1;
 +            if (d < dd->ndim-1)
 +            {
 +                /* Check which direction this cg should go */
 +                for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
 +                {
 +                    if (dd->bGridJump)
 +                    {
 +                        /* The cell boundaries for dimension d2 are not equal
 +                         * for each cell row of the lower dimension(s),
 +                         * therefore we might need to redetermine where
 +                         * this cg should go.
 +                         */
 +                        dim2 = dd->dim[d2];
 +                        /* If this cg crosses the box boundary in dimension d2
 +                         * we can use the communicated flag, so we do not
 +                         * have to worry about pbc.
 +                         */
 +                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
 +                               (flag & DD_FLAG_FW(d2))) ||
 +                              (dd->ci[dim2] == 0 &&
 +                               (flag & DD_FLAG_BW(d2)))))
 +                        {
 +                            /* Clear the two flags for this dimension */
 +                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
 +                            /* Determine the location of this cg
 +                             * in lattice coordinates
 +                             */
 +                            pos_d = comm->vbuf.v[buf_pos][dim2];
 +                            if (tric_dir[dim2])
 +                            {
 +                                for(d3=dim2+1; d3<DIM; d3++)
 +                                {
 +                                    pos_d +=
 +                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
 +                                }
 +                            }
 +                            /* Check of we are not at the box edge.
 +                             * pbc is only handled in the first step above,
 +                             * but this check could move over pbc while
 +                             * the first step did not due to different rounding.
 +                             */
 +                            if (pos_d >= cell_x1[dim2] &&
 +                                dd->ci[dim2] != dd->nc[dim2]-1)
 +                            {
 +                                flag |= DD_FLAG_FW(d2);
 +                            }
 +                            else if (pos_d < cell_x0[dim2] &&
 +                                     dd->ci[dim2] != 0)
 +                            {
 +                                flag |= DD_FLAG_BW(d2);
 +                            }
 +                            comm->buf_int[cg*DD_CGIBS+1] = flag;
 +                        }
 +                    }
 +                    /* Set to which neighboring cell this cg should go */
 +                    if (flag & DD_FLAG_FW(d2))
 +                    {
 +                        mc = d2*2;
 +                    }
 +                    else if (flag & DD_FLAG_BW(d2))
 +                    {
 +                        if (dd->nc[dd->dim[d2]] > 2)
 +                        {
 +                            mc = d2*2+1;
 +                        }
 +                        else
 +                        {
 +                            mc = d2*2;
 +                        }
 +                    }
 +                }
 +            }
 +            
 +            nrcg = flag & DD_FLAG_NRCG;
 +            if (mc == -1)
 +            {
 +                if (home_pos_cg+1 > dd->cg_nalloc)
 +                {
 +                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
 +                    srenew(dd->index_gl,dd->cg_nalloc);
 +                    srenew(dd->cgindex,dd->cg_nalloc+1);
 +                }
 +                /* Set the global charge group index and size */
 +                dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
 +                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
 +                /* Copy the state from the buffer */
 +                dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
 +                if (fr->cutoff_scheme == ecutsGROUP)
 +                {
 +                    cg_cm = fr->cg_cm;
 +                    copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
 +                }
 +                buf_pos++;
 +
 +                /* Set the cginfo */
 +                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
 +                                                   dd->index_gl[home_pos_cg]);
 +                if (comm->bLocalCG)
 +                {
 +                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
 +                }
 +
 +                if (home_pos_at+nrcg > state->nalloc)
 +                {
 +                    dd_realloc_state(state,f,home_pos_at+nrcg);
 +                }
 +                for(i=0; i<nrcg; i++)
 +                {
 +                    copy_rvec(comm->vbuf.v[buf_pos++],
 +                              state->x[home_pos_at+i]);
 +                }
 +                if (bV)
 +                {
 +                    for(i=0; i<nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->v[home_pos_at+i]);
 +                    }
 +                }
 +                if (bSDX)
 +                {
 +                    for(i=0; i<nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->sd_X[home_pos_at+i]);
 +                    }
 +                }
 +                if (bCGP)
 +                {
 +                    for(i=0; i<nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->cg_p[home_pos_at+i]);
 +                    }
 +                }
 +                home_pos_cg += 1;
 +                home_pos_at += nrcg;
 +            }
 +            else
 +            {
 +                /* Reallocate the buffers if necessary  */
 +                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +                {
 +                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                    srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +                }
 +                nvr = ncg[mc] + nat[mc]*nvec;
 +                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
 +                {
 +                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
 +                    srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
 +                }
 +                /* Copy from the receive to the send buffers */
 +                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
 +                       comm->buf_int + cg*DD_CGIBS,
 +                       DD_CGIBS*sizeof(int));
 +                memcpy(comm->cgcm_state[mc][nvr],
 +                       comm->vbuf.v[buf_pos],
 +                       (1+nrcg*nvec)*sizeof(rvec));
 +                buf_pos += 1 + nrcg*nvec;
 +                ncg[mc] += 1;
 +                nat[mc] += nrcg;
 +            }
 +        }
 +    }
 +    
 +    /* With sorting (!bCompact) the indices are now only partially up to date
 +     * and ncg_home and nat_home are not the real count, since there are
 +     * "holes" in the arrays for the charge groups that moved to neighbors.
 +     */
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        moved = get_moved(comm,home_pos_cg);
 +
 +        for(i=dd->ncg_home; i<home_pos_cg; i++)
 +        {
 +            moved[i] = 0;
 +        }
 +    }
 +    dd->ncg_home = home_pos_cg;
 +    dd->nat_home = home_pos_at;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Finished repartitioning: cgs moved out %d, new home %d\n",
 +                *ncg_moved,dd->ncg_home-*ncg_moved);
 +                
 +    }
 +}
 +
 +void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
 +{
 +    dd->comm->cycl[ddCycl] += cycles;
 +    dd->comm->cycl_n[ddCycl]++;
 +    if (cycles > dd->comm->cycl_max[ddCycl])
 +    {
 +        dd->comm->cycl_max[ddCycl] = cycles;
 +    }
 +}
 +
 +static double force_flop_count(t_nrnb *nrnb)
 +{
 +    int i;
 +    double sum;
 +    const char *name;
 +
 +    sum = 0;
 +    for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
 +    {
 +        /* To get closer to the real timings, we half the count
 +         * for the normal loops and again half it for water loops.
 +         */
 +        name = nrnb_str(i);
 +        if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*0.25*cost_nrnb(i);
 +        }
 +        else
 +        {
 +            sum += nrnb->n[i]*0.50*cost_nrnb(i);
 +        }
 +    }
 +    for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
 +    {
 +        name = nrnb_str(i);
 +        if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
 +        sum += nrnb->n[i]*cost_nrnb(i);
 +    }
 +    for(i=eNR_BONDS; i<=eNR_WALLS; i++)
 +    {
 +        sum += nrnb->n[i]*cost_nrnb(i);
 +    }
 +
 +    return sum;
 +}
 +
 +void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop -= force_flop_count(nrnb);
 +    }
 +}
 +void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop += force_flop_count(nrnb);
 +        dd->comm->flop_n++;
 +    }
 +}  
 +
 +static void clear_dd_cycle_counts(gmx_domdec_t *dd)
 +{
 +    int i;
 +    
 +    for(i=0; i<ddCyclNr; i++)
 +    {
 +        dd->comm->cycl[i] = 0;
 +        dd->comm->cycl_n[i] = 0;
 +        dd->comm->cycl_max[i] = 0;
 +    }
 +    dd->comm->flop = 0;
 +    dd->comm->flop_n = 0;
 +}
 +
 +/* Collect the per-node load measurements and reduce them row by row
 + * along each DD dimension (last dimension first) to the row roots;
 + * the DD master finally accumulates the global load statistics used
 + * for reporting and dynamic load balancing.
 + * The sbuf packing order here must match the unpacking order in the
 + * root branch below.
 + */
 +static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_load_t *load;
 +    gmx_domdec_root_t *root=NULL;
 +    int  d,dim,cid,i,pos;
 +    float cell_frac=0,sbuf[DD_NLOAD_MAX];
 +    gmx_bool bSepPME;
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"get_load_distribution start\n");
 +    }
 +
 +    wallcycle_start(wcycle,ewcDDCOMMLOAD);
 +    
 +    comm = dd->comm;
 +    
 +    /* pme_nodeid >= 0 indicates this PP node has a separate PME peer */
 +    bSepPME = (dd->pme_nodeid >= 0);
 +    
 +    for(d=dd->ndim-1; d>=0; d--)
 +    {
 +        dim = dd->dim[d];
 +        /* Check if we participate in the communication in this dimension */
 +        if (d == dd->ndim-1 || 
 +            (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
 +        {
 +            load = &comm->load[d];
 +            if (dd->bGridJump)
 +            {
 +                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
 +            }
 +            pos = 0;
 +            if (d == dd->ndim-1)
 +            {
 +                /* Innermost dimension: measure our own force load */
 +                sbuf[pos++] = dd_force_load(comm);
 +                sbuf[pos++] = sbuf[0];
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = sbuf[0];
 +                    sbuf[pos++] = cell_frac;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 +                    sbuf[pos++] = comm->cycl[ddCyclPME];
 +                }
 +            }
 +            else
 +            {
 +                /* Forward the already-reduced load of the next dimension */
 +                sbuf[pos++] = comm->load[d+1].sum;
 +                sbuf[pos++] = comm->load[d+1].max;
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].sum_m;
 +                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
 +                    sbuf[pos++] = comm->load[d+1].flags;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].mdf;
 +                    sbuf[pos++] = comm->load[d+1].pme;
 +                }
 +            }
 +            load->nload = pos;
 +            /* Communicate a row in DD direction d.
 +             * The communicators are setup such that the root always has rank 0.
 +             */
 +#ifdef GMX_MPI
 +            MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
 +                       load->load,load->nload*sizeof(float),MPI_BYTE,
 +                       0,comm->mpi_comm_load[d]);
 +#endif
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* We are the root, process this row */
 +                if (comm->bDynLoadBal)
 +                {
 +                    root = comm->root[d];
 +                }
 +                /* NOTE(review): root stays NULL without dynamic load
 +                 * balancing; the bGridJump branch below dereferences it,
 +                 * so bGridJump presumably implies bDynLoadBal — confirm.
 +                 */
 +                load->sum = 0;
 +                load->max = 0;
 +                load->sum_m = 0;
 +                load->cvol_min = 1;
 +                load->flags = 0;
 +                load->mdf = 0;
 +                load->pme = 0;
 +                pos = 0;
 +                /* Unpack one entry per cell along this dimension,
 +                 * mirroring the packing order above.
 +                 */
 +                for(i=0; i<dd->nc[dim]; i++)
 +                {
 +                    load->sum += load->load[pos++];
 +                    load->max = max(load->max,load->load[pos]);
 +                    pos++;
 +                    if (dd->bGridJump)
 +                    {
 +                        if (root->bLimited)
 +                        {
 +                            /* This direction could not be load balanced properly,
 +                             * therefore we need to use the maximum iso the average load.
 +                             */
 +                            load->sum_m = max(load->sum_m,load->load[pos]);
 +                        }
 +                        else
 +                        {
 +                            load->sum_m += load->load[pos];
 +                        }
 +                        pos++;
 +                        load->cvol_min = min(load->cvol_min,load->load[pos]);
 +                        pos++;
 +                        if (d < dd->ndim-1)
 +                        {
 +                            load->flags = (int)(load->load[pos++] + 0.5);
 +                        }
 +                        if (d > 0)
 +                        {
 +                            root->cell_f_max0[i] = load->load[pos++];
 +                            root->cell_f_min1[i] = load->load[pos++];
 +                        }
 +                    }
 +                    if (bSepPME)
 +                    {
 +                        load->mdf = max(load->mdf,load->load[pos]);
 +                        pos++;
 +                        load->pme = max(load->pme,load->load[pos]);
 +                        pos++;
 +                    }
 +                }
 +                if (comm->bDynLoadBal && root->bLimited)
 +                {
 +                    load->sum_m *= dd->nc[dim];
 +                    load->flags |= (1<<d);
 +                }
 +            }
 +        }
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        /* Accumulate the global statistics on the DD master */
 +        comm->nload      += dd_load_count(comm);
 +        comm->load_step  += comm->cycl[ddCyclStep];
 +        comm->load_sum   += comm->load[0].sum;
 +        comm->load_max   += comm->load[0].max;
 +        if (comm->bDynLoadBal)
 +        {
 +            for(d=0; d<dd->ndim; d++)
 +            {
 +                if (comm->load[0].flags & (1<<d))
 +                {
 +                    comm->load_lim[d]++;
 +                }
 +            }
 +        }
 +        if (bSepPME)
 +        {
 +            comm->load_mdf += comm->load[0].mdf;
 +            comm->load_pme += comm->load[0].pme;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle,ewcDDCOMMLOAD);
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"get_load_distribution finished\n");
 +    }
 +}
 +
 +static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
 +{
 +    /* Return the relative performance loss on the total run time
 +     * due to the force calculation load imbalance.
 +     */
 +    if (dd->comm->nload > 0)
 +    {
 +        return
 +            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
 +            (dd->comm->load_step*dd->nnodes);
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +/* Print the average DD load-balance statistics at the end of the run:
 + * force imbalance, DLB limitation per dimension and PP/PME balance,
 + * both to the log file and to stderr, plus performance-loss notes
 + * when the loss exceeds DD_PERF_LOSS.
 + * Only the DD master with collected load samples prints anything.
 + */
 +static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
 +{
 +    char  buf[STRLEN];
 +    int   npp,npme,nnodes,d,limp;
 +    float imbal,pme_f_ratio,lossf,lossp=0;
 +    gmx_bool  bLim;
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = dd->comm;
 +    if (DDMASTER(dd) && comm->nload > 0)
 +    {
 +        npp    = dd->nnodes;
 +        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
 +        nnodes = npp + npme;
 +        /* Average imbalance: max load over average load, minus one */
 +        imbal = comm->load_max*npp/comm->load_sum - 1;
 +        lossf = dd_force_imb_perf_loss(dd);
 +        sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
 +        fprintf(fplog,"%s",buf);
 +        fprintf(stderr,"\n");
 +        fprintf(stderr,"%s",buf);
 +        sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
 +        fprintf(fplog,"%s",buf);
 +        fprintf(stderr,"%s",buf);
 +        bLim = FALSE;
 +        if (comm->bDynLoadBal)
 +        {
 +            /* Report, per dimension, the percentage of steps where
 +             * DLB was limited by the minimum cell size settings.
 +             */
 +            sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
 +            for(d=0; d<dd->ndim; d++)
 +            {
 +                /* Rounded percentage of limited steps */
 +                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
 +                sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
 +                if (limp >= 50)
 +                {
 +                    bLim = TRUE;
 +                }
 +            }
 +            sprintf(buf+strlen(buf),"\n");
 +            fprintf(fplog,"%s",buf);
 +            fprintf(stderr,"%s",buf);
 +        }
 +        if (npme > 0)
 +        {
 +            /* PP/PME balance: scale the loss by the fraction of nodes
 +             * that are idle, i.e. PME nodes when PP has more work and
 +             * vice versa.
 +             */
 +            pme_f_ratio = comm->load_pme/comm->load_mdf;
 +            lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
 +            if (lossp <= 0)
 +            {
 +                lossp *= (float)npme/(float)nnodes;
 +            }
 +            else
 +            {
 +                lossp *= (float)npp/(float)nnodes;
 +            }
 +            sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
 +            fprintf(fplog,"%s",buf);
 +            fprintf(stderr,"%s",buf);
 +            sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
 +            fprintf(fplog,"%s",buf);
 +            fprintf(stderr,"%s",buf);
 +        }
 +        fprintf(fplog,"\n");
 +        fprintf(stderr,"\n");
 +        
 +        if (lossf >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% performance was lost due to load imbalance\n"
 +                    "      in the domain decomposition.\n",lossf*100);
 +            if (!comm->bDynLoadBal)
 +            {
 +                sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
 +            }
 +            else if (bLim)
 +            {
 +                sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
 +            }
 +            fprintf(fplog,"%s\n",buf);
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
 +                    "      had %s work to do than the PP nodes.\n"
 +                    "      You might want to %s the number of PME nodes\n"
 +                    "      or %s the cut-off and the grid spacing.\n",
 +                    fabs(lossp*100),
 +                    (lossp < 0) ? "less"     : "more",
 +                    (lossp < 0) ? "decrease" : "increase",
 +                    (lossp < 0) ? "decrease" : "increase");
 +            fprintf(fplog,"%s\n",buf);
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +    }
 +}
 +
 +static float dd_vol_min(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].cvol_min*dd->nnodes;
 +}
 +
 +static gmx_bool dd_load_flags(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].flags;
 +}
 +
 +static float dd_f_imbal(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
 +}
 +
 +float dd_pme_f_ratio(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->cycl_n[ddCyclPME] > 0)
 +    {
 +        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
 +    }
 +    else
 +    {
 +        return -1.0;
 +    }
 +}
 +
 +/* Write a per-step DD load summary to the log file: which dimensions
 + * limited the load balancing, the minimum/average cell volume (with
 + * DLB), the force imbalance and the PME/force load ratio.
 + */
 +static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
 +{
 +    int flags,d;
 +    char buf[22];
 +    
 +    flags = dd_load_flags(dd);
 +    if (flags)
 +    {
 +        fprintf(fplog,
 +                "DD  load balancing is limited by minimum cell size in dimension");
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            if (flags & (1<<d))
 +            {
 +                fprintf(fplog," %c",dim2char(dd->dim[d]));
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        /* '!' marks steps where the balancing was limited */
 +        fprintf(fplog,"  vol min/aver %5.3f%c",
 +                dd_vol_min(dd),flags ? '!' : ' ');
 +    }
 +    fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
 +    }
 +    fprintf(fplog,"\n\n");
 +}
 +
 +static void dd_print_load_verbose(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(stderr,"vol %4.2f%c ",
 +                dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
 +    }
 +    fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
 +{
 +    MPI_Comm  c_row;
 +    int  dim, i, rank;
 +    ivec loc_c;
 +    gmx_domdec_root_t *root;
 +    gmx_bool bPartOfGroup = FALSE;
 +    
 +    dim = dd->dim[dim_ind];
 +    copy_ivec(loc,loc_c);
 +    for(i=0; i<dd->nc[dim]; i++)
 +    {
 +        loc_c[dim] = i;
 +        rank = dd_index(dd->nc,loc_c);
 +        if (rank == dd->rank)
 +        {
 +            /* This process is part of the group */
 +            bPartOfGroup = TRUE;
 +        }
 +    }
 +    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
 +                   &c_row);
 +    if (bPartOfGroup)
 +    {
 +        dd->comm->mpi_comm_load[dim_ind] = c_row;
 +        if (dd->comm->eDLB != edlbNO)
 +        {
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* This is the root process of this row */
 +                snew(dd->comm->root[dim_ind],1);
 +                root = dd->comm->root[dim_ind];
 +                snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
 +                snew(root->old_cell_f,dd->nc[dim]+1);
 +                snew(root->bCellMin,dd->nc[dim]);
 +                if (dim_ind > 0)
 +                {
 +                    snew(root->cell_f_max0,dd->nc[dim]);
 +                    snew(root->cell_f_min1,dd->nc[dim]);
 +                    snew(root->bound_min,dd->nc[dim]);
 +                    snew(root->bound_max,dd->nc[dim]);
 +                }
 +                snew(root->buf_ncd,dd->nc[dim]);
 +            }
 +            else
 +            {
 +                /* This is not a root process, we only need to receive cell_f */
 +                snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
 +            }
 +        }
 +        if (dd->ci[dim] == dd->master_ci[dim])
 +        {
 +            snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
 +        }
 +    }
 +}
 +#endif
 +
 +static void make_load_communicators(gmx_domdec_t *dd)
 +{
 +#ifdef GMX_MPI
 +  int  dim0,dim1,i,j;
 +  ivec loc;
 +
 +  if (debug)
 +    fprintf(debug,"Making load communicators\n");
 +
 +  snew(dd->comm->load,dd->ndim);
 +  snew(dd->comm->mpi_comm_load,dd->ndim);
 +  
 +  clear_ivec(loc);
 +  make_load_communicator(dd,0,loc);
 +  if (dd->ndim > 1) {
 +    dim0 = dd->dim[0];
 +    for(i=0; i<dd->nc[dim0]; i++) {
 +      loc[dim0] = i;
 +      make_load_communicator(dd,1,loc);
 +    }
 +  }
 +  if (dd->ndim > 2) {
 +    dim0 = dd->dim[0];
 +    for(i=0; i<dd->nc[dim0]; i++) {
 +      loc[dim0] = i;
 +      dim1 = dd->dim[1];
 +      for(j=0; j<dd->nc[dim1]; j++) {
 +        loc[dim1] = j;
 +        make_load_communicator(dd,2,loc);
 +      }
 +    }
 +  }
 +
 +  if (debug)
 +    fprintf(debug,"Finished making load communicators\n");
 +#endif
 +}
 +
 +/* Set up the DD grid topology: the forward/backward neighbor rank in
 + * each DD dimension, the communication zones with their shifts, and
 + * the i-zone ranges used for pair searching. Also allocates the DLB
 + * root pointers and creates the load communicators when needed.
 + */
 +void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
 +{
 +    gmx_bool bZYX;  /* NOTE(review): declared but never used */
 +    int  d,dim,i,j,m;
 +    ivec tmp,s;
 +    int  nzone,nzonep;
 +    ivec dd_zp[DD_MAXIZONE];
 +    gmx_domdec_zones_t *zones;
 +    gmx_domdec_ns_ranges_t *izone;
 +    
 +    /* Determine the +1/-1 neighbor rank in every DD dimension,
 +     * with periodic wrapping of the cell index.
 +     */
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        copy_ivec(dd->ci,tmp);
 +        tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
 +        dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
 +        copy_ivec(dd->ci,tmp);
 +        tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
 +        dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
 +        if (debug)
 +        {
 +            fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
 +                    dd->rank,dim,
 +                    dd->neighbor[d][0],
 +                    dd->neighbor[d][1]);
 +        }
 +    }
 +    
 +    if (DDMASTER(dd))
 +    {
 +        fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
 +          dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
 +                dd->ndim,
 +                dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
 +                dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    }
 +    /* Select the zone count and i-zone table for this dimensionality */
 +    switch (dd->ndim)
 +    {
 +    case 3:
 +        nzone  = dd_z3n;
 +        nzonep = dd_zp3n;
 +        for(i=0; i<nzonep; i++)
 +        {
 +            copy_ivec(dd_zp3[i],dd_zp[i]);
 +        }
 +        break;
 +    case 2:
 +        nzone  = dd_z2n;
 +        nzonep = dd_zp2n;
 +        for(i=0; i<nzonep; i++)
 +        {
 +            copy_ivec(dd_zp2[i],dd_zp[i]);
 +        }
 +        break;
 +    case 1:
 +        nzone  = dd_z1n;
 +        nzonep = dd_zp1n;
 +        for(i=0; i<nzonep; i++)
 +        {
 +            copy_ivec(dd_zp1[i],dd_zp[i]);
 +        }
 +        break;
 +    default:
 +        gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
 +        nzone = 0;
 +        nzonep = 0;
 +    }
 +
 +    zones = &dd->comm->zones;
 +
 +    /* The zone shift along each used DD dimension */
 +    for(i=0; i<nzone; i++)
 +    {
 +        m = 0;
 +        clear_ivec(zones->shift[i]);
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
 +        }
 +    }
 +    
 +    zones->n = nzone;
 +    for(i=0; i<nzone; i++)
 +    {
 +        /* NOTE(review): s is computed here but never read afterwards —
 +         * this loop looks like dead code; confirm before removing.
 +         */
 +        for(d=0; d<DIM; d++)
 +        {
 +            s[d] = dd->ci[d] - zones->shift[i][d];
 +            if (s[d] < 0)
 +            {
 +                s[d] += dd->nc[d];
 +            }
 +            else if (s[d] >= dd->nc[d])
 +            {
 +                s[d] -= dd->nc[d];
 +            }
 +        }
 +    }
 +    zones->nizone = nzonep;
 +    for(i=0; i<zones->nizone; i++)
 +    {
 +        if (dd_zp[i][0] != i)
 +        {
 +            gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
 +        }
 +        izone = &zones->izone[i];
 +        /* j0/j1 delimit the j-zones this i-zone interacts with */
 +        izone->j0 = dd_zp[i][1];
 +        izone->j1 = dd_zp[i][2];
 +        for(dim=0; dim<DIM; dim++)
 +        {
 +            if (dd->nc[dim] == 1)
 +            {
 +                /* All shifts should be allowed */
 +                izone->shift0[dim] = -1;
 +                izone->shift1[dim] = 1;
 +            }
 +            else
 +            {
 +                /*
 +                  izone->shift0[d] = 0;
 +                  izone->shift1[d] = 0;
 +                  for(j=izone->j0; j<izone->j1; j++) {
 +                  if (dd->shift[j][d] > dd->shift[i][d])
 +                  izone->shift0[d] = -1;
 +                  if (dd->shift[j][d] < dd->shift[i][d])
 +                  izone->shift1[d] = 1;
 +                  }
 +                */
 +                
 +                int shift_diff;
 +                
 +                /* Assume the shift are not more than 1 cell */
 +                izone->shift0[dim] = 1;
 +                izone->shift1[dim] = -1;
 +                for(j=izone->j0; j<izone->j1; j++)
 +                {
 +                    shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
 +                    if (shift_diff < izone->shift0[dim])
 +                    {
 +                        izone->shift0[dim] = shift_diff;
 +                    }
 +                    if (shift_diff > izone->shift1[dim])
 +                    {
 +                        izone->shift1[dim] = shift_diff;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        snew(dd->comm->root,dd->ndim);
 +    }
 +    
 +    if (dd->comm->bRecordLoad)
 +    {
 +        make_load_communicators(dd);
 +    }
 +}
 +
 +/* Set up the particle-particle communicator: optionally replace the
 + * group communicator with a periodic Cartesian one, determine our DD
 + * rank and coordinates, and establish the DD master rank and the
 + * ddindex -> nodeid lookup tables needed for the various setups.
 + */
 +static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int  i,rank,*buf;
 +    ivec periods;
 +#ifdef GMX_MPI
 +    MPI_Comm comm_cart;
 +#endif
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP)
 +    {
 +        /* Set up cartesian communication for the particle-particle part */
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
 +                    dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
 +        }
 +        
 +        /* All DD dimensions are periodic */
 +        for(i=0; i<DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
 +                        &comm_cart);
 +        /* We overwrite the old communicator with the new cartesian one */
 +        cr->mpi_comm_mygroup = comm_cart;
 +    }
 +    
 +    dd->mpi_comm_all = cr->mpi_comm_mygroup;
 +    MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
 +    
 +    if (comm->bCartesianPP_PME)
 +    {
 +        /* Since we want to use the original cartesian setup for sim,
 +         * and not the one after split, we need to make an index.
 +         */
 +        snew(comm->ddindex2ddnodeid,dd->nnodes);
 +        comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
 +        gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
 +        /* Get the rank of the DD master,
 +         * above we made sure that the master node is a PP node.
 +         */
 +        if (MASTER(cr))
 +        {
 +            rank = dd->rank;
 +        }
 +        else
 +        {
 +            rank = 0;
 +        }
 +        /* Only the master contributes a non-zero value to the sum */
 +        MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (cr->npmenodes == 0)
 +        {
 +            /* The PP communicator is also
 +             * the communicator for this simulation
 +             */
 +            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
 +        }
 +        cr->nodeid = dd->rank;
 +        
 +        MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
 +        
 +        /* We need to make an index to go from the coordinates
 +         * to the nodeid of this simulation.
 +         */
 +        snew(comm->ddindex2simnodeid,dd->nnodes);
 +        snew(buf,dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
 +        }
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        sfree(buf);
 +        
 +        /* Determine the master coordinates and rank.
 +         * The DD master should be the same node as the master of this sim.
 +         */
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            if (comm->ddindex2simnodeid[i] == 0)
 +            {
 +                ddindex2xyz(dd->nc,i,dd->master_ci);
 +                MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"The master rank is %d\n",dd->masterrank);
 +        }
 +    }
 +    else
 +    {
 +        /* No Cartesian communicators */
 +        /* We use the rank in dd->comm->all as DD index */
 +        ddindex2xyz(dd->nc,dd->rank,dd->ci);
 +        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
 +        dd->masterrank = 0;
 +        clear_ivec(dd->master_ci);
 +    }
 +#endif
 +  
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    }
 +}
 +
 +static void receive_ddindex2simnodeid(t_commrec *cr)
 +{
 +    gmx_domdec_t *dd;
 +    
 +    gmx_domdec_comm_t *comm;
 +    int  *buf;
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +#ifdef GMX_MPI
 +    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
 +    {
 +        snew(comm->ddindex2simnodeid,dd->nnodes);
 +        snew(buf,dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
 +        }
 +#ifdef GMX_MPI
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +#endif
 +        sfree(buf);
 +    }
 +#endif
 +}
 +
 +static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
 +                                                     int ncg,int natoms)
 +{
 +    gmx_domdec_master_t *ma;
 +    int i;
 +
 +    snew(ma,1);
 +    
 +    snew(ma->ncg,dd->nnodes);
 +    snew(ma->index,dd->nnodes+1);
 +    snew(ma->cg,ncg);
 +    snew(ma->nat,dd->nnodes);
 +    snew(ma->ibuf,dd->nnodes*2);
 +    snew(ma->cell_x,DIM);
 +    for(i=0; i<DIM; i++)
 +    {
 +        snew(ma->cell_x[i],dd->nc[i]+1);
 +    }
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        ma->vbuf = NULL;
 +    }
 +    else
 +    {
 +        snew(ma->vbuf,natoms);
 +    }
 +
 +    return ma;
 +}
 +
/* Split the simulation communicator into PP and PME groups.
 * When a Cartesian PP grid is requested and the PME nodes can be stacked
 * as complete slabs in y or z, a joint Cartesian PP+PME communicator is
 * created (possibly reordering ranks when 'reorder' is set); otherwise
 * the split follows dd_node_order. On return cr->duty is DUTY_PP or
 * DUTY_PME and cr->mpi_comm_mygroup is this node's group communicator.
 */
static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
                               int reorder)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    int  i,rank;
    gmx_bool bDiv[DIM];
    ivec periods;
#ifdef GMX_MPI
    MPI_Comm comm_cart;
#endif
    
    dd = cr->dd;
    comm = dd->comm;
    
    if (comm->bCartesianPP)
    {
        /* A joint Cartesian grid requires the PME nodes to form complete
         * slabs in y or z: npmenodes*nc[i] must divide the PP node count.
         */
        for(i=1; i<DIM; i++)
        {
            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
        }
        if (bDiv[YY] || bDiv[ZZ])
        {
            comm->bCartesianPP_PME = TRUE;
            /* If we have 2D PME decomposition, which is always in x+y,
             * we stack the PME only nodes in z.
             * Otherwise we choose the direction that provides the thinnest slab
             * of PME only nodes as this will have the least effect
             * on the PP communication.
             * But for the PME communication the opposite might be better.
             */
            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
                             !bDiv[YY] ||
                             dd->nc[YY] > dd->nc[ZZ]))
            {
                comm->cartpmedim = ZZ;
            }
            else
            {
                comm->cartpmedim = YY;
            }
            /* Extend the chosen dimension with the PME-only slabs */
            comm->ntot[comm->cartpmedim]
                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
        }
        else if (fplog)
        {
            fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
            fprintf(fplog,
                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
        }
    }
    
#ifdef GMX_MPI
    if (comm->bCartesianPP_PME)
    {
        if (fplog)
        {
            fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
        }
        
        for(i=0; i<DIM; i++)
        {
            periods[i] = TRUE;
        }
        MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
                        &comm_cart);
        
        MPI_Comm_rank(comm_cart,&rank);
        if (MASTERNODE(cr) && rank != 0)
        {
            gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
        }
        
        /* With this assignment we lose the link to the original communicator
         * which will usually be MPI_COMM_WORLD, unless we have multisim.
         */
        cr->mpi_comm_mysim = comm_cart;
        cr->sim_nodeid = rank;
        
        MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
        
        if (fplog)
        {
            fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
                    cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
        }
        
        /* Nodes inside the PP part of the grid do PP work; the slabs
         * appended in cartpmedim (or all nodes when npmenodes==0) do PME.
         */
        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
        {
            cr->duty = DUTY_PP;
        }
        if (cr->npmenodes == 0 ||
            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
        {
            cr->duty = DUTY_PME;
        }
        
        /* Split the sim communicator into PP and PME only nodes */
        MPI_Comm_split(cr->mpi_comm_mysim,
                       cr->duty,
                       dd_index(comm->ntot,dd->ci),
                       &cr->mpi_comm_mygroup);
    }
    else
    {
        switch (dd_node_order)
        {
        case ddnoPP_PME:
            if (fplog)
            {
                fprintf(fplog,"Order of the nodes: PP first, PME last\n");
            }
            break;
        case ddnoINTERLEAVE:
            /* Interleave the PP-only and PME-only nodes,
             * as on clusters with dual-core machines this will double
             * the communication bandwidth of the PME processes
             * and thus speed up the PP <-> PME and inter PME communication.
             */
            if (fplog)
            {
                fprintf(fplog,"Interleaving PP and PME nodes\n");
            }
            comm->pmenodes = dd_pmenodes(cr);
            break;
        case ddnoCARTESIAN:
            break;
        default:
            gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
        }
    
        /* dd_simnode2pmenode returns -1 for PME-only nodes */
        if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
        {
            cr->duty = DUTY_PME;
        }
        else
        {
            cr->duty = DUTY_PP;
        }
        
        /* Split the sim communicator into PP and PME only nodes */
        MPI_Comm_split(cr->mpi_comm_mysim,
                       cr->duty,
                       cr->nodeid,
                       &cr->mpi_comm_mygroup);
        MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
    }
#endif

    if (fplog)
    {
        fprintf(fplog,"This is a %s only node\n\n",
                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
    }
}
 +
/* Create the MPI communicators for domain decomposition.
 * With separate PME nodes the simulation communicator is first split
 * into PP and PME groups; then the PP communicator is set up and on
 * PP nodes the link to the corresponding PME node is established.
 * On the DD master the global distribution bookkeeping is allocated.
 */
void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    int CartReorder;
    
    dd = cr->dd;
    comm = dd->comm;
    
    copy_ivec(dd->nc,comm->ntot);
    
    comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
    comm->bCartesianPP_PME = FALSE;
    
    /* Reorder the nodes by default. This might change the MPI ranks.
     * Real reordering is only supported on very few architectures,
     * Blue Gene is one of them.
     */
    CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
    
    if (cr->npmenodes > 0)
    {
        /* Split the communicator into a PP and PME part */
        split_communicator(fplog,cr,dd_node_order,CartReorder);
        if (comm->bCartesianPP_PME)
        {
            /* We (possibly) reordered the nodes in split_communicator,
             * so it is no longer required in make_pp_communicator.
             */
            CartReorder = FALSE;
        }
    }
    else
    {
        /* All nodes do PP and PME */
#ifdef GMX_MPI    
        /* We do not require separate communicators */
        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
#endif
    }
    
    if (cr->duty & DUTY_PP)
    {
        /* Copy or make a new PP communicator */
        make_pp_communicator(fplog,cr,CartReorder);
    }
    else
    {
        /* PME-only nodes only need the ddindex-to-nodeid mapping */
        receive_ddindex2simnodeid(cr);
    }
    
    if (!(cr->duty & DUTY_PME))
    {
        /* Set up the communication to our PME node */
        dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
        dd->pme_receive_vir_ener = receive_vir_ener(cr);
        if (debug)
        {
            fprintf(debug,"My pme_nodeid %d receive ener %d\n",
                    dd->pme_nodeid,dd->pme_receive_vir_ener);
        }
    }
    else
    {
        dd->pme_nodeid = -1;
    }

    if (DDMASTER(dd))
    {
        dd->ma = init_gmx_domdec_master_t(dd,
                                          comm->cgs_gl.nr,
                                          comm->cgs_gl.index[comm->cgs_gl.nr]);
    }
}
 +
 +static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
 +{
 +    real *slb_frac,tot;
 +    int  i,n;
 +    double dbl;
 +    
 +    slb_frac = NULL;
 +    if (nc > 1 && size_string != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Using static load balancing for the %s direction\n",
 +                    dir);
 +        }
 +        snew(slb_frac,nc);
 +        tot = 0;
 +        for (i=0; i<nc; i++)
 +        {
 +            dbl = 0;
 +            sscanf(size_string,"%lf%n",&dbl,&n);
 +            if (dbl == 0)
 +            {
 +                gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
 +            }
 +            slb_frac[i] = dbl;
 +            size_string += n;
 +            tot += slb_frac[i];
 +        }
 +        /* Normalize */
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Relative cell sizes:");
 +        }
 +        for (i=0; i<nc; i++)
 +        {
 +            slb_frac[i] /= tot;
 +            if (fplog)
 +            {
 +                fprintf(fplog," %5.3f",slb_frac[i]);
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog,"\n");
 +        }
 +    }
 +    
 +    return slb_frac;
 +}
 +
 +static int multi_body_bondeds_count(gmx_mtop_t *mtop)
 +{
 +    int n,nmol,ftype;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist *il;
 +    
 +    n = 0;
 +    iloop = gmx_mtop_ilistloop_init(mtop);
 +    while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
 +    {
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if ((interaction_function[ftype].flags & IF_BOND) &&
 +                NRAL(ftype) >  2)
 +            {
 +                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
 +            }
 +        }
 +  }
 +
 +  return n;
 +}
 +
/* Return the integer value of environment variable env_var, or def when
 * it is not set. A set but unparsable value yields 1. When the variable
 * is set and fplog is non-NULL, the value used is reported there.
 */
static int dd_nst_env(FILE *fplog,const char *env_var,int def)
{
    const char *val;
    int        nst;

    nst = def;
    val = getenv(env_var);
    if (val != NULL)
    {
        if (sscanf(val,"%d",&nst) <= 0)
        {
            /* Unparsable: fall back to 1, not to def */
            nst = 1;
        }
        if (fplog)
        {
            fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
                    env_var,val,nst);
        }
    }

    return nst;
}
 +
 +static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
 +{
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,"\n%s\n",warn_string);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\n%s\n",warn_string);
 +    }
 +}
 +
/* Check input-record settings that domain decomposition cannot handle.
 * Fatal error for screw PBC with decomposition outside x, simple
 * neighbor searching, or nstlist=0; a warning is issued for angular
 * COM removal with PBC.
 */
static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
                                  t_inputrec *ir,FILE *fplog)
{
    /* Screw PBC only supports decomposition along x */
    if (ir->ePBC == epbcSCREW &&
        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
    {
        gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
    }

    if (ir->ns_type == ensSIMPLE)
    {
        gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
    }

    if (ir->nstlist == 0)
    {
        gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
    }

    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
    {
        dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
    }
}
 +
 +static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
 +{
 +    int  di,d;
 +    real r;
 +
 +    r = ddbox->box_size[XX];
 +    for(di=0; di<dd->ndim; di++)
 +    {
 +        d = dd->dim[di];
 +        /* Check using the initial average cell size */
 +        r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +    }
 +
 +    return r;
 +}
 +
 +static int check_dlb_support(FILE *fplog,t_commrec *cr,
 +                             const char *dlb_opt,gmx_bool bRecordLoad,
 +                             unsigned long Flags,t_inputrec *ir)
 +{
 +    gmx_domdec_t *dd;
 +    int  eDLB=-1;
 +    char buf[STRLEN];
 +
 +    switch (dlb_opt[0])
 +    {
 +    case 'a': eDLB = edlbAUTO; break;
 +    case 'n': eDLB = edlbNO;   break;
 +    case 'y': eDLB = edlbYES;  break;
 +    default: gmx_incons("Unknown dlb_opt");
 +    }
 +
 +    if (Flags & MD_RERUN)
 +    {
 +        return edlbNO;
 +    }
 +
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        if (eDLB == edlbYES)
 +        {
 +            sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
 +            dd_warning(cr,fplog,buf);
 +        }
 +            
 +        return edlbNO;
 +    }
 +
 +    if (!bRecordLoad)
 +    {
 +        dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
 +
 +        return edlbNO;
 +    }
 +
 +    if (Flags & MD_REPRODUCIBLE)
 +    {
 +        switch (eDLB)
 +        {
 +                      case edlbNO: 
 +                              break;
 +                      case edlbAUTO:
 +                              dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
 +                              eDLB = edlbNO;
 +                              break;
 +                      case edlbYES:
 +                              dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
 +                              break;
 +                      default:
 +                              gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
 +                              break;
 +        }
 +    }
 +
 +    return eDLB;
 +}
 +
 +static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
 +{
 +    int dim;
 +
 +    dd->ndim = 0;
 +    if (getenv("GMX_DD_ORDER_ZYX") != NULL)
 +    {
 +        /* Decomposition order z,y,x */
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Using domain decomposition order z, y, x\n");
 +        }
 +        for(dim=DIM-1; dim>=0; dim--)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* Decomposition order x,y,z */
 +        for(dim=0; dim<DIM; dim++)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +}
 +
 +static gmx_domdec_comm_t *init_dd_comm()
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  i;
 +
 +    snew(comm,1);
 +    snew(comm->cggl_flag,DIM*2);
 +    snew(comm->cgcm_state,DIM*2);
 +    for(i=0; i<DIM*2; i++)
 +    {
 +        comm->cggl_flag_nalloc[i]  = 0;
 +        comm->cgcm_state_nalloc[i] = 0;
 +    }
 +    
 +    comm->nalloc_int = 0;
 +    comm->buf_int    = NULL;
 +
 +    vec_rvec_init(&comm->vbuf);
 +
 +    comm->n_load_have    = 0;
 +    comm->n_load_collect = 0;
 +
 +    for(i=0; i<ddnatNR-ddnatZONE; i++)
 +    {
 +        comm->sum_nat[i] = 0;
 +    }
 +    comm->ndecomp = 0;
 +    comm->nload   = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf  = 0;
 +    comm->load_pme  = 0;
 +
 +    return comm;
 +}
 +
 +gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
 +                                        unsigned long Flags,
 +                                        ivec nc,
 +                                        real comm_distance_min,real rconstr,
 +                                        const char *dlb_opt,real dlb_scale,
 +                                        const char *sizex,const char *sizey,const char *sizez,
 +                                        gmx_mtop_t *mtop,t_inputrec *ir,
 +                                        matrix box,rvec *x,
 +                                        gmx_ddbox_t *ddbox,
 +                                        int *npme_x,int *npme_y)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int  recload;
 +    int  d,i,j;
 +    real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
 +    gmx_bool bC;
 +    char buf[STRLEN];
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
 +    }
 +    
 +    snew(dd,1);
 +
 +    dd->comm = init_dd_comm();
 +    comm = dd->comm;
 +    snew(comm->cggl_flag,DIM*2);
 +    snew(comm->cgcm_state,DIM*2);
 +
 +    dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
 +    dd->bScrewPBC = (ir->ePBC == epbcSCREW);
 +    
 +    dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
 +    comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
 +    comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
 +    recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
 +    comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
 +    comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
 +    comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
 +    comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
 +
 +    dd->pme_recv_f_alloc = 0;
 +    dd->pme_recv_f_buf = NULL;
 +
 +    if (dd->bSendRecv2 && fplog)
 +    {
 +        fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
 +    }
 +    if (comm->eFlop)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will load balance based on FLOP count\n");
 +        }
 +        if (comm->eFlop > 1)
 +        {
 +            srand(1+cr->nodeid);
 +        }
 +        comm->bRecordLoad = TRUE;
 +    }
 +    else
 +    {
 +        comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
 +                             
 +    }
 +    
 +    comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
 +    
 +    comm->bDynLoadBal = (comm->eDLB == edlbYES);
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
 +    }
 +    dd->bGridJump = comm->bDynLoadBal;
 +    
 +    if (comm->nstSortCG)
 +    {
 +        if (fplog)
 +        {
 +            if (comm->nstSortCG == 1)
 +            {
 +                fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
 +            }
 +            else
 +            {
 +                fprintf(fplog,"Will sort the charge groups every %d steps\n",
 +                        comm->nstSortCG);
 +            }
 +        }
 +        snew(comm->sort,1);
 +    }
 +    else
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will not sort the charge groups\n");
 +        }
 +    }
 +
 +    comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
 +    
 +    comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
 +    if (comm->bInterCGBondeds)
 +    {
 +        comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
 +    }
 +    else
 +    {
 +        comm->bInterCGMultiBody = FALSE;
 +    }
 +    
 +    dd->bInterCGcons    = inter_charge_group_constraints(mtop);
 +    dd->bInterCGsettles = inter_charge_group_settles(mtop);
 +
 +    if (ir->rlistlong == 0)
 +    {
 +        /* Set the cut-off to some very large value,
 +         * so we don't need if statements everywhere in the code.
 +         * We use sqrt, since the cut-off is squared in some places.
 +         */
 +        comm->cutoff   = GMX_CUTOFF_INF;
 +    }
 +    else
 +    {
 +        comm->cutoff   = ir->rlistlong;
 +    }
 +    comm->cutoff_mbody = 0;
 +    
 +    comm->cellsize_limit = 0;
 +    comm->bBondComm = FALSE;
 +
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm_distance_min > 0)
 +        {
 +            comm->cutoff_mbody = comm_distance_min;
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
 +            }
 +            else
 +            {
 +                comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
 +            }
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else if (ir->bPeriodicMols)
 +        {
 +            /* Can not easily determine the required cut-off */
 +            dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
 +            comm->cutoff_mbody = comm->cutoff/2;
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            if (MASTER(cr))
 +            {
 +                dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
 +                                      Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
 +            }
 +            gmx_bcast(sizeof(r_2b),&r_2b,cr);
 +            gmx_bcast(sizeof(r_mb),&r_mb,cr);
 +
 +            /* We use an initial margin of 10% for the minimum cell size,
 +             * except when we are just below the non-bonded cut-off.
 +             */
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                if (max(r_2b,r_mb) > comm->cutoff)
 +                {
 +                    r_bonded       = max(r_2b,r_mb);
 +                    r_bonded_limit = 1.1*r_bonded;
 +                    comm->bBondComm = TRUE;
 +                }
 +                else
 +                {
 +                    r_bonded       = r_mb;
 +                    r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
 +                }
 +                /* We determine cutoff_mbody later */
 +            }
 +            else
 +            {
 +                /* No special bonded communication,
 +                 * simply increase the DD cut-off.
 +                 */
 +                r_bonded_limit     = 1.1*max(r_2b,r_mb);
 +                comm->cutoff_mbody = r_bonded_limit;
 +                comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
 +            }
 +        }
 +        comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Minimum cell size due to bonded interactions: %.3f nm\n",
 +                    comm->cellsize_limit);
 +        }
 +    }
 +
 +    if (dd->bInterCGcons && rconstr <= 0)
 +    {
 +        /* There is a cell size limit due to the constraints (P-LINCS) */
 +        rconstr = constr_r_max(fplog,mtop,ir);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Estimated maximum distance required for P-LINCS: %.3f nm\n",
 +                    rconstr);
 +            if (rconstr > comm->cellsize_limit)
 +            {
 +                fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
 +            }
 +        }
 +    }
 +    else if (rconstr > 0 && fplog)
 +    {
 +        /* Here we do not check for dd->bInterCGcons,
 +         * because one can also set a cell size limit for virtual sites only
 +         * and at this point we don't know yet if there are intercg v-sites.
 +         */
 +        fprintf(fplog,
 +                "User supplied maximum distance required for P-LINCS: %.3f nm\n",
 +                rconstr);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
 +
 +    comm->cgs_gl = gmx_mtop_global_cgs(mtop);
 +
 +    if (nc[XX] > 0)
 +    {
 +        copy_ivec(nc,dd->nc);
 +        set_dd_dim(fplog,dd);
 +        set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
 +
 +        if (cr->npmenodes == -1)
 +        {
 +            cr->npmenodes = 0;
 +        }
 +        acs = average_cellsize_min(dd,ddbox);
 +        if (acs < comm->cellsize_limit)
 +        {
 +            if (fplog)
 +            {
 +                fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
 +            }
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
 +                                 acs,comm->cellsize_limit);
 +        }
 +    }
 +    else
 +    {
 +        set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
 +
 +        /* We need to choose the optimal DD grid and possibly PME nodes */
 +        limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
 +                               comm->eDLB!=edlbNO,dlb_scale,
 +                               comm->cellsize_limit,comm->cutoff,
 +                               comm->bInterCGBondeds,comm->bInterCGMultiBody);
 +        
 +        if (dd->nc[XX] == 0)
 +        {
 +            bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
 +            sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
 +                    !bC ? "-rdd" : "-rcon",
 +                    comm->eDLB!=edlbNO ? " or -dds" : "",
 +                    bC ? " or your LINCS settings" : "");
 +
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
 +                                 "%s\n"
 +                                 "Look in the log file for details on the domain decomposition",
 +                                 cr->nnodes-cr->npmenodes,limit,buf);
 +        }
 +        set_dd_dim(fplog,dd);
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
 +                dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
 +    }
 +    
 +    dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
 +    if (cr->nnodes - dd->nnodes != cr->npmenodes)
 +    {
 +        gmx_fatal_collective(FARGS,cr,NULL,
 +                             "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
 +                             dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
 +    }
 +    if (cr->npmenodes > dd->nnodes)
 +    {
 +        gmx_fatal_collective(FARGS,cr,NULL,
 +                             "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
 +    }
 +    if (cr->npmenodes > 0)
 +    {
 +        comm->npmenodes = cr->npmenodes;
 +    }
 +    else
 +    {
 +        comm->npmenodes = dd->nnodes;
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        /* The following choices should match those
 +         * in comm_cost_est in domdec_setup.c.
 +         * Note that here the checks have to take into account
 +         * that the decomposition might occur in a different order than xyz
 +         * (for instance through the env.var. GMX_DD_ORDER_ZYX),
 +         * in which case they will not match those in comm_cost_est,
 +         * but since that is mainly for testing purposes that's fine.
 +         */
 +        if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
 +            comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
 +            getenv("GMX_PMEONEDD") == NULL)
 +        {
 +            comm->npmedecompdim = 2;
 +            comm->npmenodes_x   = dd->nc[XX];
 +            comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
 +        }
 +        else
 +        {
 +            /* In case nc is 1 in both x and y we could still choose to
 +             * decompose pme in y instead of x, but we use x for simplicity.
 +             */
 +            comm->npmedecompdim = 1;
 +            if (dd->dim[0] == YY)
 +            {
 +                comm->npmenodes_x = 1;
 +                comm->npmenodes_y = comm->npmenodes;
 +            }
 +            else
 +            {
 +                comm->npmenodes_x = comm->npmenodes;
 +                comm->npmenodes_y = 1;
 +            }
 +        }    
 +        if (fplog)
 +        {
 +            fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
 +                    comm->npmenodes_x,comm->npmenodes_y,1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmedecompdim = 0;
 +        comm->npmenodes_x   = 0;
 +        comm->npmenodes_y   = 0;
 +    }
 +    
 +    /* Technically we don't need both of these,
 +     * but it simplifies code not having to recalculate it.
 +     */
 +    *npme_x = comm->npmenodes_x;
 +    *npme_y = comm->npmenodes_y;
 +        
 +    snew(comm->slb_frac,DIM);
 +    if (comm->eDLB == edlbNO)
 +    {
 +        comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
 +        comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
 +        comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
 +    }
 +
 +    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
 +    {
 +        if (comm->bBondComm || comm->eDLB != edlbNO)
 +        {
 +            /* Set the bonded communication distance to halfway
 +             * the minimum and the maximum,
 +             * since the extra communication cost is nearly zero.
 +             */
 +            acs = average_cellsize_min(dd,ddbox);
 +            comm->cutoff_mbody = 0.5*(r_bonded + acs);
 +            if (comm->eDLB != edlbNO)
 +            {
 +                /* Check if this does not limit the scaling */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
 +            }
 +            if (!comm->bBondComm)
 +            {
 +                /* Without bBondComm do not go beyond the n.b. cut-off */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
 +                if (comm->cellsize_limit >= comm->cutoff)
 +                {
 +                    /* We don't loose a lot of efficieny
 +                     * when increasing it to the n.b. cut-off.
 +                     * It can even be slightly faster, because we need
 +                     * less checks for the communication setup.
 +                     */
 +                    comm->cutoff_mbody = comm->cutoff;
 +                }
 +            }
 +            /* Check if we did not end up below our original limit */
 +            comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
 +
 +            if (comm->cutoff_mbody > comm->cellsize_limit)
 +            {
 +                comm->cellsize_limit = comm->cutoff_mbody;
 +            }
 +        }
 +        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
 +                "cellsize limit %f\n",
 +                comm->bBondComm,comm->cellsize_limit);
 +    }
 +    
 +    if (MASTER(cr))
 +    {
 +        check_dd_restrictions(cr,dd,ir,fplog);
 +    }
 +
 +    comm->partition_step = INT_MIN;
 +    dd->ddp_count = 0;
 +
 +    clear_dd_cycle_counts(dd);
 +
 +    return dd;
 +}
 +
 +static void set_dlb_limits(gmx_domdec_t *dd)
 +
 +{
 +    int d;
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
 +        dd->comm->cellsize_min[dd->dim[d]] =
 +            dd->comm->cellsize_min_dlb[dd->dim[d]];
 +    }
 +}
 +
 +
/* Turn on dynamic load balancing at the given step, triggered by a
 * measured force load imbalance. When the current minimum cell size is
 * already within 5% of the cell size limit there is no room to balance,
 * so DLB is permanently disabled instead (eDLB goes from auto to no).
 */
static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    real cellsize_min;
    int  d,nc,i;
    char buf[STRLEN];
    
    dd = cr->dd;
    comm = dd->comm;
    
    if (fplog)
    {
        fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
    }

    /* Find the smallest current cell size over all decomposed dimensions */
    cellsize_min = comm->cellsize_min[dd->dim[0]];
    for(d=1; d<dd->ndim; d++)
    {
        cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
    }

    if (cellsize_min < comm->cellsize_limit*1.05)
    {
        dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");

        /* Change DLB from "auto" to "no". */
        comm->eDLB = edlbNO;

        return;
    }

    dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
    comm->bDynLoadBal = TRUE;
    dd->bGridJump = TRUE;
    
    set_dlb_limits(dd);

    /* We can set the required cell size info here,
     * so we do not need to communicate this.
     * The grid is completely uniform.
     */
    for(d=0; d<dd->ndim; d++)
    {
        if (comm->root[d])
        {
            comm->load[d].sum_m = comm->load[d].sum;

            /* Initialize the relative cell boundaries to a uniform grid */
            nc = dd->nc[dd->dim[d]];
            for(i=0; i<nc; i++)
            {
                comm->root[d]->cell_f[i]    = i/(real)nc;
                if (d > 0)
                {
                    comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
                    comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
                }
            }
            comm->root[d]->cell_f[nc] = 1.0;
        }
    }
}
 +
 +static char *init_bLocalCG(gmx_mtop_t *mtop)
 +{
 +    int  ncg,cg;
 +    char *bLocalCG;
 +    
 +    ncg = ncg_mtop(mtop);
 +    snew(bLocalCG,ncg);
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        bLocalCG[cg] = FALSE;
 +    }
 +
 +    return bLocalCG;
 +}
 +
 +void dd_init_bondeds(FILE *fplog,
 +                     gmx_domdec_t *dd,gmx_mtop_t *mtop,
 +                     gmx_vsite_t *vsite,gmx_constr_t constr,
 +                     t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool bBondComm;
 +    int  d;
 +
 +    dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
 +
 +    comm = dd->comm;
 +
 +    if (comm->bBondComm)
 +    {
 +        /* Communicate atoms beyond the cut-off for bonded interactions */
 +        comm = dd->comm;
 +
 +        comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
 +
 +        comm->bLocalCG = init_bLocalCG(mtop);
 +    }
 +    else
 +    {
 +        /* Only communicate atoms based on cut-off */
 +        comm->cglink   = NULL;
 +        comm->bLocalCG = NULL;
 +    }
 +}
 +
 +/* Print a summary of the domain decomposition settings to the log:
 + * the number of communication pulses, the cell size limits and the
 + * maximum allowed distances for the various interaction classes.
 + * With bDynLoadBal the DLB limits are reported, otherwise the initial
 + * static cell sizes.  Does nothing when fplog is NULL.
 + */
 +static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
 +                              t_inputrec *ir,
 +                              gmx_bool bDynLoadBal,real dlb_scale,
 +                              gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d;
 +    ivec np;
 +    real limit,shrink;
 +    char buf[64];
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    comm = dd->comm;
 +
 +    if (bDynLoadBal)
 +    {
 +        fprintf(fplog,"The maximum number of communication pulses is:");
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
 +        }
 +        fprintf(fplog,"\n");
 +        fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
 +        fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
 +        fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
 +        for(d=0; d<DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                /* NOTE(review): a non-pbc dimension with exactly 2 cells is
 +                 * printed as shrink 0 -- presumably meaning "unlimited";
 +                 * confirm against the DLB cell-size code.
 +                 */
 +                if (d >= ddbox->npbcdim && dd->nc[d] == 2)
 +                {
 +                    shrink = 0;
 +                }
 +                else
 +                {
 +                    shrink =
 +                        comm->cellsize_min_dlb[d]/
 +                        (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +                }
 +                fprintf(fplog," %c %.2f",dim2char(d),shrink);
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    else
 +    {
 +        /* Static load balancing: report the initial pulse counts and sizes */
 +        set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
 +        fprintf(fplog,"The initial number of communication pulses is:");
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
 +        }
 +        fprintf(fplog,"\n");
 +        fprintf(fplog,"The initial domain decomposition cell size is:");
 +        for(d=0; d<DIM; d++) {
 +            if (dd->nc[d] > 1)
 +            {
 +                fprintf(fplog," %c %.2f nm",
 +                        dim2char(d),dd->comm->cellsize_min[d]);
 +            }
 +        }
 +        fprintf(fplog,"\n\n");
 +    }
 +    
 +    if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
 +    {
 +        fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
 +        fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                "non-bonded interactions","",comm->cutoff);
 +
 +        if (bDynLoadBal)
 +        {
 +            limit = dd->comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            if (dynamic_dd_box(ddbox,ir))
 +            {
 +                fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
 +            }
 +            /* Use the smallest initial cell size as the distance limit */
 +            limit = dd->comm->cellsize_min[XX];
 +            for(d=1; d<DIM; d++)
 +            {
 +                limit = min(limit,dd->comm->cellsize_min[d]);
 +            }
 +        }
 +
 +        if (comm->bInterCGBondeds)
 +        {
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    "two-body bonded interactions","(-rdd)",
 +                    max(comm->cutoff,comm->cutoff_mbody));
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    "multi-body bonded interactions","(-rdd)",
 +                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
 +        }
 +        if (dd->vsite_comm)
 +        {
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    "virtual site constructions","(-rcon)",limit);
 +        }
 +        if (dd->constraint_comm)
 +        {
 +            sprintf(buf,"atoms separated by up to %d constraints",
 +                    1+ir->nProjOrder);
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    buf,"(-rcon)",limit);
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    
 +    fflush(fplog);
 +}
 +
 +/* Determine the cell size limits and the maximum number of communication
 + * pulses per dimension required for dynamic load balancing, and set the
 + * per-dimension minimum DLB cell sizes.  Called when DLB is on or may
 + * get turned on ("auto").
 + */
 +static void set_cell_limits_dlb(gmx_domdec_t *dd,
 +                                real dlb_scale,
 +                                const t_inputrec *ir,
 +                                const gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d,dim,npulse,npulse_d_max,npulse_d;
 +    gmx_bool bNoCutOff;
 +
 +    comm = dd->comm;
 +
 +    /* rvdw==0 or rcoulomb==0 means there is effectively no cut-off */
 +    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
 +
 +    /* Determine the maximum number of comm. pulses in one dimension */
 +        
 +    comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
 +        
 +    /* Determine the maximum required number of grid pulses */
 +    if (comm->cellsize_limit >= comm->cutoff)
 +    {
 +        /* Only a single pulse is required */
 +        npulse = 1;
 +    }
 +    else if (!bNoCutOff && comm->cellsize_limit > 0)
 +    {
 +        /* We round down slightly here to avoid overhead due to the latency
 +         * of extra communication calls when the cut-off
 +         * would be only slightly longer than the cell size.
 +         * Later cellsize_limit is redetermined,
 +         * so we can not miss interactions due to this rounding.
 +         */
 +        npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
 +    }
 +    else
 +    {
 +        /* There is no cell size limit */
 +        npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
 +    }
 +
 +    if (!bNoCutOff && npulse > 1)
 +    {
 +        /* See if we can do with less pulses, based on dlb_scale */
 +        npulse_d_max = 0;
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            dim = dd->dim[d];
 +            npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
 +                             /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
 +            npulse_d_max = max(npulse_d_max,npulse_d);
 +        }
 +        npulse = min(npulse,npulse_d_max);
 +    }
 +
 +    /* This env var can override npulse */
 +    d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
 +    if (d > 0)
 +    {
 +        npulse = d;
 +    }
 +
 +    comm->maxpulse = 1;
 +    comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        /* A dimension can never need more pulses than it has cells - 1 */
 +        comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
 +        comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
 +        snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
 +        comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
 +        if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
 +        {
 +            comm->bVacDLBNoLimit = FALSE;
 +        }
 +    }
 +
 +    /* cellsize_limit is set for LINCS in init_domain_decomposition */
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        comm->cellsize_limit = max(comm->cellsize_limit,
 +                                   comm->cutoff/comm->maxpulse);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
 +    /* Set the minimum cell size for each DD dimension */
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        if (comm->bVacDLBNoLimit ||
 +            comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] =
 +                comm->cutoff/comm->cd[d].np_dlb;
 +        }
 +    }
 +    if (comm->cutoff_mbody <= 0)
 +    {
 +        comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
 +    }
 +    /* When DLB is already active, apply its limits immediately */
 +    if (comm->bDynLoadBal)
 +    {
 +        set_dlb_limits(dd);
 +    }
 +}
 +
 +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
 +{
 +    /* If each molecule is a single charge group
 +     * or we use domain decomposition for each periodic dimension,
 +     * we do not need to take pbc into account for the bonded interactions.
 +     */
 +    return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
 +            !(dd->nc[XX]>1 &&
 +              dd->nc[YY]>1 &&
 +              (dd->nc[ZZ]>1 || ePBC==epbcXY)));
 +}
 +
 +/* Complete the DD setup that has to wait until after the thread counts
 + * are known: allocate the per-thread work data, initialize the PME
 + * redistribution data (or check that no PME nodes were requested),
 + * apply the DLB cell limits, print the settings and set up the
 + * global-to-local atom lookup sized by the local zone volume fraction.
 + */
 +void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
 +                       t_inputrec *ir,t_forcerec *fr,
 +                       gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  natoms_tot;
 +    real vol_frac;
 +
 +    comm = dd->comm;
 +
 +    /* Initialize the thread data.
 +     * This can not be done in init_domain_decomposition,
 +     * as the numbers of threads is determined later.
 +     */
 +    comm->nth = gmx_omp_nthreads_get(emntDomdec);
 +    if (comm->nth > 1)
 +    {
 +        snew(comm->dth,comm->nth);
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        init_ddpme(dd,&comm->ddpme[0],0);
 +        if (comm->npmedecompdim >= 2)
 +        {
 +            init_ddpme(dd,&comm->ddpme[1],1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmenodes = 0;
 +        if (dd->pme_nodeid >= 0)
 +        {
 +            gmx_fatal_collective(FARGS,NULL,dd,
 +                                 "Can not have separate PME nodes without PME electrostatics");
 +        }
 +    }
 +        
 +    if (debug)
 +    {
 +        fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
 +    }
 +    if (comm->eDLB != edlbNO)
 +    {
 +        set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
 +    }
 +    
 +    print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
 +    if (comm->eDLB == edlbAUTO)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
 +        }
 +        print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
 +    }
 +
 +    /* Estimate the fraction of all atoms residing in the local DD zones */
 +    if (ir->ePBC == epbcNONE)
 +    {
 +        vol_frac = 1 - 1/(double)dd->nnodes;
 +    }
 +    else
 +    {
 +        vol_frac =
 +            (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
 +    }
 +    natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
 +   
 +    dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
 +}
 +
 +/* Try to change the DD communication cut-off to cutoff_req.
 + * Returns TRUE and stores the new cut-off when the current domain
 + * decomposition supports it on all ranks; returns FALSE when more
 + * pulses than available would be needed, when a local DLB cell is
 + * already too small, or when the staggered-grid check fails.
 + * Collective: all DD ranks must call this (gmx_sumi).
 + */
 +gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
 +                          real cutoff_req)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_ddbox_t ddbox;
 +    int d,dim,np;
 +    real inv_cell_size;
 +    int LocallyLimited;
 +
 +    dd = cr->dd;
 +
 +    set_ddbox(dd,FALSE,cr,ir,state->box,
 +              TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
 +
 +    LocallyLimited = 0;
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
 +        if (dynamic_dd_box(&ddbox,ir))
 +        {
 +            inv_cell_size *= DD_PRES_SCALE_MARGIN;
 +        }
 +
 +        /* The number of pulses this cut-off needs in dimension dim */
 +        np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
 +
 +        if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
 +            dd->comm->cd[d].np_dlb > 0)
 +        {
 +            if (np > dd->comm->cd[d].np_dlb)
 +            {
 +                return FALSE;
 +            }
 +
 +            /* If a current local cell size is smaller than the requested
 +             * cut-off, we could still fix it, but this gets very complicated.
 +             * Without fixing here, we might actually need more checks.
 +             */
 +            if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
 +            {
 +                LocallyLimited = 1;
 +            }
 +        }
 +    }
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        if (check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
 +        {
 +            LocallyLimited = 1; 
 +        }
 +
 +        /* If any rank is limited, all ranks have to refuse the new cut-off */
 +        gmx_sumi(1,&LocallyLimited,cr);
 +
 +        if (LocallyLimited > 0)
 +        {
 +            return FALSE;
 +        }
 +    }
 +
 +    dd->comm->cutoff = cutoff_req;
 +
 +    return TRUE;
 +}
 +
 +/* Merge the charge groups received in communication pulse 'pulse' into
 + * the per-zone charge-group arrays: global indices, positions, atom
 + * index ranges (cgindex) and cg info.  Charge groups already stored by
 + * earlier pulses are first shifted to make room, and the send indices
 + * recorded for those pulses are corrected for the shift.
 + */
 +static void merge_cg_buffers(int ncell,
 +                             gmx_domdec_comm_dim_t *cd, int pulse,
 +                             int  *ncg_cell,
 +                             int  *index_gl, int  *recv_i,
 +                             rvec *cg_cm,    rvec *recv_vr,
 +                             int *cgindex,
 +                             cginfo_mb_t *cginfo_mb,int *cginfo)
 +{
 +    gmx_domdec_ind_t *ind,*ind_p;
 +    int p,cell,c,cg,cg0,cg1,cg_gl,nat;
 +    int shift,shift_at;
 +    
 +    ind = &cd->ind[pulse];
 +    
 +    /* First correct the already stored data */
 +    shift = ind->nrecv[ncell];
 +    for(cell=ncell-1; cell>=0; cell--)
 +    {
 +        shift -= ind->nrecv[cell];
 +        if (shift > 0)
 +        {
 +            /* Move the cg's present from previous grid pulses */
 +            cg0 = ncg_cell[ncell+cell];
 +            cg1 = ncg_cell[ncell+cell+1];
 +            cgindex[cg1+shift] = cgindex[cg1];
 +            /* Copy backwards so we never overwrite entries not yet moved */
 +            for(cg=cg1-1; cg>=cg0; cg--)
 +            {
 +                index_gl[cg+shift] = index_gl[cg];
 +                copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
 +                cgindex[cg+shift] = cgindex[cg];
 +                cginfo[cg+shift] = cginfo[cg];
 +            }
 +            /* Correct the already stored send indices for the shift */
 +            for(p=1; p<=pulse; p++)
 +            {
 +                ind_p = &cd->ind[p];
 +                cg0 = 0;
 +                for(c=0; c<cell; c++)
 +                {
 +                    cg0 += ind_p->nsend[c];
 +                }
 +                cg1 = cg0 + ind_p->nsend[cell];
 +                for(cg=cg0; cg<cg1; cg++)
 +                {
 +                    ind_p->index[cg] += shift;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Merge in the communicated buffers */
 +    shift = 0;
 +    shift_at = 0;
 +    cg0 = 0;
 +    for(cell=0; cell<ncell; cell++)
 +    {
 +        cg1 = ncg_cell[ncell+cell+1] + shift;
 +        if (shift_at > 0)
 +        {
 +            /* Correct the old cg indices */
 +            for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
 +            {
 +                cgindex[cg+1] += shift_at;
 +            }
 +        }
 +        for(cg=0; cg<ind->nrecv[cell]; cg++)
 +        {
 +            /* Copy this charge group from the buffer */
 +            index_gl[cg1] = recv_i[cg0];
 +            copy_rvec(recv_vr[cg0],cg_cm[cg1]);
 +            /* Add it to the cgindex */
 +            cg_gl = index_gl[cg1];
 +            cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
 +            nat = GET_CGINFO_NATOMS(cginfo[cg1]);
 +            cgindex[cg1+1] = cgindex[cg1] + nat;
 +            cg0++;
 +            cg1++;
 +            shift_at += nat;
 +        }
 +        shift += ind->nrecv[cell];
 +        ncg_cell[ncell+cell+1] = cg1;
 +    }
 +}
 +
 +static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
 +                               int nzone,int cg0,const int *cgindex)
 +{
 +    int cg,zone,p;
 +    
 +    /* Store the atom block boundaries for easy copying of communication buffers
 +     */
 +    cg = cg0;
 +    for(zone=0; zone<nzone; zone++)
 +    {
 +        for(p=0; p<cd->np; p++) {
 +            cd->ind[p].cell2at0[zone] = cgindex[cg];
 +            cg += cd->ind[p].nrecv[zone];
 +            cd->ind[p].cell2at1[zone] = cgindex[cg];
 +        }
 +    }
 +}
 +
 +static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
 +{
 +    int  i;
 +    gmx_bool bMiss;
 +
 +    bMiss = FALSE;
 +    for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
 +    {
 +        if (!bLocalCG[link->a[i]])
 +        {
 +            bMiss = TRUE;
 +        }
 +    }
 +
 +    return bMiss;
 +}
 +
 +/* Domain corners for communication, a maximum of 4 i-zones see a j domain */
 +typedef struct {
 +    real c[DIM][4]; /* the corners for the non-bonded communication */
 +    real cr0;       /* corner for rounding */
 +    real cr1[4];    /* corners for rounding */
 +    real bc[DIM];   /* corners for bonded (multi-body) communication */
 +    real bcr1;      /* corner for rounding for bonded communication */
 +} dd_corners_t;
 +
 +/* Determine the corners of the domain(s) we are communicating with
 + * and store them in *c.  Without staggered cells the corners are the
 + * local cell boundaries; with staggered cells (bGridJump) the maxima
 + * over the i-zones that can see each j-domain are used.  With bDistMB
 + * the bc/bcr1 corners for multi-body bonded distances are also set.
 + */
 +static void
 +set_dd_corners(const gmx_domdec_t *dd,
 +               int dim0, int dim1, int dim2,
 +               gmx_bool bDistMB,
 +               dd_corners_t *c)
 +{
 +    const gmx_domdec_comm_t *comm;
 +    const gmx_domdec_zones_t *zones;
 +    int i,j;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Keep the compiler happy */
 +    c->cr0  = 0;
 +    c->bcr1 = 0;
 +
 +    /* The first dimension is equal for all cells */
 +    c->c[0][0] = comm->cell_x0[dim0];
 +    if (bDistMB)
 +    {
 +        c->bc[0] = c->c[0][0];
 +    }
 +    if (dd->ndim >= 2)
 +    {
 +        dim1 = dd->dim[1];
 +        /* This cell row is only seen from the first row */
 +        c->c[1][0] = comm->cell_x0[dim1];
 +        /* All rows can see this row */
 +        c->c[1][1] = comm->cell_x0[dim1];
 +        if (dd->bGridJump)
 +        {
 +            /* With staggered cell boundaries use the communicated maxima */
 +            c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
 +            if (bDistMB)
 +            {
 +                /* For the multi-body distance we need the maximum */
 +                c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
 +            }
 +        }
 +        /* Set the upper-right corner for rounding */
 +        c->cr0 = comm->cell_x1[dim0];
 +        
 +        if (dd->ndim >= 3)
 +        {
 +            dim2 = dd->dim[2];
 +            for(j=0; j<4; j++)
 +            {
 +                c->c[2][j] = comm->cell_x0[dim2];
 +            }
 +            if (dd->bGridJump)
 +            {
 +                /* Use the maximum of the i-cells that see a j-cell */
 +                for(i=0; i<zones->nizone; i++)
 +                {
 +                    for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
 +                    {
 +                        if (j >= 4)
 +                        {
 +                            c->c[2][j-4] =
 +                                max(c->c[2][j-4],
 +                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
 +                        }
 +                    }
 +                }
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bc[2] = comm->cell_x0[dim2];
 +                    for(i=0; i<2; i++)
 +                    {
 +                        for(j=0; j<2; j++)
 +                        {
 +                            c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
 +                        }
 +                    }
 +                }
 +            }
 +            
 +            /* Set the upper-right corner for rounding */
 +            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
 +             * Only cell (0,0,0) can see cell 7 (1,1,1)
 +             */
 +            c->cr1[0] = comm->cell_x1[dim1];
 +            c->cr1[3] = comm->cell_x1[dim1];
 +            if (dd->bGridJump)
 +            {
 +                c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Determine which cg's we need to send in this pulse from this zone.
 + * A charge group in [cg0,cg1) is selected when it lies within the
 + * non-bonded communication distance (r_comm2) of the receiving domain,
 + * or, while bonded distances are still checked (bDistBonded), within
 + * r_bcomm2 and - with bBondComm - involved in a bonded interaction
 + * with a charge group that is not local yet (missing_link).
 + * Selected cg's are appended to ind->index, their global indices to
 + * *ibuf and their (pbc-corrected) positions to vbuf.  On return
 + * *nsend_ptr and *nat_ptr hold the updated totals and *nsend_z_ptr the
 + * number of cg's added by this call.
 + */
 +static void
 +get_zone_pulse_cgs(gmx_domdec_t *dd,
 +                   int zonei, int zone,
 +                   int cg0, int cg1,
 +                   const int *index_gl,
 +                   const int *cgindex,
 +                   int dim, int dim_ind,
 +                   int dim0, int dim1, int dim2,
 +                   real r_comm2, real r_bcomm2,
 +                   matrix box,
 +                   ivec tric_dist,
 +                   rvec *normal,
 +                   real skew_fac2_d, real skew_fac_01,
 +                   rvec *v_d, rvec *v_0, rvec *v_1,
 +                   const dd_corners_t *c,
 +                   rvec sf2_round,
 +                   gmx_bool bDistBonded,
 +                   gmx_bool bBondComm,
 +                   gmx_bool bDist2B,
 +                   gmx_bool bDistMB,
 +                   rvec *cg_cm,
 +                   int *cginfo,
 +                   gmx_domdec_ind_t *ind,
 +                   int **ibuf, int *ibuf_nalloc,
 +                   vec_rvec_t *vbuf,
 +                   int *nsend_ptr,
 +                   int *nat_ptr,
 +                   int *nsend_z_ptr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool bScrew;
 +    gmx_bool bDistMB_pulse;
 +    int  cg,i;
 +    real r2,rb2,r,tric_sh;
 +    rvec rn,rb;
 +    int  dimd;
 +    int  nsend_z,nsend,nat;
 +
 +    comm = dd->comm;
 +
 +    bScrew = (dd->bScrewPBC && dim == XX);
 +
 +    bDistMB_pulse = (bDistMB && bDistBonded);
 +
 +    nsend_z = 0;
 +    nsend   = *nsend_ptr;
 +    nat     = *nat_ptr;
 +
 +    for(cg=cg0; cg<cg1; cg++)
 +    {
 +        /* r2/rb2 accumulate the squared distances to the non-bonded
 +         * and multi-body communication corners, respectively.
 +         */
 +        r2  = 0;
 +        rb2 = 0;
 +        if (tric_dist[dim_ind] == 0)
 +        {
 +            /* Rectangular direction, easy */
 +            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            if (r > 0)
 +            {
 +                r2 += r*r;
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                r = cg_cm[cg][dim] - c->bc[dim_ind];
 +                if (r > 0)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            /* Rounding gives at most a 16% reduction
 +             * in communicated atoms
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                r = cg_cm[cg][dim0] - c->cr0;
 +                /* This is the first dimension, so always r >= 0 */
 +                r2 += r*r;
 +                if (bDistMB_pulse)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                r = cg_cm[cg][dim1] - c->cr1[zone];
 +                if (r > 0)
 +                {
 +                    r2 += r*r;
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    r = cg_cm[cg][dim1] - c->bcr1;
 +                    if (r > 0)
 +                    {
 +                        rb2 += r*r;
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Triclinic direction, more complicated */
 +            clear_rvec(rn);
 +            clear_rvec(rb);
 +            /* Rounding, conservative as the skew_fac multiplication
 +             * will slightly underestimate the distance.
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
 +                for(i=dim0+1; i<DIM; i++)
 +                {
 +                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
 +                }
 +                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim0] = rn[dim0];
 +                    rb2 = r2;
 +                }
 +                /* Take care that the cell planes along dim0 might not
 +                 * be orthogonal to those along dim1 and dim2.
 +                 */
 +                for(i=1; i<=dim_ind; i++)
 +                {
 +                    dimd = dd->dim[i];
 +                    if (normal[dim0][dimd] > 0)
 +                    {
 +                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
 +                        if (bDistMB_pulse)
 +                        {
 +                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
 +                        }
 +                    }
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
 +                tric_sh = 0;
 +                for(i=dim1+1; i<DIM; i++)
 +                {
 +                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
 +                }
 +                rn[dim1] += tric_sh;
 +                if (rn[dim1] > 0)
 +                {
 +                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
 +                    /* Take care that the cell planes along dim1
 +                     * might not be orthogonal to that along dim2.
 +                     */
 +                    if (normal[dim1][dim2] > 0)
 +                    {
 +                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
 +                    }
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim1] +=
 +                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
 +                    if (rb[dim1] > 0)
 +                    {
 +                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
 +                        /* Take care of coupling of the distances
 +                         * to the planes along dim0 and dim1 through dim2.
 +                         */
 +                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
 +                        /* Take care that the cell planes along dim1
 +                         * might not be orthogonal to that along dim2.
 +                         */
 +                        if (normal[dim1][dim2] > 0)
 +                        {
 +                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
 +                        }
 +                    }
 +                }
 +            }
 +            /* The distance along the communication direction */
 +            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            tric_sh = 0;
 +            for(i=dim+1; i<DIM; i++)
 +            {
 +                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
 +            }
 +            rn[dim] += tric_sh;
 +            if (rn[dim] > 0)
 +            {
 +                r2 += rn[dim]*rn[dim]*skew_fac2_d;
 +                /* Take care of coupling of the distances
 +                 * to the planes along dim0 and dim1 through dim2.
 +                 */
 +                if (dim_ind == 1 && zonei == 1)
 +                {
 +                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
 +                }
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                clear_rvec(rb);
 +                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
 +                if (rb[dim] > 0)
 +                {
 +                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    if (dim_ind == 1 && zonei == 1)
 +                    {
 +                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
 +                    }
 +                }
 +            }
 +        }
 +        
 +        if (r2 < r_comm2 ||
 +            (bDistBonded &&
 +             ((bDistMB && rb2 < r_bcomm2) ||
 +              (bDist2B && r2  < r_bcomm2)) &&
 +             (!bBondComm ||
 +              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
 +               missing_link(comm->cglink,index_gl[cg],
 +                            comm->bLocalCG)))))
 +        {
 +            /* Make an index to the local charge groups */
 +            if (nsend+1 > ind->nalloc)
 +            {
 +                ind->nalloc = over_alloc_large(nsend+1);
 +                srenew(ind->index,ind->nalloc);
 +            }
 +            if (nsend+1 > *ibuf_nalloc)
 +            {
 +                *ibuf_nalloc = over_alloc_large(nsend+1);
 +                srenew(*ibuf,*ibuf_nalloc);
 +            }
 +            ind->index[nsend] = cg;
 +            (*ibuf)[nsend] = index_gl[cg];
 +            nsend_z++;
 +            vec_rvec_check_alloc(vbuf,nsend+1);
 +            
 +            if (dd->ci[dim] == 0)
 +            {
 +                /* Correct cg_cm for pbc */
 +                rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
 +                if (bScrew)
 +                {
 +                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
 +                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
 +                }
 +            }
 +            else
 +            {
 +                copy_rvec(cg_cm[cg],vbuf->v[nsend]);
 +            }
 +            nsend++;
 +            nat += cgindex[cg+1] - cgindex[cg];
 +        }
 +    }
 +
 +    *nsend_ptr   = nsend;
 +    *nat_ptr     = nat;
 +    *nsend_z_ptr = nsend_z;
 +}
 +
 +static void setup_dd_communication(gmx_domdec_t *dd,
 +                                   matrix box,gmx_ddbox_t *ddbox,
 +                                   t_forcerec *fr,t_state *state,rvec **f)
 +{
 +    int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
 +    int nzone,nzone_send,zone,zonei,cg0,cg1;
 +    int c,i,j,cg,cg_gl,nrcg;
 +    int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_zones_t *zones;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    cginfo_mb_t *cginfo_mb;
 +    gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
 +    real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
 +    dd_corners_t corners;
 +    ivec tric_dist;
 +    rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
 +    real skew_fac2_d,skew_fac_01;
 +    rvec sf2_round;
 +    int  nsend,nat;
 +    int  th;
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Setting up DD communication\n");
 +    }
 +    
 +    comm  = dd->comm;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        cg_cm = fr->cg_cm;
 +        break;
 +    case ecutsVERLET:
 +        cg_cm = state->x;
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +        cg_cm = NULL;
 +    }
 +
 +    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Check if we need to use triclinic distances */
 +        tric_dist[dim_ind] = 0;
 +        for(i=0; i<=dim_ind; i++)
 +        {
 +            if (ddbox->tric_dir[dd->dim[i]])
 +            {
 +                tric_dist[dim_ind] = 1;
 +            }
 +        }
 +    }
 +
 +    bBondComm = comm->bBondComm;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +    
 +    /* Do we need to determine extra distances for only two-body bondeds? */
 +    bDist2B = (bBondComm && !bDistMB);
 +
 +    r_comm2  = sqr(comm->cutoff);
 +    r_bcomm2 = sqr(comm->cutoff_mbody);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
 +    }
 +
 +    zones = &comm->zones;
 +    
 +    dim0 = dd->dim[0];
 +    dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
 +    dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
 +
 +    set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
 +    
 +    /* Triclinic stuff */
 +    normal = ddbox->normal;
 +    skew_fac_01 = 0;
 +    if (dd->ndim >= 2)
 +    {
 +        v_0 = ddbox->v[dim0];
 +        if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
 +        {
 +            /* Determine the coupling coefficient for the distances
 +             * to the cell planes along dim0 and dim1 through dim2.
 +             * This is required for correct rounding.
 +             */
 +            skew_fac_01 =
 +                ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
 +            if (debug)
 +            {
 +                fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
 +            }
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        v_1 = ddbox->v[dim1];
 +    }
 +    
 +    zone_cg_range = zones->cg_range;
 +    index_gl = dd->index_gl;
 +    cgindex  = dd->cgindex;
 +    cginfo_mb = fr->cginfo_mb;
 +    
 +    zone_cg_range[0]   = 0;
 +    zone_cg_range[1]   = dd->ncg_home;
 +    comm->zone_ncg1[0] = dd->ncg_home;
 +    pos_cg             = dd->ncg_home;
 +    
 +    nat_tot = dd->nat_home;
 +    nzone = 1;
 +    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        cd = &comm->cd[dim_ind];
 +        
 +        if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
 +        {
 +            /* No pbc in this dimension, the first node should not comm. */
 +            nzone_send = 0;
 +        }
 +        else
 +        {
 +            nzone_send = nzone;
 +        }
 +
 +        v_d = ddbox->v[dim];
 +        skew_fac2_d = sqr(ddbox->skew_fac[dim]);
 +
 +        cd->bInPlace = TRUE;
 +        for(p=0; p<cd->np; p++)
 +        {
 +            /* Only atoms communicated in the first pulse are used
 +             * for multi-body bonded interactions or for bBondComm.
 +             */
 +            bDistBonded = ((bDistMB || bDist2B) && p == 0);
 +
 +            ind = &cd->ind[p];
 +            nsend = 0;
 +            nat = 0;
 +            for(zone=0; zone<nzone_send; zone++)
 +            {
 +                if (tric_dist[dim_ind] && dim_ind > 0)
 +                {
 +                    /* Determine slightly more optimized skew_fac's
 +                     * for rounding.
 +                     * This reduces the number of communicated atoms
 +                     * by about 10% for 3D DD of rhombic dodecahedra.
 +                     */
 +                    for(dimd=0; dimd<dim; dimd++)
 +                    {
 +                        sf2_round[dimd] = 1;
 +                        if (ddbox->tric_dir[dimd])
 +                        {
 +                            for(i=dd->dim[dimd]+1; i<DIM; i++)
 +                            {
 +                                /* If we are shifted in dimension i
 +                                 * and the cell plane is tilted forward
 +                                 * in dimension i, skip this coupling.
 +                                 */
 +                                if (!(zones->shift[nzone+zone][i] &&
 +                                      ddbox->v[dimd][i][dimd] >= 0))
 +                                {
 +                                    sf2_round[dimd] +=
 +                                        sqr(ddbox->v[dimd][i][dimd]);
 +                                }
 +                            }
 +                            sf2_round[dimd] = 1/sf2_round[dimd];
 +                        }
 +                    }
 +                }
 +
 +                zonei = zone_perm[dim_ind][zone];
 +                if (p == 0)
 +                {
 +                    /* Here we permutate the zones to obtain a convenient order
 +                     * for neighbor searching
 +                     */
 +                    cg0 = zone_cg_range[zonei];
 +                    cg1 = zone_cg_range[zonei+1];
 +                }
 +                else
 +                {
 +                    /* Look only at the cg's received in the previous grid pulse
 +                     */
 +                    cg1 = zone_cg_range[nzone+zone+1];
 +                    cg0 = cg1 - cd->ind[p-1].nrecv[zone];
 +                }
 +
 +#pragma omp parallel for num_threads(comm->nth) schedule(static)
 +                for(th=0; th<comm->nth; th++)
 +                {
 +                    gmx_domdec_ind_t *ind_p;
 +                    int **ibuf_p,*ibuf_nalloc_p;
 +                    vec_rvec_t *vbuf_p;
 +                    int *nsend_p,*nat_p;
 +                    int *nsend_zone_p;
 +                    int cg0_th,cg1_th;
 +
 +                    if (th == 0)
 +                    {
 +                        /* Thread 0 writes in the comm buffers */
 +                        ind_p         = ind;
 +                        ibuf_p        = &comm->buf_int;
 +                        ibuf_nalloc_p = &comm->nalloc_int;
 +                        vbuf_p        = &comm->vbuf;
 +                        nsend_p       = &nsend;
 +                        nat_p         = &nat;
 +                        nsend_zone_p  = &ind->nsend[zone];
 +                    }
 +                    else
 +                    {
 +                        /* Other threads write into temp buffers */
 +                        ind_p         = &comm->dth[th].ind;
 +                        ibuf_p        = &comm->dth[th].ibuf;
 +                        ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
 +                        vbuf_p        = &comm->dth[th].vbuf;
 +                        nsend_p       = &comm->dth[th].nsend;
 +                        nat_p         = &comm->dth[th].nat;
 +                        nsend_zone_p  = &comm->dth[th].nsend_zone;
 +
 +                        comm->dth[th].nsend      = 0;
 +                        comm->dth[th].nat        = 0;
 +                        comm->dth[th].nsend_zone = 0;
 +                    }
 +
 +                    if (comm->nth == 1)
 +                    {
 +                        cg0_th = cg0;
 +                        cg1_th = cg1;
 +                    }
 +                    else
 +                    {
 +                        cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
 +                        cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
 +                    }
 +                    
 +                    /* Get the cg's for this pulse in this zone */
 +                    get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
 +                                       index_gl,cgindex,
 +                                       dim,dim_ind,dim0,dim1,dim2,
 +                                       r_comm2,r_bcomm2,
 +                                       box,tric_dist,
 +                                       normal,skew_fac2_d,skew_fac_01,
 +                                       v_d,v_0,v_1,&corners,sf2_round,
 +                                       bDistBonded,bBondComm,
 +                                       bDist2B,bDistMB,
 +                                       cg_cm,fr->cginfo,
 +                                       ind_p,
 +                                       ibuf_p,ibuf_nalloc_p,
 +                                       vbuf_p,
 +                                       nsend_p,nat_p,
 +                                       nsend_zone_p);
 +                }
 +
 +                /* Append data of threads>=1 to the communication buffers */
 +                for(th=1; th<comm->nth; th++)
 +                {
 +                    dd_comm_setup_work_t *dth;
 +                    int i,ns1;
 +
 +                    dth = &comm->dth[th];
 +
 +                    ns1 = nsend + dth->nsend_zone;
 +                    if (ns1 > ind->nalloc)
 +                    {
 +                        ind->nalloc = over_alloc_dd(ns1);
 +                        srenew(ind->index,ind->nalloc);
 +                    }
 +                    if (ns1 > comm->nalloc_int)
 +                    {
 +                        comm->nalloc_int = over_alloc_dd(ns1);
 +                        srenew(comm->buf_int,comm->nalloc_int);
 +                    }
 +                    if (ns1 > comm->vbuf.nalloc)
 +                    {
 +                        comm->vbuf.nalloc = over_alloc_dd(ns1);
 +                        srenew(comm->vbuf.v,comm->vbuf.nalloc);
 +                    }
 +
 +                    for(i=0; i<dth->nsend_zone; i++)
 +                    {
 +                        ind->index[nsend] = dth->ind.index[i];
 +                        comm->buf_int[nsend] = dth->ibuf[i];
 +                        copy_rvec(dth->vbuf.v[i],
 +                                  comm->vbuf.v[nsend]);
 +                        nsend++;
 +                    }
 +                    nat              += dth->nat;
 +                    ind->nsend[zone] += dth->nsend_zone;
 +                }
 +            }
 +            /* Clear the counts in case we do not have pbc */
 +            for(zone=nzone_send; zone<nzone; zone++)
 +            {
 +                ind->nsend[zone] = 0;
 +            }
 +            ind->nsend[nzone]   = nsend;
 +            ind->nsend[nzone+1] = nat;
 +            /* Communicate the number of cg's and atoms to receive */
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            ind->nsend, nzone+2,
 +                            ind->nrecv, nzone+2);
 +            
 +            /* The rvec buffer is also required for atom buffers of size nsend
 +             * in dd_move_x and dd_move_f.
 +             */
 +            vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
 +
 +            if (p > 0)
 +            {
 +                /* We can receive in place if only the last zone is not empty */
 +                for(zone=0; zone<nzone-1; zone++)
 +                {
 +                    if (ind->nrecv[zone] > 0)
 +                    {
 +                        cd->bInPlace = FALSE;
 +                    }
 +                }
 +                if (!cd->bInPlace)
 +                {
 +                    /* The int buffer is only required here for the cg indices */
 +                    if (ind->nrecv[nzone] > comm->nalloc_int2)
 +                    {
 +                        comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
 +                        srenew(comm->buf_int2,comm->nalloc_int2);
 +                    }
 +                    /* The rvec buffer is also required for atom buffers
 +                     * of size nrecv in dd_move_x and dd_move_f.
 +                     */
 +                    i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
 +                    vec_rvec_check_alloc(&comm->vbuf2,i);
 +                }
 +            }
 +            
 +            /* Make space for the global cg indices */
 +            if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
 +                || dd->cg_nalloc == 0)
 +            {
 +                dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
 +                srenew(index_gl,dd->cg_nalloc);
 +                srenew(cgindex,dd->cg_nalloc+1);
 +            }
 +            /* Communicate the global cg indices */
 +            if (cd->bInPlace)
 +            {
 +                recv_i = index_gl + pos_cg;
 +            }
 +            else
 +            {
 +                recv_i = comm->buf_int2;
 +            }
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            comm->buf_int, nsend,
 +                            recv_i,        ind->nrecv[nzone]);
 +
 +            /* Make space for cg_cm */
 +            dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
 +            if (fr->cutoff_scheme == ecutsGROUP)
 +            {
 +                cg_cm = fr->cg_cm;
 +            }
 +            else
 +            {
 +                cg_cm = state->x;
 +            }
 +            /* Communicate cg_cm */
 +            if (cd->bInPlace)
 +            {
 +                recv_vr = cg_cm + pos_cg;
 +            }
 +            else
 +            {
 +                recv_vr = comm->vbuf2.v;
 +            }
 +            dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
 +                             comm->vbuf.v, nsend,
 +                             recv_vr,      ind->nrecv[nzone]);
 +            
 +            /* Make the charge group index */
 +            if (cd->bInPlace)
 +            {
 +                zone = (p == 0 ? 0 : nzone - 1);
 +                while (zone < nzone)
 +                {
 +                    for(cg=0; cg<ind->nrecv[zone]; cg++)
 +                    {
 +                        cg_gl = index_gl[pos_cg];
 +                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
 +                        nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
 +                        cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
 +                        if (bBondComm)
 +                        {
 +                            /* Update the charge group presence,
 +                             * so we can use it in the next pass of the loop.
 +                             */
 +                            comm->bLocalCG[cg_gl] = TRUE;
 +                        }
 +                        pos_cg++;
 +                    }
 +                    if (p == 0)
 +                    {
 +                        comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
 +                    }
 +                    zone++;
 +                    zone_cg_range[nzone+zone] = pos_cg;
 +                }
 +            }
 +            else
 +            {
 +                /* This part of the code is never executed with bBondComm. */
 +                merge_cg_buffers(nzone,cd,p,zone_cg_range,
 +                                 index_gl,recv_i,cg_cm,recv_vr,
 +                                 cgindex,fr->cginfo_mb,fr->cginfo);
 +                pos_cg += ind->nrecv[nzone];
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        if (!cd->bInPlace)
 +        {
 +            /* Store the atom block for easy copying of communication buffers */
 +            make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
 +        }
 +        nzone += nzone;
 +    }
 +    dd->index_gl = index_gl;
 +    dd->cgindex  = cgindex;
 +    
 +    dd->ncg_tot = zone_cg_range[zones->n];
 +    dd->nat_tot = nat_tot;
 +    comm->nat[ddnatHOME] = dd->nat_home;
 +    for(i=ddnatZONE; i<ddnatNR; i++)
 +    {
 +        comm->nat[i] = dd->nat_tot;
 +    }
 +
 +    if (!bBondComm)
 +    {
 +        /* We don't need to update cginfo, since that was alrady done above.
 +         * So we pass NULL for the forcerec.
 +         */
 +        dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
 +                      NULL,comm->bLocalCG);
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Finished setting up DD communication, zones:");
 +        for(c=0; c<zones->n; c++)
 +        {
 +            fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
 +        }
 +        fprintf(debug,"\n");
 +    }
 +}
 +
 +static void set_cg_boundaries(gmx_domdec_zones_t *zones)
 +{
 +    int c;
 +    
 +    for(c=0; c<zones->nizone; c++)
 +    {
 +        zones->izone[c].cg1  = zones->cg_range[c+1];
 +        zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
 +        zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
 +    }
 +}
 +
 +static void set_zones_size(gmx_domdec_t *dd,
 +                           matrix box,const gmx_ddbox_t *ddbox,
 +                           int zone_start,int zone_end)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_zones_t *zones;
 +    gmx_bool bDistMB;
 +    int  z,zi,zj0,zj1,d,dim;
 +    real rcs,rcmbs;
 +    int  i,j;
 +    real size_j,add_tric;
 +    real vol;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    for(z=zone_start; z<zone_end; z++)
 +    {
 +        /* Copy cell limits to zone limits.
 +         * Valid for non-DD dims and non-shifted dims.
 +         */
 +        copy_rvec(comm->cell_x0,zones->size[z].x0);
 +        copy_rvec(comm->cell_x1,zones->size[z].x1);
 +    }
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        for(z=0; z<zones->n; z++)
 +        {
 +            /* With a staggered grid we have different sizes
 +             * for non-shifted dimensions.
 +             */
 +            if (dd->bGridJump && zones->shift[z][dim] == 0)
 +            {
 +                if (d == 1)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +                else if (d == 2)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +            }
 +        }
 +
 +        rcs   = comm->cutoff;
 +        rcmbs = comm->cutoff_mbody;
 +        if (ddbox->tric_dir[dim])
 +        {
 +            rcs   /= ddbox->skew_fac[dim];
 +            rcmbs /= ddbox->skew_fac[dim];
 +        }
 +
 +        /* Set the lower limit for the shifted zone dimensions */
 +        for(z=zone_start; z<zone_end; z++)
 +        {
 +            if (zones->shift[z][dim] > 0)
 +            {
 +                dim = dd->dim[d];
 +                if (!dd->bGridJump || d == 0)
 +                {
 +                    zones->size[z].x0[dim] = comm->cell_x1[dim];
 +                    zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
 +                }
 +                else
 +                {
 +                    /* Here we take the lower limit of the zone from
 +                     * the lowest domain of the zone below.
 +                     */
 +                    if (z < 4)
 +                    {
 +                        zones->size[z].x0[dim] =
 +                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
 +                    }
 +                    else
 +                    {
 +                        if (d == 1)
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                zones->size[zone_perm[2][z-4]].x0[dim];
 +                        }
 +                        else
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
 +                        }
 +                    }
 +                    /* A temporary limit, is updated below */
 +                    zones->size[z].x1[dim] = zones->size[z].x0[dim];
 +
 +                    if (bDistMB)
 +                    {
 +                        for(zi=0; zi<zones->nizone; zi++)
 +                        {
 +                            if (zones->shift[zi][dim] == 0)
 +                            {
 +                                /* This takes the whole zone into account.
 +                                 * With multiple pulses this will lead
 +                                 * to a larger zone then strictly necessary.
 +                                 */
 +                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                             zones->size[zi].x1[dim]+rcmbs);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        /* Loop over the i-zones to set the upper limit of each
 +         * j-zone they see.
 +         */
 +        for(zi=0; zi<zones->nizone; zi++)
 +        {
 +            if (zones->shift[zi][dim] == 0)
 +            {
 +                for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
 +                {
 +                    if (zones->shift[z][dim] > 0)
 +                    {
 +                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                     zones->size[zi].x1[dim]+rcs);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    for(z=zone_start; z<zone_end; z++)
 +    {
 +        for(i=0; i<DIM; i++)
 +        {
 +            zones->size[z].bb_x0[i] = zones->size[z].x0[i];
 +            zones->size[z].bb_x1[i] = zones->size[z].x1[i];
 +
 +            for(j=i+1; j<ddbox->npbcdim; j++)
 +            {
 +                /* With 1D domain decomposition the cg's are not in
 +                 * the triclinic box, but trilinic x-y and rectangular y-z.
 +                 */
 +                if (box[j][i] != 0 &&
 +                    !(dd->ndim == 1 && i == YY && j == ZZ))
 +                {
 +                    /* Correct for triclinic offset of the lower corner */
 +                    add_tric = zones->size[z].x0[j]*box[j][i]/box[j][j];
 +                    zones->size[z].bb_x0[i] += add_tric;
 +                    zones->size[z].bb_x1[i] += add_tric;
 +
 +                    /* Correct for triclinic offset of the upper corner */
 +                    size_j = zones->size[z].x1[j] - zones->size[z].x0[j];
 +                    add_tric = size_j*box[j][i]/box[j][j];
 +
 +                    if (box[j][i] < 0)
 +                    {
 +                        zones->size[z].bb_x0[i] += add_tric;
 +                    }
 +                    else
 +                    {
 +                        zones->size[z].bb_x1[i] += add_tric;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    if (zone_start == 0)
 +    {
 +        vol = 1;
 +        for(dim=0; dim<DIM; dim++)
 +        {
 +            vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
 +        }
 +        zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
 +    }
 +
 +    if (debug)
 +    {
 +        for(z=zone_start; z<zone_end; z++)
 +        {
 +            fprintf(debug,"zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].x0[XX],zones->size[z].x1[XX],
 +                    zones->size[z].x0[YY],zones->size[z].x1[YY],
 +                    zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
 +            fprintf(debug,"zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
 +                    zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
 +                    zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
 +        }
 +    }
 +}
 +
 +static int comp_cgsort(const void *a,const void *b)
 +{
 +    int comp;
 +    
 +    gmx_cgsort_t *cga,*cgb;
 +    cga = (gmx_cgsort_t *)a;
 +    cgb = (gmx_cgsort_t *)b;
 +    
 +    comp = cga->nsc - cgb->nsc;
 +    if (comp == 0)
 +    {
 +        comp = cga->ind_gl - cgb->ind_gl;
 +    }
 +    
 +    return comp;
 +}
 +
 +static void order_int_cg(int n,const gmx_cgsort_t *sort,
 +                         int *a,int *buf)
 +{
 +    int i;
 +    
 +    /* Order the data */
 +    for(i=0; i<n; i++)
 +    {
 +        buf[i] = a[sort[i].ind];
 +    }
 +    
 +    /* Copy back to the original array */
 +    for(i=0; i<n; i++)
 +    {
 +        a[i] = buf[i];
 +    }
 +}
 +
 +static void order_vec_cg(int n,const gmx_cgsort_t *sort,
 +                         rvec *v,rvec *buf)
 +{
 +    int i;
 +    
 +    /* Order the data */
 +    for(i=0; i<n; i++)
 +    {
 +        copy_rvec(v[sort[i].ind],buf[i]);
 +    }
 +    
 +    /* Copy back to the original array */
 +    for(i=0; i<n; i++)
 +    {
 +        copy_rvec(buf[i],v[i]);
 +    }
 +}
 +
 +static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
 +                           rvec *v,rvec *buf)
 +{
 +    int a,atot,cg,cg0,cg1,i;
 +    
 +    if (cgindex == NULL)
 +    {
 +        /* Avoid the useless loop of the atoms within a cg */
 +        order_vec_cg(ncg,sort,v,buf);
 +
 +        return;
 +    }
 +
 +    /* Order the data */
 +    a = 0;
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        cg0 = cgindex[sort[cg].ind];
 +        cg1 = cgindex[sort[cg].ind+1];
 +        for(i=cg0; i<cg1; i++)
 +        {
 +            copy_rvec(v[i],buf[a]);
 +            a++;
 +        }
 +    }
 +    atot = a;
 +    
 +    /* Copy back to the original array */
 +    for(a=0; a<atot; a++)
 +    {
 +        copy_rvec(buf[a],v[a]);
 +    }
 +}
 +
 +static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
 +                         int nsort_new,gmx_cgsort_t *sort_new,
 +                         gmx_cgsort_t *sort1)
 +{
 +    int i1,i2,i_new;
 +    
 +    /* The new indices are not very ordered, so we qsort them */
 +    qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
 +    
 +    /* sort2 is already ordered, so now we can merge the two arrays */
 +    i1 = 0;
 +    i2 = 0;
 +    i_new = 0;
 +    while(i2 < nsort2 || i_new < nsort_new)
 +    {
 +        if (i2 == nsort2)
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +        else if (i_new == nsort_new)
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
 +                 (sort2[i2].nsc == sort_new[i_new].nsc &&
 +                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +    }
 +}
 +
 +static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t *cgsort,*sort_i;
 +    int  ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
 +    int  sort_last,sort_skip;
 +
 +    sort = dd->comm->sort;
 +
 +    a = fr->ns.grid->cell_index;
 +
 +    moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
 +
 +    if (ncg_home_old >= 0)
 +    {
 +        /* The charge groups that remained in the same ns grid cell
 +         * are completely ordered. So we can sort efficiently by sorting
 +         * the charge groups that did move into the stationary list.
 +         */
 +        ncg_new = 0;
 +        nsort2 = 0;
 +        nsort_new = 0;
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            /* Check if this cg did not move to another node */
 +            if (a[i] < moved)
 +            {
 +                if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
 +                {
 +                    /* This cg is new on this node or moved ns grid cell */
 +                    if (nsort_new >= sort->sort_new_nalloc)
 +                    {
 +                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
 +                        srenew(sort->sort_new,sort->sort_new_nalloc);
 +                    }
 +                    sort_i = &(sort->sort_new[nsort_new++]);
 +                }
 +                else
 +                {
 +                    /* This cg did not move */
 +                    sort_i = &(sort->sort2[nsort2++]);
 +                }
 +                /* Sort on the ns grid cell indices
 +                 * and the global topology index.
 +                 * index_gl is irrelevant with cell ns,
 +                 * but we set it here anyhow to avoid a conditional.
 +                 */
 +                sort_i->nsc    = a[i];
 +                sort_i->ind_gl = dd->index_gl[i];
 +                sort_i->ind    = i;
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
 +                    nsort2,nsort_new);
 +        }
 +        /* Sort efficiently */
 +        ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
 +                     sort->sort);
 +    }
 +    else
 +    {
 +        cgsort = sort->sort;
 +        ncg_new = 0;
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            /* Sort on the ns grid cell indices
 +             * and the global topology index
 +             */
 +            cgsort[i].nsc    = a[i];
 +            cgsort[i].ind_gl = dd->index_gl[i];
 +            cgsort[i].ind    = i;
 +            if (cgsort[i].nsc < moved)
 +            {
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
 +        }
 +        /* Determine the order of the charge groups using qsort */
 +        qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
 +    }
 +
 +    return ncg_new;
 +}
 +
 +static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
 +{
 +    gmx_cgsort_t *sort;
 +    int  ncg_new,i,*a,na;
 +
 +    sort = dd->comm->sort->sort;
 +
 +    nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
 +
 +    ncg_new = 0;
 +    for(i=0; i<na; i++)
 +    {
 +        if (a[i] >= 0)
 +        {
 +            sort[ncg_new].ind = a[i];
 +            ncg_new++;
 +        }
 +    }
 +
 +    return ncg_new;
 +}
 +
 +static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
 +                          rvec *cgcm,t_forcerec *fr,t_state *state,
 +                          int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t *cgsort,*sort_i;
 +    int  *cgindex;
 +    int  ncg_new,i,*ibuf,cgsize;
 +    rvec *vbuf;
 +    
 +    sort = dd->comm->sort;
 +    
 +    if (dd->ncg_home > sort->sort_nalloc)
 +    {
 +        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(sort->sort,sort->sort_nalloc);
 +        srenew(sort->sort2,sort->sort_nalloc);
 +    }
 +    cgsort = sort->sort;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        ncg_new = dd_sort_order(dd,fr,ncg_home_old);
 +        break;
 +    case ecutsVERLET:
 +        ncg_new = dd_sort_order_nbnxn(dd,fr);
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +        ncg_new = 0;
 +    }
 +
 +    /* We alloc with the old size, since cgindex is still old */
 +    vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
 +    vbuf = dd->comm->vbuf.v;
 +    
 +    if (dd->comm->bCGs)
 +    {
 +        cgindex = dd->cgindex;
 +    }
 +    else
 +    {
 +        cgindex = NULL;
 +    }
 +
 +    /* Remove the charge groups which are no longer at home here */
 +    dd->ncg_home = ncg_new;
 +    if (debug)
 +    {
 +        fprintf(debug,"Set the new home charge group count to %d\n",
 +                dd->ncg_home);
 +    }
 +    
 +    /* Reorder the state */
 +    for(i=0; i<estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +            case estX:
 +                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
 +                break;
 +            case estV:
 +                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
 +                break;
 +            case estSDX:
 +                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
 +                break;
 +            case estCGP:
 +                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
 +                break;
 +            case estLD_RNG:
 +            case estLD_RNGI:
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* No ordering required */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_sort_state");
 +                break;
 +            }
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* Reorder cgcm */
 +        order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
 +    }
 +    
 +    if (dd->ncg_home+1 > sort->ibuf_nalloc)
 +    {
 +        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
 +        srenew(sort->ibuf,sort->ibuf_nalloc);
 +    }
 +    ibuf = sort->ibuf;
 +    /* Reorder the global cg index */
 +    order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
 +    /* Reorder the cginfo */
 +    order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
 +    /* Rebuild the local cg index */
 +    if (dd->comm->bCGs)
 +    {
 +        ibuf[0] = 0;
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
 +            ibuf[i+1] = ibuf[i] + cgsize;
 +        }
 +        for(i=0; i<dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = ibuf[i];
 +        }
 +    }
 +    else
 +    {
 +        for(i=0; i<dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = i;
 +        }
 +    }
 +    /* Set the home atom number */
 +    dd->nat_home = dd->cgindex[dd->ncg_home];
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        /* The atoms are now exactly in grid order, update the grid order */
 +        nbnxn_set_atomorder(fr->nbv->nbs);
 +    }
 +    else
 +    {
 +        /* Copy the sorted ns cell indices back to the ns grid struct */
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            fr->ns.grid->cell_index[i] = cgsort[i].nsc;
 +        }
 +        fr->ns.grid->nr = dd->ncg_home;
 +    }
 +}
 +
 +static void add_dd_statistics(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int ddnat;
 +    
 +    comm = dd->comm;
 +    
 +    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] +=
 +            comm->nat[ddnat] - comm->nat[ddnat-1];
 +    }
 +    comm->ndecomp++;
 +}
 +
 +void reset_dd_statistics_counters(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int ddnat;
 +    
 +    comm = dd->comm;
 +
 +    /* Reset all the statistics and counters for total run counting */
 +    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] = 0;
 +    }
 +    comm->ndecomp = 0;
 +    comm->nload = 0;
 +    comm->load_step = 0;
 +    comm->load_sum = 0;
 +    comm->load_max = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf = 0;
 +    comm->load_pme = 0;
 +}
 +
 +void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int ddnat;
 +    double av;
 +   
 +    comm = cr->dd->comm;
 +    
 +    gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
 +    
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +    
 +    fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
 +            
 +    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
 +    {
 +        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
 +        switch(ddnat)
 +        {
 +        case ddnatZONE:
 +            fprintf(fplog,
 +                    " av. #atoms communicated per step for force:  %d x %.1f\n",
 +                    2,av);
 +            break;
 +        case ddnatVSITE:
 +            if (cr->dd->vsite_comm)
 +            {
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for vsites: %d x %.1f\n",
 +                        (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
 +                        av);
 +            }
 +            break;
 +        case ddnatCON:
 +            if (cr->dd->constraint_comm)
 +            {
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
 +                        1 + ir->nLincsIter,av);
 +            }
 +            break;
 +        default:
 +            gmx_incons(" Unknown type for DD statistics");
 +        }
 +    }
 +    fprintf(fplog,"\n");
 +    
 +    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
 +    {
 +        print_dd_load_av(fplog,cr->dd);
 +    }
 +}
 +
 +void dd_partition_system(FILE            *fplog,
 +                         gmx_large_int_t      step,
 +                         t_commrec       *cr,
 +                         gmx_bool            bMasterState,
 +                         int             nstglobalcomm,
 +                         t_state         *state_global,
 +                         gmx_mtop_t      *top_global,
 +                         t_inputrec      *ir,
 +                         t_state         *state_local,
 +                         rvec            **f,
 +                         t_mdatoms       *mdatoms,
 +                         gmx_localtop_t  *top_local,
 +                         t_forcerec      *fr,
 +                         gmx_vsite_t     *vsite,
 +                         gmx_shellfc_t   shellfc,
 +                         gmx_constr_t    constr,
 +                         t_nrnb          *nrnb,
 +                         gmx_wallcycle_t wcycle,
 +                         gmx_bool            bVerbose)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    gmx_ddbox_t ddbox={0};
 +    t_block *cgs_gl;
 +    gmx_large_int_t step_pcoupl;
 +    rvec cell_ns_x0,cell_ns_x1;
 +    int  i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
 +    gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
 +    gmx_bool bRedist,bSortCG,bResortAll;
 +    ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
 +    real grid_density;
 +    char sbuf[22];
 +      
 +    dd = cr->dd;
 +    comm = dd->comm;
 +
 +    bBoxChanged = (bMasterState || DEFORM(*ir));
 +    if (ir->epc != epcNO)
 +    {
 +        /* With nstpcouple > 1 pressure coupling happens.
 +         * one step after calculating the pressure.
 +         * Box scaling happens at the end of the MD step,
 +         * after the DD partitioning.
 +         * We therefore have to do DLB in the first partitioning
 +         * after an MD step where P-coupling occured.
 +         * We need to determine the last step in which p-coupling occurred.
 +         * MRS -- need to validate this for vv?
 +         */
 +        n = ir->nstpcouple;
 +        if (n == 1)
 +        {
 +            step_pcoupl = step - 1;
 +        }
 +        else
 +        {
 +            step_pcoupl = ((step - 1)/n)*n + 1;
 +        }
 +        if (step_pcoupl >= comm->partition_step)
 +        {
 +            bBoxChanged = TRUE;
 +        }
 +    }
 +
 +    bNStGlobalComm = (step % nstglobalcomm == 0);
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        bDoDLB = FALSE;
 +    }
 +    else
 +    {
 +        /* Should we do dynamic load balacing this step?
 +         * Since it requires (possibly expensive) global communication,
 +         * we might want to do DLB less frequently.
 +         */
 +        if (bBoxChanged || ir->epc != epcNO)
 +        {
 +            bDoDLB = bBoxChanged;
 +        }
 +        else
 +        {
 +            bDoDLB = bNStGlobalComm;
 +        }
 +    }
 +
 +    /* Check if we have recorded loads on the nodes */
 +    if (comm->bRecordLoad && dd_load_count(comm))
 +    {
 +        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
 +        {
 +            /* Check if we should use DLB at the second partitioning
 +             * and every 100 partitionings,
 +             * so the extra communication cost is negligible.
 +             */
 +            n = max(100,nstglobalcomm);
 +            bCheckDLB = (comm->n_load_collect == 0 ||
 +                         comm->n_load_have % n == n-1);
 +        }
 +        else
 +        {
 +            bCheckDLB = FALSE;
 +        }
 +        
 +        /* Print load every nstlog, first and last step to the log file */
 +        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
 +                    comm->n_load_collect == 0 ||
 +                    (ir->nsteps >= 0 &&
 +                     (step + ir->nstlist > ir->init_step + ir->nsteps)));
 +
 +        /* Avoid extra communication due to verbose screen output
 +         * when nstglobalcomm is set.
 +         */
 +        if (bDoDLB || bLogLoad || bCheckDLB ||
 +            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
 +        {
 +            get_load_distribution(dd,wcycle);
 +            if (DDMASTER(dd))
 +            {
 +                if (bLogLoad)
 +                {
 +                    dd_print_load(fplog,dd,step-1);
 +                }
 +                if (bVerbose)
 +                {
 +                    dd_print_load_verbose(dd);
 +                }
 +            }
 +            comm->n_load_collect++;
 +
 +            if (bCheckDLB) {
 +                /* Since the timings are node dependent, the master decides */
 +                if (DDMASTER(dd))
 +                {
 +                    bTurnOnDLB =
 +                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
 +                    if (debug)
 +                    {
 +                        fprintf(debug,"step %s, imb loss %f\n",
 +                                gmx_step_str(step,sbuf),
 +                                dd_force_imb_perf_loss(dd));
 +                    }
 +                }
 +                dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
 +                if (bTurnOnDLB)
 +                {
 +                    turn_on_dlb(fplog,cr,step);
 +                    bDoDLB = TRUE;
 +                }
 +            }
 +        }
 +        comm->n_load_have++;
 +    }
 +
 +    cgs_gl = &comm->cgs_gl;
 +
 +    bRedist = FALSE;
 +    if (bMasterState)
 +    {
 +        /* Clear the old state */
 +        clear_dd_indices(dd,0,0);
 +
 +        set_ddbox(dd,bMasterState,cr,ir,state_global->box,
 +                  TRUE,cgs_gl,state_global->x,&ddbox);
 +    
 +        get_cg_distribution(fplog,step,dd,cgs_gl,
 +                            state_global->box,&ddbox,state_global->x);
 +        
 +        dd_distribute_state(dd,cgs_gl,
 +                            state_global,state_local,f);
 +        
 +        dd_make_local_cgs(dd,&top_local->cgs);
 +        
 +        /* Ensure that we have space for the new distribution */
 +        dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            calc_cgcm(fplog,0,dd->ncg_home,
 +                      &top_local->cgs,state_local->x,fr->cg_cm);
 +        }
 +        
 +        inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 +        
 +        dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
 +
 +        cg0 = 0;
 +    }
 +    else if (state_local->ddp_count != dd->ddp_count)
 +    {
 +        if (state_local->ddp_count > dd->ddp_count)
 +        {
 +            gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
 +        }
 +        
 +        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
 +        {
 +            gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
 +        }
 +        
 +        /* Clear the old state */
 +        clear_dd_indices(dd,0,0);
 +        
 +        /* Build the new indices */
 +        rebuild_cgindex(dd,cgs_gl->index,state_local);
 +        make_dd_indices(dd,cgs_gl->index,0);
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            /* Redetermine the cg COMs */
 +            calc_cgcm(fplog,0,dd->ncg_home,
 +                      &top_local->cgs,state_local->x,fr->cg_cm);
 +        }
 +        
 +        inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
 +
 +        set_ddbox(dd,bMasterState,cr,ir,state_local->box,
 +                  TRUE,&top_local->cgs,state_local->x,&ddbox);
 +
 +        bRedist = comm->bDynLoadBal;
 +    }
 +    else
 +    {
 +        /* We have the full state, only redistribute the cgs */
 +
 +        /* Clear the non-home indices */
 +        clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
 +
 +        /* Avoid global communication for dim's without pbc and -gcom */
 +        if (!bNStGlobalComm)
 +        {
 +            copy_rvec(comm->box0    ,ddbox.box0    );
 +            copy_rvec(comm->box_size,ddbox.box_size);
 +        }
 +        set_ddbox(dd,bMasterState,cr,ir,state_local->box,
 +                  bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
 +
 +        bBoxChanged = TRUE;
 +        bRedist = TRUE;
 +    }
 +    /* For dim's without pbc and -gcom */
 +    copy_rvec(ddbox.box0    ,comm->box0    );
 +    copy_rvec(ddbox.box_size,comm->box_size);
 +    
 +    set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
 +                      step,wcycle);
 +    
 +    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
 +    {
 +        write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
 +    }
 +    
 +    /* Check if we should sort the charge groups */
 +    if (comm->nstSortCG > 0)
 +    {
 +        bSortCG = (bMasterState ||
 +                   (bRedist && (step % comm->nstSortCG == 0)));
 +    }
 +    else
 +    {
 +        bSortCG = FALSE;
 +    }
 +
 +    ncg_home_old = dd->ncg_home;
 +
 +    ncg_moved = 0;
 +    if (bRedist)
 +    {
 +        wallcycle_sub_start(wcycle,ewcsDD_REDIST);
 +
 +        dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
 +                           state_local,f,fr,mdatoms,
 +                           !bSortCG,nrnb,&cg0,&ncg_moved);
 +
 +        wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
 +    }
 +    
 +    get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
 +                          dd,&ddbox,
 +                          &comm->cell_x0,&comm->cell_x1,
 +                          dd->ncg_home,fr->cg_cm,
 +                          cell_ns_x0,cell_ns_x1,&grid_density);
 +
 +    if (bBoxChanged)
 +    {
 +        comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
 +    }
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        copy_ivec(fr->ns.grid->n,ncells_old);
 +        grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
 +                   state_local->box,cell_ns_x0,cell_ns_x1,
 +                   fr->rlistlong,grid_density);
 +        break;
 +    case ecutsVERLET:
 +        nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +    }
 +    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
 +    copy_ivec(ddbox.tric_dir,comm->tric_dir);
 +
 +    if (bSortCG)
 +    {
 +        wallcycle_sub_start(wcycle,ewcsDD_GRID);
 +
 +        /* Sort the state on charge group position.
 +         * This enables exact restarts from this step.
 +         * It also improves performance by about 15% with larger numbers
 +         * of atoms per node.
 +         */
 +        
 +        /* Fill the ns grid with the home cell,
 +         * so we can sort with the indices.
 +         */
 +        set_zones_ncg_home(dd);
 +
 +        switch (fr->cutoff_scheme)
 +        {
 +        case ecutsVERLET:
 +            set_zones_size(dd,state_local->box,&ddbox,0,1);
 +
 +            nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
 +                              0,
 +                              comm->zones.size[0].bb_x0,
 +                              comm->zones.size[0].bb_x1,
 +                              0,dd->ncg_home,
 +                              comm->zones.dens_zone0,
 +                              fr->cginfo,
 +                              state_local->x,
 +                              ncg_moved,comm->moved,
 +                              fr->nbv->grp[eintLocal].kernel_type,
 +                              fr->nbv->grp[eintLocal].nbat);
 +
 +            nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
 +            break;
 +        case ecutsGROUP:
 +            fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
 +                      0,dd->ncg_home,fr->cg_cm);
 +            
 +            copy_ivec(fr->ns.grid->n,ncells_new);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +        }
 +
 +        bResortAll = bMasterState;
 +   
 +        /* Check if we can user the old order and ns grid cell indices
 +         * of the charge groups to sort the charge groups efficiently.
 +         */
 +        if (ncells_new[XX] != ncells_old[XX] ||
 +            ncells_new[YY] != ncells_old[YY] ||
 +            ncells_new[ZZ] != ncells_old[ZZ])
 +        {
 +            bResortAll = TRUE;
 +        }
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"Step %s, sorting the %d home charge groups\n",
 +                    gmx_step_str(step,sbuf),dd->ncg_home);
 +        }
 +        dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
 +                      bResortAll ? -1 : ncg_home_old);
 +        /* Rebuild all the indices */
 +        cg0 = 0;
 +        ga2la_clear(dd->ga2la);
 +
 +        wallcycle_sub_stop(wcycle,ewcsDD_GRID);
 +    }
 +
 +    wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
 +    
 +    /* Setup up the communication and communicate the coordinates */
 +    setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
 +    
 +    /* Set the indices */
 +    make_dd_indices(dd,cgs_gl->index,cg0);
 +
 +    /* Set the charge group boundaries for neighbor searching */
 +    set_cg_boundaries(&comm->zones);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        set_zones_size(dd,state_local->box,&ddbox,
 +                       bSortCG ? 1 : 0,comm->zones.n);
 +    }
 +
 +    wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
 +
 +    /*
 +    write_dd_pdb("dd_home",step,"dump",top_global,cr,
 +                 -1,state_local->x,state_local->box);
 +    */
 +
 +    wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
 +    
 +    /* Extract a local topology from the global topology */
 +    for(i=0; i<dd->ndim; i++)
 +    {
 +        np[dd->dim[i]] = comm->cd[i].np;
 +    }
 +    dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
 +                      comm->cellsize_min,np,
 +                      fr,
 +                      fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
 +                      vsite,top_global,top_local);
 +
 +    wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
 +
 +    wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
 +    
 +    /* Set up the special atom communication */
 +    n = comm->nat[ddnatZONE];
 +    for(i=ddnatZONE+1; i<ddnatNR; i++)
 +    {
 +        switch(i)
 +        {
 +        case ddnatVSITE:
 +            if (vsite && vsite->n_intercg_vsite)
 +            {
 +                n = dd_make_local_vsites(dd,n,top_local->idef.il);
 +            }
 +            break;
 +        case ddnatCON:
 +            if (dd->bInterCGcons || dd->bInterCGsettles)
 +            {
 +                /* Only for inter-cg constraints we need special code */
 +                n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
 +                                              constr,ir->nProjOrder,
 +                                              top_local->idef.il);
 +            }
 +            break;
 +        default:
 +            gmx_incons("Unknown special atom type setup");
 +        }
 +        comm->nat[i] = n;
 +    }
 +
 +    wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
 +
 +    wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
 +
 +    /* Make space for the extra coordinates for virtual site
 +     * or constraint communication.
 +     */
 +    state_local->natoms = comm->nat[ddnatNR-1];
 +    if (state_local->natoms > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local,f,state_local->natoms);
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        if (vsite && vsite->n_intercg_vsite)
 +        {
 +            nat_f_novirsum = comm->nat[ddnatVSITE];
 +        }
 +        else
 +        {
 +            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
 +            {
 +                nat_f_novirsum = dd->nat_tot;
 +            }
 +            else
 +            {
 +                nat_f_novirsum = dd->nat_home;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nat_f_novirsum = 0;
 +    }
 +
 +    /* Set the number of atoms required for the force calculation.
 +     * Forces need to be constrained when using a twin-range setup
 +     * or with energy minimization. For simple simulations we could
 +     * avoid some allocation, zeroing and copying, but this is
 +     * probably not worth the complications ande checking.
 +     */
 +    forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
 +                        dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
 +
 +    /* We make the all mdatoms up to nat_tot_con.
 +     * We could save some work by only setting invmass
 +     * between nat_tot and nat_tot_con.
 +     */
 +    /* This call also sets the new number of home particles to dd->nat_home */
 +    atoms2md(top_global,ir,
 +             comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
 +
 +    /* Now we have the charges we can sort the FE interactions */
 +    dd_sort_local_top(dd,mdatoms,top_local);
 +
++    if (vsite != NULL)
++    {
++        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
++        split_vsites_over_threads(top_local->idef.il,mdatoms,FALSE,vsite);
++    }
++
 +    if (shellfc)
 +    {
 +        /* Make the local shell stuff, currently no communication is done */
 +        make_local_shells(cr,mdatoms,shellfc);
 +    }
 +    
 +      if (ir->implicit_solvent)
 +    {
 +        make_local_gb(cr,fr->born,ir->gb_algorithm);
 +    }
 +
 +    init_bonded_thread_force_reduction(fr,&top_local->idef);
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Send the charges to our PME only node */
 +        gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
 +                       mdatoms->chargeA,mdatoms->chargeB,
 +                       dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
 +    }
 +    
 +    if (constr)
 +    {
 +        set_constraints(constr,top_local,ir,mdatoms,cr);
 +    }
 +    
 +    if (ir->ePull != epullNO)
 +    {
 +        /* Update the local pull groups */
 +        dd_make_local_pull_groups(dd,ir->pull,mdatoms);
 +    }
 +    
 +    if (ir->bRot)
 +    {
 +        /* Update the local rotation groups */
 +        dd_make_local_rotation_groups(dd,ir->rot);
 +    }
 +
 +
 +    add_dd_statistics(dd);
 +    
 +    /* Make sure we only count the cycles for this DD partitioning */
 +    clear_dd_cycle_counts(dd);
 +    
 +    /* Because the order of the atoms might have changed since
 +     * the last vsite construction, we need to communicate the constructing
 +     * atom coordinates again (for spreading the forces this MD step).
 +     */
 +    dd_move_x_vsites(dd,state_local->box,state_local->x);
 +
 +    wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
 +    
 +    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
 +    {
 +        dd_move_x(dd,state_local->box,state_local->x);
 +        write_dd_pdb("dd_dump",step,"dump",top_global,cr,
 +                     -1,state_local->x,state_local->box);
 +    }
 +
 +    /* Store the partitioning step */
 +    comm->partition_step = step;
 +    
 +    /* Increase the DD partitioning counter */
 +    dd->ddp_count++;
 +    /* The state currently matches this DD partitioning count, store it */
 +    state_local->ddp_count = dd->ddp_count;
 +    if (bMasterState)
 +    {
 +        /* The DD master node knows the complete cg distribution,
 +         * store the count so we can possibly skip the cg info communication.
 +         */
 +        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
 +    }
 +
 +    if (comm->DD_debug > 0)
 +    {
 +        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
 +        check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
 +                                "after partitioning");
 +    }
 +}
index 9af136828fe5d8c4b3b17e5744d10d61741276f1,0000000000000000000000000000000000000000..4c82c89d4875029b63bae5608f1ad7b51515f6d3
mode 100644,000000..100644
--- /dev/null
@@@ -1,2385 -1,0 +1,2385 @@@
-             if (vsite && vsite->n_intercg_vsite > 0)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of Gromacs        Copyright (c) 1991-2008
 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "names.h"
 +#include "network.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "gmx_random.h"
 +#include "topsort.h"
 +#include "mtop_util.h"
 +#include "mshift.h"
 +#include "vsite.h"
 +#include "gmx_ga2la.h"
 +#include "force.h"
 +#include "gmx_omp_nthreads.h"
 +
 +/* for dd_init_local_state */
 +#define NITEM_DD_INIT_LOCAL_STATE 5
 +
 +typedef struct {
 +    int  *index;  /* Index for each atom into il                  */ 
 +    int  *il;     /* ftype|type|a0|...|an|ftype|...               */
 +} gmx_reverse_ilist_t;
 +
 +typedef struct {
 +    int  a_start;
 +    int  a_end;
 +    int  natoms_mol;
 +    int  type;
 +} gmx_molblock_ind_t;
 +
 +typedef struct gmx_reverse_top {
 +    gmx_bool bExclRequired; /* Do we require all exclusions to be assigned? */
 +    gmx_bool bConstr;       /* Are there constraints in this revserse top?  */
 +    gmx_bool bSettle;       /* Are there settles in this revserse top?  */
 +    gmx_bool bBCheck;       /* All bonded interactions have to be assigned? */
 +    gmx_bool bMultiCGmols;  /* Are the multi charge-group molecules?        */
 +    gmx_reverse_ilist_t *ril_mt; /* Reverse ilist for all moltypes      */
 +    int  ril_mt_tot_size;
 +    int  ilsort;        /* The sorting state of bondeds for free energy */
 +    gmx_molblock_ind_t *mbi;
 +    int nmolblock;
 +
 +    /* Work data structures for multi-threading */
 +    int      nthread;
 +    t_idef   *idef_thread;
 +    int      ***vsite_pbc;
 +    int      **vsite_pbc_nalloc;
 +    int      *nbonded_thread;
 +    t_blocka *excl_thread;
 +    int      *excl_count_thread;
 +    
 +    /* Pointers only used for an error message */
 +    gmx_mtop_t     *err_top_global;
 +    gmx_localtop_t *err_top_local;
 +} gmx_reverse_top_t;
 +
 +static int nral_rt(int ftype)
 +{
 +    /* Returns the number of atom entries for il in gmx_reverse_top_t */
 +    int nral;
 +    
 +    nral = NRAL(ftype);
 +    if (interaction_function[ftype].flags & IF_VSITE)
 +    {
 +        /* With vsites the reverse topology contains
 +         * two extra entries for PBC.
 +         */
 +        nral += 2;
 +    }
 +    
 +    return nral;
 +}
 +
 +/* This function tells which interactions need to be assigned exactly once */
 +static gmx_bool dd_check_ftype(int ftype,gmx_bool bBCheck,
 +                               gmx_bool bConstr,gmx_bool bSettle)
 +{
 +    return (((interaction_function[ftype].flags & IF_BOND) &&
 +             !(interaction_function[ftype].flags & IF_VSITE) &&
 +             (bBCheck || !(interaction_function[ftype].flags & IF_LIMZERO))) ||
 +            (bConstr && (ftype == F_CONSTR || ftype == F_CONSTRNC)) ||
 +            (bSettle && ftype == F_SETTLE));
 +}
 +
 +static void print_error_header(FILE *fplog,char *moltypename,int nprint)
 +{
 +    fprintf(fplog, "\nMolecule type '%s'\n",moltypename);
 +    fprintf(stderr,"\nMolecule type '%s'\n",moltypename);
 +    fprintf(fplog,
 +            "the first %d missing interactions, except for exclusions:\n",
 +            nprint);
 +    fprintf(stderr,
 +            "the first %d missing interactions, except for exclusions:\n",
 +            nprint);
 +}
 +
 +static void print_missing_interactions_mb(FILE *fplog,t_commrec *cr,
 +                                          gmx_reverse_top_t *rt,
 +                                          char *moltypename,
 +                                          gmx_reverse_ilist_t *ril,
 +                                          int a_start,int a_end,
 +                                          int nat_mol,int nmol,
 +                                          t_idef *idef)
 +{
 +    int nril_mol,*assigned,*gatindex;
 +    int ftype,ftype_j,nral,i,j_mol,j,k,a0,a0_mol,mol,a,a_gl;
 +    int nprint;
 +    t_ilist *il;
 +    t_iatom *ia;
 +    gmx_bool bFound;
 +    
 +    nril_mol = ril->index[nat_mol];
 +    snew(assigned,nmol*nril_mol);
 +    
 +    gatindex = cr->dd->gatindex;
 +    for(ftype=0; ftype<F_NRE; ftype++)
 +    {
 +        if (dd_check_ftype(ftype,rt->bBCheck,rt->bConstr,rt->bSettle))
 +        {
 +            nral = NRAL(ftype);
 +            il = &idef->il[ftype];
 +            ia = il->iatoms;
 +            for(i=0; i<il->nr; i+=1+nral)
 +            {
 +                a0     = gatindex[ia[1]];
 +                /* Check if this interaction is in
 +                 * the currently checked molblock.
 +                 */
 +                if (a0 >= a_start && a0 < a_end)
 +                {
 +                    mol    = (a0 - a_start)/nat_mol;
 +                    a0_mol = (a0 - a_start) - mol*nat_mol;
 +                    j_mol  = ril->index[a0_mol];
 +                    bFound = FALSE;
 +                    while (j_mol < ril->index[a0_mol+1] && !bFound)
 +                    {
 +                        j = mol*nril_mol + j_mol;
 +                        ftype_j = ril->il[j_mol];
 +                        /* Here we need to check if this interaction has
 +                         * not already been assigned, since we could have
 +                         * multiply defined interactions.
 +                         */
 +                        if (ftype == ftype_j && ia[0] == ril->il[j_mol+1] &&
 +                            assigned[j] == 0)
 +                        {
 +                            /* Check the atoms */
 +                            bFound = TRUE;
 +                            for(a=0; a<nral; a++)
 +                            {
 +                                if (gatindex[ia[1+a]] !=
 +                                    a_start + mol*nat_mol + ril->il[j_mol+2+a])
 +                                {
 +                                    bFound = FALSE;
 +                                }
 +                            }
 +                            if (bFound)
 +                            {
 +                                assigned[j] = 1;
 +                            }
 +                        }
 +                        j_mol += 2 + nral_rt(ftype_j);
 +                    }
 +                    if (!bFound)
 +                    {
 +                        gmx_incons("Some interactions seem to be assigned multiple times");
 +                    }
 +                }
 +                ia += 1 + nral;
 +            }
 +        }
 +    }
 +    
 +    gmx_sumi(nmol*nril_mol,assigned,cr);
 +    
 +    nprint = 10;
 +    i = 0;
 +    for(mol=0; mol<nmol; mol++)
 +    {
 +        j_mol = 0;
 +        while (j_mol < nril_mol)
 +        {
 +            ftype = ril->il[j_mol];
 +            nral  = NRAL(ftype);
 +            j = mol*nril_mol + j_mol;
 +            if (assigned[j] == 0 &&
 +                !(interaction_function[ftype].flags & IF_VSITE))
 +            {
 +                if (DDMASTER(cr->dd))
 +                {
 +                    if (i == 0)
 +                    {
 +                        print_error_header(fplog,moltypename,nprint);
 +                    }
 +                    fprintf(fplog, "%20s atoms",
 +                            interaction_function[ftype].longname);
 +                    fprintf(stderr,"%20s atoms",
 +                            interaction_function[ftype].longname);
 +                    for(a=0; a<nral; a++) {
 +                        fprintf(fplog, "%5d",ril->il[j_mol+2+a]+1);
 +                        fprintf(stderr,"%5d",ril->il[j_mol+2+a]+1);
 +                    }
 +                    while (a < 4)
 +                    {
 +                        fprintf(fplog, "     ");
 +                        fprintf(stderr,"     ");
 +                        a++;
 +                    }
 +                    fprintf(fplog, " global");
 +                    fprintf(stderr," global");
 +                    for(a=0; a<nral; a++)
 +                    {
 +                        fprintf(fplog, "%6d",
 +                                a_start+mol*nat_mol+ril->il[j_mol+2+a]+1);
 +                        fprintf(stderr,"%6d",
 +                                a_start+mol*nat_mol+ril->il[j_mol+2+a]+1);
 +                    }
 +                    fprintf(fplog, "\n");
 +                    fprintf(stderr,"\n");
 +                }
 +                i++;
 +                if (i >= nprint)
 +                {
 +                    break;
 +                }
 +            }
 +            j_mol += 2 + nral_rt(ftype);
 +        }
 +    }
 +    
 +    sfree(assigned);    
 +}
 +
 +/* Report, per molecule block, the atoms involved in bonded interactions
 + * that could not be assigned during domain decomposition.
 + * Walks all molblocks of mtop and delegates the per-block printing to
 + * print_missing_interactions_mb().
 + */
 +static void print_missing_interactions_atoms(FILE *fplog,t_commrec *cr,
 +                                             gmx_mtop_t *mtop,t_idef *idef)
 +{
 +    int mb,a_start,a_end;
 +    gmx_molblock_t *molb;
 +    gmx_reverse_top_t *rt;
 +    
 +    rt = cr->dd->reverse_top;
 +    
 +    /* Print the atoms in the missing interactions per molblock */
 +    a_end = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        /* Global atom index range [a_start,a_end) covered by this block */
 +        a_start = a_end;
 +        a_end   = a_start + molb->nmol*molb->natoms_mol;
 +        
 +        print_missing_interactions_mb(fplog,cr,rt,
 +                                      *(mtop->moltype[molb->type].name),
 +                                      &rt->ril_mt[molb->type],
 +                                      a_start,a_end,molb->natoms_mol,
 +                                      molb->nmol,
 +                                      idef);
 +    }
 +}
 +
 +/* Diagnose missing (or multiply assigned) bonded interactions after
 + * domain decomposition: sums the per-node interaction counts, prints a
 + * per-function-type breakdown of the deficit on the master node, dumps a
 + * pdb file of the current state, then terminates via gmx_incons/gmx_fatal.
 + * local_count is this run's global sum of locally assigned interactions.
 + */
 +void dd_print_missing_interactions(FILE *fplog,t_commrec *cr,int local_count,  gmx_mtop_t *top_global, t_state *state_local)
 +{
 +    int  ndiff_tot,cl[F_NRE],n,ndiff,rest_global,rest_local;
 +    int  ftype,nral;
 +    char buf[STRLEN];
 +    gmx_domdec_t *dd;
 +    gmx_mtop_t     *err_top_global;
 +    gmx_localtop_t *err_top_local;
 +    
 +    dd = cr->dd;
 +
 +    err_top_global = dd->reverse_top->err_top_global;
 +    err_top_local  = dd->reverse_top->err_top_local;
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\nNot all bonded interactions have been properly assigned to the domain decomposition cells\n");
 +        fflush(fplog);
 +    }
 +    
 +    /* Negative: interactions are missing; positive: multiply assigned */
 +    ndiff_tot = local_count - dd->nbonded_global;
 +    
 +    /* Count the locally assigned interactions per function type */
 +    for(ftype=0; ftype<F_NRE; ftype++)
 +    {
 +        nral = NRAL(ftype);
 +        cl[ftype] = err_top_local->idef.il[ftype].nr/(1+nral);
 +    }
 +    
 +    /* Collective sum over all DD nodes */
 +    gmx_sumi(F_NRE,cl,cr);
 +    
 +    if (DDMASTER(dd))
 +    {
 +        fprintf(fplog,"\nA list of missing interactions:\n");
 +        fprintf(stderr,"\nA list of missing interactions:\n");
 +        rest_global = dd->nbonded_global;
 +        rest_local  = local_count;
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            /* In the reverse and local top all constraints are merged
 +             * into F_CONSTR. So in the if statement we skip F_CONSTRNC
 +             * and add these constraints when doing F_CONSTR.
 +             */
 +            if (((interaction_function[ftype].flags & IF_BOND) &&
 +                 (dd->reverse_top->bBCheck 
 +                  || !(interaction_function[ftype].flags & IF_LIMZERO)))
 +                || (dd->reverse_top->bConstr && ftype == F_CONSTR)
 +                || (dd->reverse_top->bSettle && ftype == F_SETTLE))
 +            {
 +                nral = NRAL(ftype);
 +                n = gmx_mtop_ftype_count(err_top_global,ftype);
 +                if (ftype == F_CONSTR)
 +                {
 +                    n += gmx_mtop_ftype_count(err_top_global,F_CONSTRNC);
 +                }
 +                ndiff = cl[ftype] - n;
 +                if (ndiff != 0)
 +                {
 +                    sprintf(buf,"%20s of %6d missing %6d",
 +                            interaction_function[ftype].longname,n,-ndiff);
 +                    fprintf(fplog,"%s\n",buf);
 +                    fprintf(stderr,"%s\n",buf);
 +                }
 +                rest_global -= n;
 +                rest_local  -= cl[ftype];
 +            }
 +        }
 +        
 +        /* Whatever is left unaccounted for must be exclusions */
 +        ndiff = rest_local - rest_global;
 +        if (ndiff != 0)
 +        {
 +            sprintf(buf,"%20s of %6d missing %6d","exclusions",
 +                    rest_global,-ndiff);
 +            fprintf(fplog,"%s\n",buf);
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +    }
 +    
 +    print_missing_interactions_atoms(fplog,cr,err_top_global,
 +                                     &err_top_local->idef);
 +    write_dd_pdb("dd_dump_err",0,"dump",top_global,cr,
 +                 -1,state_local->x,state_local->box);
 +    if (DDMASTER(dd))
 +    {
 +        if (ndiff_tot > 0)
 +        {
 +            gmx_incons("One or more interactions were multiple assigned in the domain decompostion");
 +        }
 +        else
 +        {
 +            gmx_fatal(FARGS,"%d of the %d bonded interactions could not be calculated because some atoms involved moved further apart than the multi-body cut-off distance (%g nm) or the two-body cut-off distance (%g nm), see option -rdd, for pairs and tabulated bonds also see option -ddcheck",-ndiff_tot,cr->dd->nbonded_global,dd_cutoff_mbody(cr->dd),dd_cutoff_twobody(cr->dd));
 +        }
 +    }
 +}
 +
 +static void global_atomnr_to_moltype_ind(gmx_reverse_top_t *rt,int i_gl,
 +                                       int *mb,int *mt,int *mol,int *i_mol)
 +{
 +  int molb;
 +
 +
 +  gmx_molblock_ind_t *mbi = rt->mbi;
 +  int start = 0;
 +  int end =  rt->nmolblock; /* exclusive */
 +  int mid;
 +
 +  /* binary search for molblock_ind */
 +  while (TRUE) {
 +      mid = (start+end)/2;
 +      if (i_gl >= mbi[mid].a_end)
 +      {
 +          start = mid+1;
 +      }
 +      else if (i_gl < mbi[mid].a_start)
 +      {
 +          end = mid;
 +      }
 +      else
 +      {
 +          break;
 +      }
 +  }
 +
 +  *mb = mid;
 +  mbi += mid;
 +
 +  *mt    = mbi->type;
 +  *mol   = (i_gl - mbi->a_start) / mbi->natoms_mol;
 +  *i_mol = (i_gl - mbi->a_start) - (*mol)*mbi->natoms_mol;
 +}
 +
 +static int count_excls(t_block *cgs,t_blocka *excls,int *n_intercg_excl)
 +{
 +    int n,n_inter,cg,at0,at1,at,excl,atj;
 +    
 +    n = 0;
 +    *n_intercg_excl = 0;
 +    for(cg=0; cg<cgs->nr; cg++)
 +    {
 +        at0 = cgs->index[cg];
 +        at1 = cgs->index[cg+1];
 +        for(at=at0; at<at1; at++)
 +        {
 +            for(excl=excls->index[at]; excl<excls->index[at+1]; excl++)
 +            {
 +                atj = excls->a[excl];
 +                if (atj > at)
 +                {
 +                    n++;
 +                    if (atj < at0 || atj >= at1)
 +                    {
 +                        (*n_intercg_excl)++;
 +                    }
 +                }
 +            }
 +        }
 +    }  
 +    
 +    return n;
 +}
 +
 +/* Core routine for building a reverse interaction list.
 + * Called twice per molecule type: first with bAssign=FALSE to accumulate
 + * per-atom entry sizes into count[], then with bAssign=TRUE to store the
 + * entries into r_il at the offsets given by r_index (with count[] reused
 + * as per-atom write cursors, reset to 0 between the calls).
 + * With bLinkToAllAtoms each non-vsite interaction is linked to all its
 + * atoms (used for charge-group links); otherwise only to its first atom.
 + * Returns the number of bonded interactions that take part in the
 + * missing-interaction check (vsites are excluded from the count).
 + */
 +static int low_make_reverse_ilist(t_ilist *il_mt,t_atom *atom,
 +                                  int **vsite_pbc,
 +                                  int *count,
 +                                  gmx_bool bConstr,gmx_bool bSettle,
 +                                  gmx_bool bBCheck,
 +                                  int *r_index,int *r_il,
 +                                  gmx_bool bLinkToAllAtoms,
 +                                  gmx_bool bAssign)
 +{
 +    int  ftype,nral,i,j,nlink,link;
 +    t_ilist *il;
 +    t_iatom *ia;
 +    atom_id a;
 +    int  nint;
 +    gmx_bool bVSite;
 +    
 +    nint = 0;
 +    for(ftype=0; ftype<F_NRE; ftype++)
 +    {
 +        if ((interaction_function[ftype].flags & (IF_BOND | IF_VSITE)) ||
 +            (bConstr && (ftype == F_CONSTR || ftype == F_CONSTRNC)) ||
 +            (bSettle && ftype == F_SETTLE))
 +        {
 +            bVSite = (interaction_function[ftype].flags & IF_VSITE);
 +            nral = NRAL(ftype);
 +            il = &il_mt[ftype];
 +            ia  = il->iatoms;
 +            for(i=0; i<il->nr; i+=1+nral)
 +            {
 +                ia = il->iatoms + i;
 +                if (bLinkToAllAtoms)
 +                {
 +                    if (bVSite)
 +                    {
 +                        /* We don't need the virtual sites for the cg-links */
 +                        nlink = 0;
 +                    }
 +                    else
 +                    {
 +                        nlink = nral;
 +                    }
 +                }
 +                else
 +                {
 +                    /* Couple to the first atom in the interaction */
 +                    nlink = 1;
 +                }
 +                for(link=0; link<nlink; link++)
 +                {
 +                    a = ia[1+link];
 +                    if (bAssign)
 +                    {
 +                        /* Entry layout: ftype, param index, nral atoms */
 +                        r_il[r_index[a]+count[a]] =
 +                            (ftype == F_CONSTRNC ? F_CONSTR : ftype);
 +                        r_il[r_index[a]+count[a]+1] = ia[0];
 +                        for(j=1; j<1+nral; j++)
 +                        {
 +                            /* Store the molecular atom number */
 +                            r_il[r_index[a]+count[a]+1+j] = ia[j];
 +                        }
 +                    }
 +                    if (interaction_function[ftype].flags & IF_VSITE)
 +                    {
 +                        if (bAssign)
 +                        {
 +                            /* Add an entry to iatoms for storing 
 +                             * which of the constructing atoms are
 +                             * vsites again.
 +                             */
 +                            r_il[r_index[a]+count[a]+2+nral] = 0;
 +                            for(j=2; j<1+nral; j++)
 +                            {
 +                                if (atom[ia[j]].ptype == eptVSite)
 +                                {
 +                                    r_il[r_index[a]+count[a]+2+nral] |= (2<<j);
 +                                }
 +                            }
 +                            /* Store vsite pbc atom in a second extra entry */
 +                            r_il[r_index[a]+count[a]+2+nral+1] =
 +                                (vsite_pbc ? vsite_pbc[ftype-F_VSITE2][i/(1+nral)] : -2);
 +                        }
 +                    }
 +                    else
 +                    {
 +                        /* We do not count vsites since they are always
 +                         * uniquely assigned and can be assigned
 +                         * to multiple nodes with recursive vsites.
 +                         */
 +                        if (bBCheck ||
 +                            !(interaction_function[ftype].flags & IF_LIMZERO))
 +                        {
 +                            nint++;
 +                        }
 +                    }
 +                    count[a] += 2 + nral_rt(ftype);
 +                }
 +            }
 +        }
 +    }
 +    
 +    return nint;
 +}
 +
 +/* Build the reverse interaction list (atom -> interactions) for one
 + * molecule type, using two passes of low_make_reverse_ilist: a counting
 + * pass to size ril_mt->index/il, then an assignment pass to fill them.
 + * Returns the number of bonded interactions for the consistency check.
 + */
 +static int make_reverse_ilist(gmx_moltype_t *molt,
 +                              int **vsite_pbc,
 +                              gmx_bool bConstr,gmx_bool bSettle,
 +                              gmx_bool bBCheck,
 +                              gmx_bool bLinkToAllAtoms,
 +                              gmx_reverse_ilist_t *ril_mt)
 +{
 +    int nat_mt,*count,i,nint_mt;
 +    
 +    /* Count the interactions */
 +    nat_mt = molt->atoms.nr;
 +    snew(count,nat_mt);
 +    low_make_reverse_ilist(molt->ilist,molt->atoms.atom,vsite_pbc,
 +                           count,
 +                           bConstr,bSettle,bBCheck,NULL,NULL,
 +                           bLinkToAllAtoms,FALSE);
 +    
 +    /* Prefix-sum the per-atom counts into the index array */
 +    snew(ril_mt->index,nat_mt+1);
 +    ril_mt->index[0] = 0;
 +    for(i=0; i<nat_mt; i++)
 +    {
 +        ril_mt->index[i+1] = ril_mt->index[i] + count[i];
 +        count[i] = 0;
 +    }
 +    snew(ril_mt->il,ril_mt->index[nat_mt]);
 +    
 +    /* Store the interactions */
 +    nint_mt =
 +        low_make_reverse_ilist(molt->ilist,molt->atoms.atom,vsite_pbc,
 +                               count,
 +                               bConstr,bSettle,bBCheck,
 +                               ril_mt->index,ril_mt->il,
 +                               bLinkToAllAtoms,TRUE);
 +    
 +    sfree(count);
 +    
 +    return nint_mt;
 +}
 +
 +static void destroy_reverse_ilist(gmx_reverse_ilist_t *ril)
 +{
 +    sfree(ril->index);
 +    sfree(ril->il);
 +}
 +
 +/* Allocate and fill a gmx_reverse_top_t for mtop: per-moltype reverse
 + * interaction lists, a molblock index for fast global-atom lookups and
 + * per-thread work buffers for parallel local-topology generation.
 + * *nint returns the global number of bonded interactions to check.
 + */
 +static gmx_reverse_top_t *make_reverse_top(gmx_mtop_t *mtop,gmx_bool bFE,
 +                                           int ***vsite_pbc_molt,
 +                                           gmx_bool bConstr,gmx_bool bSettle,
 +                                           gmx_bool bBCheck,int *nint)
 +{
 +    int mt,i,mb;
 +    gmx_reverse_top_t *rt;
 +    int *nint_mt;
 +    gmx_moltype_t *molt;
 +    int thread;
 +    
 +    snew(rt,1);
 +    
 +    /* Should we include constraints (for SHAKE) in rt? */
 +    rt->bConstr = bConstr;
 +    rt->bSettle = bSettle;
 +    rt->bBCheck = bBCheck;
 +    
 +    rt->bMultiCGmols = FALSE;
 +    snew(nint_mt,mtop->nmoltype);
 +    snew(rt->ril_mt,mtop->nmoltype);
 +    rt->ril_mt_tot_size = 0;
 +    for(mt=0; mt<mtop->nmoltype; mt++)
 +    {
 +        molt = &mtop->moltype[mt];
 +        if (molt->cgs.nr > 1)
 +        {
 +            rt->bMultiCGmols = TRUE;
 +        }
 +        
 +        /* Make the atom to interaction list for this molecule type */
 +        nint_mt[mt] =
 +            make_reverse_ilist(molt,vsite_pbc_molt ? vsite_pbc_molt[mt] : NULL,
 +                               rt->bConstr,rt->bSettle,rt->bBCheck,FALSE,
 +                               &rt->ril_mt[mt]);
 +        
 +        rt->ril_mt_tot_size += rt->ril_mt[mt].index[molt->atoms.nr];
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"The total size of the atom to interaction index is %d integers\n",rt->ril_mt_tot_size);
 +    }
 +    
 +    /* Total bonded count: per-moltype count times number of molecules */
 +    *nint = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        *nint += mtop->molblock[mb].nmol*nint_mt[mtop->molblock[mb].type];
 +    }
 +    sfree(nint_mt);
 +    
 +    if (bFE && gmx_mtop_bondeds_free_energy(mtop))
 +    {
 +        rt->ilsort = ilsortFE_UNSORTED;
 +    }
 +    else {
 +        rt->ilsort = ilsortNO_FE;
 +    }
 +    
 +    /* Make a molblock index for fast searching */
 +    snew(rt->mbi,mtop->nmolblock);
 +    rt->nmolblock = mtop->nmolblock;
 +    i = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        rt->mbi[mb].a_start    = i;
 +        i += mtop->molblock[mb].nmol*mtop->molblock[mb].natoms_mol;
 +        rt->mbi[mb].a_end      = i;
 +        rt->mbi[mb].natoms_mol = mtop->molblock[mb].natoms_mol;
 +        rt->mbi[mb].type       = mtop->molblock[mb].type;
 +    }
 +
 +    /* Per-thread buffers for OpenMP-parallel local topology generation */
 +    rt->nthread = gmx_omp_nthreads_get(emntDomdec);
 +    snew(rt->idef_thread,rt->nthread);
 +    if (vsite_pbc_molt != NULL)
 +    {
 +        snew(rt->vsite_pbc,rt->nthread);
 +        snew(rt->vsite_pbc_nalloc,rt->nthread);
 +        for(thread=0; thread<rt->nthread; thread++)
 +        {
 +            snew(rt->vsite_pbc[thread],F_VSITEN-F_VSITE2+1);
 +            snew(rt->vsite_pbc_nalloc[thread],F_VSITEN-F_VSITE2+1);
 +        }
 +    }
 +    snew(rt->nbonded_thread,rt->nthread);
 +    snew(rt->excl_thread,rt->nthread);
 +    snew(rt->excl_count_thread,rt->nthread);
 +    
 +    return rt;
 +}
 +
 +void dd_make_reverse_top(FILE *fplog,
 +                         gmx_domdec_t *dd,gmx_mtop_t *mtop,
 +                         gmx_vsite_t *vsite,gmx_constr_t constr,
 +                         t_inputrec *ir,gmx_bool bBCheck)
 +{
 +    int mb,n_recursive_vsite,nexcl,nexcl_icg,a;
 +    gmx_molblock_t *molb;
 +    gmx_moltype_t *molt;
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\nLinking all bonded interactions to atoms\n");
 +    }
 +
 +    /* If normal and/or settle constraints act only within charge groups,
 +     * we can store them in the reverse top and simply assign them to domains.
 +     * Otherwise we need to assign them to multiple domains and set up
 +     * the parallel version constraint algoirthm(s).
 +     */
 +    
 +    dd->reverse_top = make_reverse_top(mtop,ir->efep!=efepNO,
 +                                       vsite ? vsite->vsite_pbc_molt : NULL,
 +                                       !dd->bInterCGcons,!dd->bInterCGsettles,
 +                                       bBCheck,&dd->nbonded_global);
 +    
 +    if (dd->reverse_top->ril_mt_tot_size >= 200000 &&
 +        mtop->mols.nr > 1 &&
 +        mtop->nmolblock == 1 && mtop->molblock[0].nmol == 1)
 +    {
 +        /* mtop comes from a pre Gromacs 4 tpr file */
 +        const char *note="NOTE: The tpr file used for this simulation is in an old format, for less memory usage and possibly more performance create a new tpr file with an up to date version of grompp";
 +        if (fplog)
 +        {
 +            fprintf(fplog,"\n%s\n\n",note);
 +        }
 +        if (DDMASTER(dd))
 +        {
 +            fprintf(stderr,"\n%s\n\n",note);
 +        }
 +    }
 +    
 +    dd->reverse_top->bExclRequired = IR_EXCL_FORCES(*ir);
 +    
 +    nexcl = 0;
 +    dd->n_intercg_excl = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +        nexcl += molb->nmol*count_excls(&molt->cgs,&molt->excls,&nexcl_icg);
 +        dd->n_intercg_excl += molb->nmol*nexcl_icg;
 +    }
 +    if (dd->reverse_top->bExclRequired)
 +    {
 +        dd->nbonded_global += nexcl;
 +        if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0 && fplog)
 +        {
 +            fprintf(fplog,"There are %d inter charge-group exclusions,\n"
 +                    "will use an extra communication step for exclusion forces for %s\n",
 +                    dd->n_intercg_excl,eel_names[ir->coulombtype]);
 +        }
 +    }
 +
 +    if (vsite && vsite->n_intercg_vsite > 0)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"There are %d inter charge-group virtual sites,\n"
 +                    "will an extra communication step for selected coordinates and forces\n",
 +            vsite->n_intercg_vsite);
 +        }
 +        init_domdec_vsites(dd,vsite->n_intercg_vsite);
 +    }
 +    
 +    if (dd->bInterCGcons || dd->bInterCGsettles)
 +    {
 +        init_domdec_constraints(dd,mtop,constr);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\n");
 +    }
 +}
 +
 +static inline void add_ifunc(int nral,t_iatom *tiatoms,t_ilist *il)
 +{
 +    t_iatom *liatoms;
 +    int     k;
 +    
 +    if (il->nr+1+nral > il->nalloc)
 +    {
 +        il->nalloc = over_alloc_large(il->nr+1+nral);
 +        srenew(il->iatoms,il->nalloc);
 +    }
 +    liatoms = il->iatoms + il->nr;
 +    for(k=0; k<=nral; k++)
 +    {
 +        liatoms[k] = tiatoms[k];
 +    }
 +    il->nr += 1 + nral;
 +}
 +
 +/* Add a position restraint for atom a_mol of molecule mol in molblock
 + * molb to the local topology: copy the force constants from ip_in and
 + * the reference coordinates from the molblock, then point iatoms[0] at
 + * the new entry in idef->iparams_posres.
 + */
 +static void add_posres(int mol,int a_mol,const gmx_molblock_t *molb,
 +                       t_iatom *iatoms,const t_iparams *ip_in,
 +                       t_idef *idef)
 +{
 +    int n,a_molb;
 +    t_iparams *ip;
 +    
 +    /* This position restraint has not been added yet,
 +     * so its index is the current number of position restraints.
 +     */
 +    n = idef->il[F_POSRES].nr/2;
 +    if (n+1 > idef->iparams_posres_nalloc)
 +    {
 +        idef->iparams_posres_nalloc = over_alloc_dd(n+1);
 +        srenew(idef->iparams_posres,idef->iparams_posres_nalloc);
 +    }
 +    ip = &idef->iparams_posres[n];
 +    /* Copy the force constants */
 +    *ip = ip_in[iatoms[0]];
 +    
 +    /* Get the position restraint coordinates from the molblock */
 +    a_molb = mol*molb->natoms_mol + a_mol;
 +    if (a_molb >= molb->nposres_xA)
 +    {
 +        gmx_incons("Not enough position restraint coordinates");
 +    }
 +    ip->posres.pos0A[XX] = molb->posres_xA[a_molb][XX];
 +    ip->posres.pos0A[YY] = molb->posres_xA[a_molb][YY];
 +    ip->posres.pos0A[ZZ] = molb->posres_xA[a_molb][ZZ];
 +    if (molb->nposres_xB > 0)
 +    {
 +        ip->posres.pos0B[XX] = molb->posres_xB[a_molb][XX];
 +        ip->posres.pos0B[YY] = molb->posres_xB[a_molb][YY];
 +        ip->posres.pos0B[ZZ] = molb->posres_xB[a_molb][ZZ];
 +    }
 +    else
 +    {
 +        /* No B-state coordinates: fall back to the A-state positions */
 +        ip->posres.pos0B[XX] = ip->posres.pos0A[XX];
 +        ip->posres.pos0B[YY] = ip->posres.pos0A[YY];
 +        ip->posres.pos0B[ZZ] = ip->posres.pos0A[ZZ];
 +    }
 +    /* Set the parameter index for idef->iparams_posres */
 +    iatoms[0] = n;
 +}
 +
 +/* Add a flat-bottomed position restraint for atom a_mol of molecule mol
 + * in molblock molb to the local topology; reference positions are taken
 + * from the A-state posres coordinates (flat-bottom has no B state).
 + */
 +static void add_fbposres(int mol,int a_mol,const gmx_molblock_t *molb,
 +                         t_iatom *iatoms,const t_iparams *ip_in,
 +                         t_idef *idef)
 +{
 +    int n,a_molb;
 +    t_iparams *ip;
 +
 +    /* This flat-bottom position restraint has not been added yet,
 +     * so its index is the current number of position restraints.
 +     */
 +    n = idef->il[F_FBPOSRES].nr/2;
 +    if (n+1 > idef->iparams_fbposres_nalloc)
 +    {
 +        idef->iparams_fbposres_nalloc = over_alloc_dd(n+1);
 +        srenew(idef->iparams_fbposres,idef->iparams_fbposres_nalloc);
 +    }
 +    ip = &idef->iparams_fbposres[n];
 +    /* Copy the force constants */
 +    *ip = ip_in[iatoms[0]];
 +
 +    /* Get the position restraint coordinates from the molblock */
 +    a_molb = mol*molb->natoms_mol + a_mol;
 +    if (a_molb >= molb->nposres_xA)
 +    {
 +        gmx_incons("Not enough position restraint coordinates");
 +    }
 +    /* Take reference positions from A position of normal posres */
 +    ip->fbposres.pos0[XX] = molb->posres_xA[a_molb][XX];
 +    ip->fbposres.pos0[YY] = molb->posres_xA[a_molb][YY];
 +    ip->fbposres.pos0[ZZ] = molb->posres_xA[a_molb][ZZ];
 +
 +    /* Note: no B-type for flat-bottom posres */
 +
 +    /* Set the parameter index for idef->iparams_fbposres */
 +    iatoms[0] = n;
 +}
 +
 +/* Add a virtual site interaction to the local topology.
 + * Constructing-atom indices are converted to local indices where the
 + * atoms are home; non-home atoms are stored as -(global+1) and converted
 + * later in make_local_vsites. Also records the per-vsite pbc flag and
 + * recurses for constructing atoms that are themselves non-home vsites.
 + */
 +static void add_vsite(gmx_ga2la_t ga2la,int *index,int *rtil,
 +                      int ftype,int nral,
 +                      gmx_bool bHomeA,int a,int a_gl,int a_mol,
 +                      t_iatom *iatoms,
 +                      t_idef *idef,int **vsite_pbc,int *vsite_pbc_nalloc)
 +{
 +    int  k,ak_gl,vsi,pbc_a_mol;
 +    t_iatom tiatoms[1+MAXATOMLIST],*iatoms_r;
 +    int  j,ftype_r,nral_r;
 +    
 +    /* Copy the type */
 +    tiatoms[0] = iatoms[0];
 +
 +    if (bHomeA)
 +    {
 +        /* We know the local index of the first atom */
 +        tiatoms[1] = a;
 +    }
 +    else
 +    {
 +        /* Convert later in make_local_vsites */
 +        tiatoms[1] = -a_gl - 1;
 +    }
 +    
 +    /* Translate the constructing atoms to local indices when home */
 +    for(k=2; k<1+nral; k++)
 +    {
 +        ak_gl = a_gl + iatoms[k] - a_mol;
 +        if (!ga2la_get_home(ga2la,ak_gl,&tiatoms[k]))
 +        {
 +            /* Copy the global index, convert later in make_local_vsites */
 +            tiatoms[k] = -(ak_gl + 1);
 +        }
 +    }
 +    
 +    /* Add this interaction to the local topology */
 +    add_ifunc(nral,tiatoms,&idef->il[ftype]);
 +    if (vsite_pbc)
 +    {
 +        vsi = idef->il[ftype].nr/(1+nral) - 1;
 +        if (vsi >= vsite_pbc_nalloc[ftype-F_VSITE2])
 +        {
 +            vsite_pbc_nalloc[ftype-F_VSITE2] = over_alloc_large(vsi+1);
 +            srenew(vsite_pbc[ftype-F_VSITE2],vsite_pbc_nalloc[ftype-F_VSITE2]);
 +        }
 +        if (bHomeA)
 +        {
 +            pbc_a_mol = iatoms[1+nral+1];
 +            if (pbc_a_mol < 0)
 +            {
 +                /* The pbc flag is one of the following two options:
 +                 * -2: vsite and all constructing atoms are within the same cg, no pbc
 +                 * -1: vsite and its first constructing atom are in the same cg, do pbc
 +                 */
 +                vsite_pbc[ftype-F_VSITE2][vsi] = pbc_a_mol;
 +            }
 +            else
 +            {
 +                /* Set the pbc atom for this vsite so we can make its pbc 
 +                 * identical to the rest of the atoms in its charge group.
 +                 * Since the order of the atoms does not change within a charge
 +                 * group, we do not need the global to local atom index.
 +                 */
 +                vsite_pbc[ftype-F_VSITE2][vsi] = a + pbc_a_mol - iatoms[1];
 +            }
 +        }
 +        else
 +        {
 +            /* This vsite is non-home (required for recursion),
 +             * and therefore there is no charge group to match pbc with.
 +             * But we always turn on full_pbc to assure that higher order
 +             * recursion works correctly.
 +             */
 +            vsite_pbc[ftype-F_VSITE2][vsi] = -1;
 +        }
 +    }
 +    
 +    if (iatoms[1+nral])
 +    {
 +        /* Check for recursion */
 +        for(k=2; k<1+nral; k++)
 +        {
 +            if ((iatoms[1+nral] & (2<<k)) && (tiatoms[k] < 0))
 +            {
 +                /* This construction atoms is a vsite and not a home atom */
 +                if (gmx_debug_at)
 +                {
 +                    fprintf(debug,"Constructing atom %d of vsite atom %d is a vsite and non-home\n",iatoms[k]+1,a_mol+1);
 +                }
 +                /* Find the vsite construction */
 +                
 +                /* Check all interactions assigned to this atom */
 +                j = index[iatoms[k]];
 +                while (j < index[iatoms[k]+1])
 +                {
 +                    ftype_r = rtil[j++];
 +                    nral_r = NRAL(ftype_r);
 +                    if (interaction_function[ftype_r].flags & IF_VSITE)
 +                    {
 +                        /* Add this vsite (recursion) */
 +                        add_vsite(ga2la,index,rtil,ftype_r,nral_r,
 +                                  FALSE,-1,a_gl+iatoms[k]-iatoms[1],iatoms[k],
 +                                  rtil+j,idef,vsite_pbc,vsite_pbc_nalloc);
 +                        j += 1 + nral_r + 2;
 +                    }
 +                    else
 +                    {
 +                        j += 1 + nral_r;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +static void make_la2lc(gmx_domdec_t *dd)
 +{
 +    int *cgindex,*la2lc,cg,a;
 +    
 +    cgindex = dd->cgindex;
 +    
 +    if (dd->nat_tot > dd->la2lc_nalloc)
 +    {
 +        dd->la2lc_nalloc = over_alloc_dd(dd->nat_tot);
 +        snew(dd->la2lc,dd->la2lc_nalloc);
 +    }
 +    la2lc = dd->la2lc;
 +    
 +    /* Make the local atom to local cg index */
 +    for(cg=0; cg<dd->ncg_tot; cg++)
 +    {
 +        for(a=cgindex[cg]; a<cgindex[cg+1]; a++)
 +        {
 +            la2lc[a] = cg;
 +        }
 +    }
 +}
 +
 +static real dd_dist2(t_pbc *pbc_null,rvec *cg_cm,const int *la2lc,int i,int j)
 +{
 +    rvec dx;
 +    
 +    if (pbc_null)
 +    {
 +        pbc_dx_aiuc(pbc_null,cg_cm[la2lc[i]],cg_cm[la2lc[j]],dx);
 +    }
 +    else
 +    {
 +        rvec_sub(cg_cm[la2lc[i]],cg_cm[la2lc[j]],dx);
 +    }
 +    
 +    return norm2(dx);
 +}
 +
 +/* Append the nsrc t_blocka block structures in src to *dest.
 + * The appended index entries are shifted by dest's running atom count,
 + * so the merged structure remains a valid t_blocka.
 + */
 +static void combine_blocka(t_blocka *dest,const t_blocka *src,int nsrc)
 +{
 +    int ni,na,s,i;
 +
 +    /* Final index size and total number of atoms to append */
 +    ni = src[nsrc-1].nr;
 +    na = 0;
 +    for(s=0; s<nsrc; s++)
 +    {
 +        na += src[s].nra;
 +    }
 +    if (ni + 1 > dest->nalloc_index)
 +    {
 +        dest->nalloc_index = over_alloc_large(ni+1);
 +        srenew(dest->index,dest->nalloc_index);
 +    }
 +    if (dest->nra + na > dest->nalloc_a)
 +    {
 +        dest->nalloc_a = over_alloc_large(dest->nra+na);
 +        srenew(dest->a,dest->nalloc_a);
 +    }
 +    for(s=0; s<nsrc; s++)
 +    {
 +        /* Copy the new index entries, shifted by the current atom count */
 +        for(i=dest->nr+1; i<src[s].nr+1; i++)
 +        {
 +            dest->index[i] = dest->nra + src[s].index[i];
 +        }
 +        for(i=0; i<src[s].nra; i++)
 +        {
 +            dest->a[dest->nra+i] = src[s].a[i];
 +        }
 +        dest->nr   = src[s].nr;
 +        dest->nra += src[s].nra;
 +    }
 +}
 +
 +/* Append the nsrc t_idef structures in src to *dest,
 + * virtual sites need special attention, as pbc info differs per vsite.
 + * Position restraint parameter indices are renumbered so they stay
 + * consistent with the merged iparams_posres array.
 + */
 +static void combine_idef(t_idef *dest,const t_idef *src,int nsrc,
 +                         gmx_vsite_t *vsite,int ***vsite_pbc_t)
 +{
 +    int ftype,n,s,i;
 +    t_ilist *ild;
 +    const t_ilist *ils;
 +    gmx_bool vpbc;
 +    int nral1=0,ftv=0;
 +
 +    for(ftype=0; ftype<F_NRE; ftype++)
 +    {
 +        /* Total number of iatoms entries to append for this type */
 +        n = 0;
 +        for(s=0; s<nsrc; s++)
 +        {
 +            n += src[s].il[ftype].nr;
 +        }
 +        if (n > 0)
 +        {
 +            ild = &dest->il[ftype];
 +
 +            if (ild->nr + n > ild->nalloc)
 +            {
 +                ild->nalloc = over_alloc_large(ild->nr+n);
 +                srenew(ild->iatoms,ild->nalloc);
 +            }
 +
 +            /* Does this vsite type carry per-interaction pbc info? */
 +            vpbc = ((interaction_function[ftype].flags & IF_VSITE) &&
 +                    vsite->vsite_pbc_loc != NULL);
 +            if (vpbc)
 +            {
 +                nral1 = 1 + NRAL(ftype);
 +                ftv = ftype - F_VSITE2;
 +                if ((ild->nr + n)/nral1 > vsite->vsite_pbc_loc_nalloc[ftv])
 +                {
 +                    vsite->vsite_pbc_loc_nalloc[ftv] =
 +                        over_alloc_large((ild->nr + n)/nral1);
 +                    srenew(vsite->vsite_pbc_loc[ftv],
 +                           vsite->vsite_pbc_loc_nalloc[ftv]);
 +                }
 +            }
 +
 +            for(s=0; s<nsrc; s++)
 +            {
 +                ils = &src[s].il[ftype];
 +                for(i=0; i<ils->nr; i++)
 +                {
 +                    ild->iatoms[ild->nr+i] = ils->iatoms[i];
 +                }
 +                if (vpbc)
 +                {
 +                    /* Merge the per-vsite pbc flags alongside the iatoms */
 +                    for(i=0; i<ils->nr; i+=nral1)
 +                    {
 +                        vsite->vsite_pbc_loc[ftv][(ild->nr+i)/nral1] =
 +                            vsite_pbc_t[s][ftv][i/nral1];
 +                    }
 +                }
 +                
 +                ild->nr += ils->nr;
 +            }
 +        }
 +    }
 +
 +    /* Position restraints need an additional treatment */
 +    if (dest->il[F_POSRES].nr > 0)
 +    {
 +        n = dest->il[F_POSRES].nr/2;
 +        if (n > dest->iparams_posres_nalloc)
 +        {
 +            dest->iparams_posres_nalloc = over_alloc_large(n);
 +            srenew(dest->iparams_posres,dest->iparams_posres_nalloc);
 +        }
 +        /* Set n to the number of original position restraints in dest */
 +        for(s=0; s<nsrc; s++)
 +        {
 +            n -= src[s].il[F_POSRES].nr/2;
 +        }
 +        for(s=0; s<nsrc; s++)
 +        {
 +            for(i=0; i<src[s].il[F_POSRES].nr/2; i++)
 +            {
 +                /* Correct the index into iparams_posres */
 +                dest->il[F_POSRES].iatoms[n*2] = n;
 +                /* Copy the position restraint force parameters */
 +                dest->iparams_posres[n] = src[s].iparams_posres[i];
 +                n++;
 +            }
 +        }
 +    }
 +}
 +
 +/* This function looks up and assigns bonded interactions for zone iz.
 + * With thread parallelizing each thread acts on a different atom range:
 + * at_start to at_end.
 + */
 +static int make_bondeds_zone(gmx_domdec_t *dd,
 +                             const gmx_domdec_zones_t *zones,
 +                             const gmx_molblock_t *molb,
 +                             gmx_bool bRCheckMB,ivec rcheck,gmx_bool bRCheck2B,
 +                             real rc2,
 +                             int *la2lc,t_pbc *pbc_null,rvec *cg_cm,
 +                             const t_iparams *ip_in,
 +                             t_idef *idef,gmx_vsite_t *vsite,
 +                             int **vsite_pbc,
 +                             int *vsite_pbc_nalloc,
 +                             int iz,int nzone,
 +                             int at_start,int at_end)
 +{
 +    int i,i_gl,mb,mt,mol,i_mol,j,ftype,nral,d,k;
 +    int *index,*rtil;
 +    t_iatom *iatoms,tiatoms[1+MAXATOMLIST];
 +    gmx_bool bBCheck,bUse,bLocal;
 +    ivec k_zero,k_plus;
 +    gmx_ga2la_t ga2la;
 +    int  a_loc;
 +    int  kz;
 +    int  nizone;
 +    const gmx_domdec_ns_ranges_t *izone;
 +    gmx_reverse_top_t *rt;
 +    int nbonded_local;
 +
 +    nizone = zones->nizone;
 +    izone  = zones->izone;
 +    
 +    rt = dd->reverse_top;
 +    
 +    bBCheck = rt->bBCheck;
 +    
 +    nbonded_local = 0;
 +    
 +    ga2la = dd->ga2la;
 +
 +    for(i=at_start; i<at_end; i++)
 +    {
 +        /* Get the global atom number */
 +        i_gl = dd->gatindex[i];
 +        global_atomnr_to_moltype_ind(rt,i_gl,&mb,&mt,&mol,&i_mol);
 +        /* Check all interactions assigned to this atom */
 +        index = rt->ril_mt[mt].index;
 +        rtil  = rt->ril_mt[mt].il;
 +        j = index[i_mol];
 +        while (j < index[i_mol+1])
 +        {
 +            ftype  = rtil[j++];
 +            iatoms = rtil + j;
 +            nral = NRAL(ftype);
 +            if (ftype == F_SETTLE)
 +            {
 +                /* Settles are only in the reverse top when they
 +                 * operate within a charge group. So we can assign
 +                 * them without checks. We do this only for performance
 +                 * reasons; it could be handled by the code below.
 +                 */
 +                if (iz == 0)
 +                {
 +                    /* Home zone: add this settle to the local topology */
 +                    tiatoms[0] = iatoms[0];
 +                    tiatoms[1] = i;
 +                    tiatoms[2] = i + iatoms[2] - iatoms[1];
 +                    tiatoms[3] = i + iatoms[3] - iatoms[1];
 +                    add_ifunc(nral,tiatoms,&idef->il[ftype]);
 +                    nbonded_local++;
 +                }
 +                j += 1 + nral;
 +            }
 +            else if (interaction_function[ftype].flags & IF_VSITE)
 +            {
 +                /* The vsite construction goes where the vsite itself is */
 +                if (iz == 0)
 +                {
 +                    add_vsite(dd->ga2la,index,rtil,ftype,nral,
 +                              TRUE,i,i_gl,i_mol,
 +                              iatoms,idef,vsite_pbc,vsite_pbc_nalloc);
 +                }
 +                j += 1 + nral + 2;
 +            }
 +            else
 +            {
 +                /* Copy the type */
 +                tiatoms[0] = iatoms[0];
 +
 +                if (nral == 1)
 +                {
 +                    /* Assign single-body interactions to the home zone */
 +                    if (iz == 0)
 +                    {
 +                        bUse = TRUE;
 +                            tiatoms[1] = i;
 +                            if (ftype == F_POSRES)
 +                            {
 +                                add_posres(mol,i_mol,&molb[mb],tiatoms,ip_in,
 +                                           idef);
 +                            }
 +                            else if (ftype == F_FBPOSRES)
 +                            {
 +                                add_fbposres(mol,i_mol,&molb[mb],tiatoms,ip_in,
 +                                             idef);
 +                            }
 +                    }
 +                    else
 +                    {
 +                        bUse = FALSE;
 +                    }
 +                }
 +                else if (nral == 2)
 +                {
 +                    /* This is a two-body interaction, we can assign
 +                     * analogous to the non-bonded assignments.
 +                     */
 +                    if (!ga2la_get(ga2la,i_gl+iatoms[2]-i_mol,&a_loc,&kz))
 +                    {
 +                        bUse = FALSE;
 +                    }
 +                    else
 +                    {
 +                        if (kz >= nzone)
 +                        {
 +                            kz -= nzone;
 +                        }
 +                        /* Check zone interaction assignments */
 +                        bUse = ((iz < nizone && iz <= kz &&
 +                                 izone[iz].j0 <= kz && kz < izone[iz].j1) ||
 +                                (kz < nizone && iz >  kz &&
 +                                 izone[kz].j0 <= iz && iz < izone[kz].j1));
 +                        if (bUse)
 +                        {
 +                            tiatoms[1] = i;
 +                            tiatoms[2] = a_loc;
 +                            /* If necessary check the cgcm distance */
 +                            if (bRCheck2B &&
 +                                dd_dist2(pbc_null,cg_cm,la2lc,
 +                                         tiatoms[1],tiatoms[2]) >= rc2)
 +                            {
 +                                bUse = FALSE;
 +                            }
 +                        }
 +                    }
 +                }
 +                else
 +                {
 +                    /* Assign this multi-body bonded interaction to
 +                     * the local node if we have all the atoms involved
 +                     * (local or communicated) and the minimum zone shift
 +                     * in each dimension is zero, for dimensions
 +                     * with 2 DD cells an extra check may be necessary.
 +                     */
 +                    bUse = TRUE;
 +                    clear_ivec(k_zero);
 +                    clear_ivec(k_plus);
 +                    for(k=1; k<=nral && bUse; k++)
 +                    {
 +                        bLocal = ga2la_get(ga2la,i_gl+iatoms[k]-i_mol,
 +                                           &a_loc,&kz);
 +                        if (!bLocal || kz >= zones->n)
 +                        {
 +                            /* We do not have this atom of this interaction
 +                             * locally, or it comes from more than one cell
 +                             * away.
 +                             */
 +                            bUse = FALSE;
 +                        }
 +                        else
 +                        {
 +                            tiatoms[k] = a_loc;
 +                            for(d=0; d<DIM; d++)
 +                            {
 +                                if (zones->shift[kz][d] == 0)
 +                                {
 +                                    k_zero[d] = k;
 +                                }
 +                                else
 +                                {
 +                                    k_plus[d] = k;
 +                                }
 +                            }
 +                        }
 +                    }
 +                    bUse = (bUse &&
 +                            k_zero[XX] && k_zero[YY] && k_zero[ZZ]);
 +                    if (bRCheckMB)
 +                    {
 +                        for(d=0; (d<DIM && bUse); d++)
 +                        {
 +                            /* Check if the cg_cm distance falls within
 +                             * the cut-off to avoid possible multiple
 +                             * assignments of bonded interactions.
 +                             */
 +                            if (rcheck[d] && 
 +                                k_plus[d] &&
 +                                dd_dist2(pbc_null,cg_cm,la2lc,
 +                                         tiatoms[k_zero[d]],tiatoms[k_plus[d]]) >= rc2)
 +                            {
 +                                bUse = FALSE;
 +                            }
 +                        }
 +                    }
 +                }
 +                if (bUse)
 +                {
 +                    /* Add this interaction to the local topology */
 +                    add_ifunc(nral,tiatoms,&idef->il[ftype]);
 +                    /* Sum so we can check in global_stat
 +                     * if we have everything.
 +                     */
 +                    if (bBCheck ||
 +                        !(interaction_function[ftype].flags & IF_LIMZERO))
 +                    {
 +                        nbonded_local++;
 +                    }
 +                }
 +                j += 1 + nral;
 +            }
 +        }
 +    }
 +
 +    return nbonded_local;
 +}
 +
 +/* Mark zone iz as having no exclusions: every atom in the zone gets an
 + * index entry equal to the current end of the exclusion list, so each
 + * atom's exclusion range is empty.
 + */
 +static void set_no_exclusions_zone(gmx_domdec_t *dd,gmx_domdec_zones_t *zones,
 +                                   int iz,t_blocka *lexcls)
 +{
 +    int  a0,a1,a;
 +    
 +    a0 = dd->cgindex[zones->cg_range[iz]];
 +    a1 = dd->cgindex[zones->cg_range[iz+1]];
 +
 +    /* Start at a0+1: only the range end boundaries need setting here */
 +    for(a=a0+1; a<a1+1; a++)
 +    {
 +        lexcls->index[a] = lexcls->nra;
 +    }
 +}
 +
 +/* Assign exclusions for charge groups cg_start..cg_end of zone iz to the
 + * local exclusion list lexcls. Intra-cg exclusions are stored only for the
 + * home zone (iz==0); inter-cg exclusions are stored when the partner atom
 + * is local, lies in the j-range of this i-zone and, with bRCheck, is
 + * within the cut-off rc2. Returns the number of exclusion pairs stored
 + * (each pair counted once), used as a consistency check.
 + */
 +static int make_exclusions_zone(gmx_domdec_t *dd,gmx_domdec_zones_t *zones,
 +                                const gmx_moltype_t *moltype,
 +                                gmx_bool bRCheck,real rc2,
 +                                int *la2lc,t_pbc *pbc_null,rvec *cg_cm,
 +                                const int *cginfo,
 +                                t_blocka *lexcls,
 +                                int iz,
 +                                int cg_start,int cg_end)
 +{
 +    int  nizone,n,count,jla0,jla1,jla;
 +    int  cg,la0,la1,la,a_gl,mb,mt,mol,a_mol,j,aj_mol;
 +    const t_blocka *excls;
 +    gmx_ga2la_t ga2la;
 +    int  a_loc;
 +    int  cell;
 +
 +    ga2la = dd->ga2la;
 +
 +    /* Local atom range of the j-charge-groups this i-zone interacts with */
 +    jla0 = dd->cgindex[zones->izone[iz].jcg0];
 +    jla1 = dd->cgindex[zones->izone[iz].jcg1];
 +
 +    /* We set the end index, but note that we might not start at zero here */
 +    lexcls->nr = dd->cgindex[cg_end];
 +
 +    n = lexcls->nra;
 +    count = 0;
 +    for(cg=cg_start; cg<cg_end; cg++)
 +    {
 +        /* Here we assume the number of exclusions in one charge group
 +         * is never larger than 1000.
 +         */
 +        if (n+1000 > lexcls->nalloc_a)
 +        {
 +            lexcls->nalloc_a = over_alloc_large(n+1000);
 +            srenew(lexcls->a,lexcls->nalloc_a);
 +        }
 +        la0 = dd->cgindex[cg];
 +        la1 = dd->cgindex[cg+1];
 +        if (GET_CGINFO_EXCL_INTER(cginfo[cg]) ||
 +            !GET_CGINFO_EXCL_INTRA(cginfo[cg]))
 +        {
 +            /* Copy the exclusions from the global top */
 +            for(la=la0; la<la1; la++) {
 +                lexcls->index[la] = n;
 +                a_gl = dd->gatindex[la];
 +                global_atomnr_to_moltype_ind(dd->reverse_top,a_gl,&mb,&mt,&mol,&a_mol);
 +                excls = &moltype[mt].excls;
 +                for(j=excls->index[a_mol]; j<excls->index[a_mol+1]; j++)
 +                {
 +                    aj_mol = excls->a[j];
 +                    /* This computation of jla is only correct intra-cg */
 +                    jla = la + aj_mol - a_mol;
 +                    if (jla >= la0 && jla < la1)
 +                    {
 +                        /* This is an intra-cg exclusion. We can skip
 +                         *  the global indexing and distance checking.
 +                         */
 +                        /* Intra-cg exclusions are only required
 +                         * for the home zone.
 +                         */
 +                        if (iz == 0)
 +                        {
 +                            lexcls->a[n++] = jla;
 +                            /* Check to avoid double counts */
 +                            if (jla > la)
 +                            {
 +                                count++;
 +                            }
 +                        }
 +                    }
 +                    else
 +                    {
 +                        /* This is a inter-cg exclusion */
 +                        /* Since exclusions are pair interactions,
 +                         * just like non-bonded interactions,
 +                         * they can be assigned properly up
 +                         * to the DD cutoff (not cutoff_min as
 +                         * for the other bonded interactions).
 +                         */
 +                        if (ga2la_get(ga2la,a_gl+aj_mol-a_mol,&jla,&cell))
 +                        {
 +                            if (iz == 0 && cell == 0)
 +                            {
 +                                lexcls->a[n++] = jla;
 +                                /* Check to avoid double counts */
 +                                if (jla > la)
 +                                {
 +                                    count++;
 +                                }
 +                            }
 +                            else if (jla >= jla0 && jla < jla1 &&
 +                                     (!bRCheck ||
 +                                      dd_dist2(pbc_null,cg_cm,la2lc,la,jla) < rc2))
 +                            {
 +                                /* jla > la, since jla0 > la */
 +                                lexcls->a[n++] = jla;
 +                                count++;
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* There are no inter-cg excls and this cg is self-excluded.
 +             * These exclusions are only required for zone 0,
 +             * since other zones do not see themselves.
 +             */
 +            if (iz == 0)
 +            {
 +                /* Exclude all atom pairs within this charge group */
 +                for(la=la0; la<la1; la++)
 +                {
 +                    lexcls->index[la] = n;
 +                    for(j=la0; j<la1; j++)
 +                    {
 +                        lexcls->a[n++] = j;
 +                    }
 +                }
 +                /* Unique pairs in a fully self-excluded group */
 +                count += ((la1 - la0)*(la1 - la0 - 1))/2;
 +            }
 +            else
 +            {
 +                /* We don't need exclusions for this cg */
 +                for(la=la0; la<la1; la++)
 +                {
 +                    lexcls->index[la] = n;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Close the last range and store the new total */
 +    lexcls->index[lexcls->nr] = n;
 +    lexcls->nra = n;
 +
 +    return count;
 +}
 +
 +/* Ensure the index array of block ba can hold nindex_max+1 entries */
 +static void check_alloc_index(t_blocka *ba,int nindex_max)
 +{
 +    if (nindex_max+1 > ba->nalloc_index)
 +    {
 +        ba->nalloc_index = over_alloc_dd(nindex_max+1);
 +        srenew(ba->index,ba->nalloc_index);
 +    }
 +}
 +
 +/* Ensure the index arrays of the local exclusion list and of every
 + * thread-local exclusion buffer can hold all atoms up to the end of
 + * the last i-zone.
 + */
 +static void check_exclusions_alloc(gmx_domdec_t *dd,gmx_domdec_zones_t *zones,
 +                                   t_blocka *lexcls)
 +{
 +    int nr;
 +    int thread;
 +
 +    nr = dd->cgindex[zones->izone[zones->nizone-1].cg1];
 +
 +    check_alloc_index(lexcls,nr);
 +
 +    /* Thread 0 writes directly into lexcls, so start at thread 1 */
 +    for(thread=1; thread<dd->reverse_top->nthread; thread++)
 +    {
 +        check_alloc_index(&dd->reverse_top->excl_thread[thread],nr);
 +    }
 +}
 +
 +/* Finalize the local exclusion list: set its size, and when there are no
 + * inter-cg exclusions close off the indices for the non-home atoms and
 + * shrink nr to the home atom count.
 + */
 +static void finish_local_exclusions(gmx_domdec_t *dd,gmx_domdec_zones_t *zones,
 +                                    t_blocka *lexcls)
 +{
 +    int la0,la;
 +
 +    lexcls->nr = dd->cgindex[zones->izone[zones->nizone-1].cg1];
 +
 +    if (dd->n_intercg_excl == 0)
 +    {
 +        /* There are no exclusions involving non-home charge groups,
 +         * but we need to set the indices for neighborsearching.
 +         */
 +        la0 = dd->cgindex[zones->izone[0].cg1];
 +        for(la=la0; la<lexcls->nr; la++)
 +        {
 +            lexcls->index[la] = lexcls->nra;
 +        }
 +
 +        /* nr is only used to loop over the exclusions for Ewald and RF,
 +         * so we can set it to the number of home atoms for efficiency.
 +         */
 +        lexcls->nr = dd->cgindex[zones->izone[0].cg1];
 +    }
 +}
 +
 +/* Reset the interaction counts of all function types in idef;
 + * the iatoms allocations are kept for reuse.
 + */
 +static void clear_idef(t_idef *idef)
 +{
 +    int  ftype;
 +
 +     /* Clear the counts */
 +    for(ftype=0; ftype<F_NRE; ftype++)
 +    {
 +        idef->il[ftype].nr = 0;
 +    }
 +}
 +
 +/* Generate the local bonded interactions and local exclusions for all
 + * zones, splitting the charge groups of each zone over the reverse-top
 + * threads and combining the per-thread idef/exclusion buffers afterwards.
 + * Returns the number of locally assigned bonded interactions; the number
 + * of exclusions is returned in *excl_count.
 + */
 +static int make_local_bondeds_excls(gmx_domdec_t *dd,
 +                                    gmx_domdec_zones_t *zones,
 +                                    const gmx_mtop_t *mtop,
 +                                    const int *cginfo,
 +                                    gmx_bool bRCheckMB,ivec rcheck,gmx_bool bRCheck2B,
 +                                    real rc,
 +                                    int *la2lc,t_pbc *pbc_null,rvec *cg_cm,
 +                                    t_idef *idef,gmx_vsite_t *vsite,
 +                                    t_blocka *lexcls,int *excl_count)
 +{
 +    int  nzone_bondeds,nzone_excl;
 +    int  iz,cg0,cg1;
 +    real rc2;
 +    int  nbonded_local;
 +    int  thread;
 +    gmx_reverse_top_t *rt;
 +
 +    if (dd->reverse_top->bMultiCGmols)
 +    {
 +        nzone_bondeds = zones->n;
 +    }
 +    else
 +    {
 +        /* Only single charge group molecules, so interactions don't
 +         * cross zone boundaries and we only need to assign in the home zone.
 +         */
 +        nzone_bondeds = 1;
 +    }
 +
 +    if (dd->n_intercg_excl > 0)
 +    {
 +        /* We only use exclusions from i-zones to i- and j-zones */
 +        nzone_excl = zones->nizone;
 +    }
 +    else
 +    {
 +        /* There are no inter-cg exclusions and only zone 0 sees itself */
 +        nzone_excl = 1;
 +    }
 +
 +    check_exclusions_alloc(dd,zones,lexcls);
 +    
 +    rt = dd->reverse_top;
 +
 +    rc2 = rc*rc;
 +    
 +    /* Clear the counts */
 +    clear_idef(idef);
 +    nbonded_local = 0;
 +
 +    lexcls->nr    = 0;
 +    lexcls->nra   = 0;
 +    *excl_count   = 0;
 +
 +    for(iz=0; iz<nzone_bondeds; iz++)
 +    {
 +        cg0 = zones->cg_range[iz];
 +        cg1 = zones->cg_range[iz+1];
 +
 +#pragma omp parallel for num_threads(rt->nthread) schedule(static)
 +        for(thread=0; thread<rt->nthread; thread++)
 +        {
 +            int cg0t,cg1t;
 +            t_idef *idef_t;
 +            int ftype;
 +            int **vsite_pbc;
 +            int *vsite_pbc_nalloc;
 +            t_blocka *excl_t;
 +
 +            /* Static partitioning of this zone's cg range over threads */
 +            cg0t = cg0 + ((cg1 - cg0)* thread   )/rt->nthread;
 +            cg1t = cg0 + ((cg1 - cg0)*(thread+1))/rt->nthread;
 +
 +            if (thread == 0)
 +            {
 +                idef_t = idef;
 +            }
 +            else
 +            {
 +                idef_t = &rt->idef_thread[thread];
 +                clear_idef(idef_t);
 +            }
 +
- +     /* For gmx_vsite_t everything 0 should work (without pbc) */
- +     snew(vsite,1);
-     /* For gmx_vsite_t everything 0 should work (without pbc) */
-     snew(vsite,1);
++            if (vsite && vsite->bHaveChargeGroups && vsite->n_intercg_vsite > 0)
 +            {
 +                if (thread == 0)
 +                {
 +                    vsite_pbc        = vsite->vsite_pbc_loc;
 +                    vsite_pbc_nalloc = vsite->vsite_pbc_loc_nalloc;
 +                }
 +                else
 +                {
 +                    vsite_pbc        = rt->vsite_pbc[thread];
 +                    vsite_pbc_nalloc = rt->vsite_pbc_nalloc[thread];
 +                }
 +            }
 +            else
 +            {
 +                vsite_pbc        = NULL;
 +                vsite_pbc_nalloc = NULL;
 +            }
 +
 +            rt->nbonded_thread[thread] =
 +                make_bondeds_zone(dd,zones,
 +                                  mtop->molblock,
 +                                  bRCheckMB,rcheck,bRCheck2B,rc2,
 +                                  la2lc,pbc_null,cg_cm,idef->iparams,
 +                                  idef_t,
 +                                  vsite,vsite_pbc,vsite_pbc_nalloc,
 +                                  iz,zones->n,
 +                                  dd->cgindex[cg0t],dd->cgindex[cg1t]);
 +
 +            if (iz < nzone_excl)
 +            {
 +                if (thread == 0)
 +                {
 +                    excl_t = lexcls;
 +                }
 +                else
 +                {
 +                    excl_t = &rt->excl_thread[thread];
 +                    excl_t->nr  = 0;
 +                    excl_t->nra = 0;
 +                }
 +
 +                rt->excl_count_thread[thread] =
 +                    make_exclusions_zone(dd,zones,
 +                                         mtop->moltype,bRCheck2B,rc2,
 +                                         la2lc,pbc_null,cg_cm,cginfo,
 +                                         excl_t,
 +                                         iz,
 +                                         cg0t,cg1t);
 +            }
 +        }
 +
 +        /* Merge the thread-local results (thread 0 wrote in place) */
 +        if (rt->nthread > 1)
 +        {
 +            combine_idef(idef,rt->idef_thread+1,rt->nthread-1,
 +                         vsite,rt->vsite_pbc+1);
 +        }
 +
 +        for(thread=0; thread<rt->nthread; thread++)
 +        {
 +            nbonded_local += rt->nbonded_thread[thread];
 +        }
 +
 +        if (iz < nzone_excl)
 +        {
 +            if (rt->nthread > 1)
 +            {
 +                combine_blocka(lexcls,rt->excl_thread+1,rt->nthread-1);
 +            }
 +
 +            for(thread=0; thread<rt->nthread; thread++)
 +            {
 +                *excl_count += rt->excl_count_thread[thread];
 +            }
 +        }
 +    }
 +
 +    /* Some zones might not have exclusions, but some code still needs to
 +     * loop over the index, so we set the indices here.
 +     */
 +    for(iz=nzone_excl; iz<zones->nizone; iz++)
 +    {
 +        set_no_exclusions_zone(dd,zones,iz,lexcls);
 +    }
 +
 +    finish_local_exclusions(dd,zones,lexcls);
 +    if (debug)
 +    {
 +        fprintf(debug,"We have %d exclusions, check count %d\n",
 +                lexcls->nra,*excl_count);
 +    }
 +    
 +    return nbonded_local;
 +}
 +
 +/* Set up the local charge-group block; it references the dd index
 + * array directly, no data is copied.
 + */
 +void dd_make_local_cgs(gmx_domdec_t *dd,t_block *lcgs)
 +{
 +  lcgs->nr    = dd->ncg_tot;
 +  lcgs->index = dd->cgindex;
 +}
 +
 +/* Build the local topology ltop for this domain: local charge groups,
 + * bonded interactions and exclusions, applying cg_cm distance checks
 + * where the decomposition could otherwise assign an interaction to
 + * more than one domain.
 + */
 +void dd_make_local_top(FILE *fplog,
 +                       gmx_domdec_t *dd,gmx_domdec_zones_t *zones,
 +                       int npbcdim,matrix box,
 +                       rvec cellsize_min,ivec npulse,
 +                       t_forcerec *fr,
 +                       rvec *cgcm_or_x,
 +                       gmx_vsite_t *vsite,
 +                       gmx_mtop_t *mtop,gmx_localtop_t *ltop)
 +{
 +    gmx_bool bUniqueExcl,bRCheckMB,bRCheck2B,bRCheckExcl;
 +    real rc=-1;
 +    ivec rcheck;
 +    int  d,nexcl;
 +    t_pbc pbc,*pbc_null=NULL;
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Making local topology\n");
 +    }
 +    
 +    dd_make_local_cgs(dd,&ltop->cgs);
 +    
 +    bRCheckMB   = FALSE;
 +    bRCheck2B   = FALSE;
 +    bRCheckExcl = FALSE;
 +    
 +    if (dd->reverse_top->bMultiCGmols)
 +    {
 +        /* We need to check to which cell bondeds should be assigned */
 +        rc = dd_cutoff_twobody(dd);
 +        if (debug)
 +        {
 +            fprintf(debug,"Two-body bonded cut-off distance is %g\n",rc);
 +        }
 +        
 +        /* Should we check cg_cm distances when assigning bonded interactions? */
 +        for(d=0; d<DIM; d++)
 +        {
 +            rcheck[d] = FALSE;
 +            /* Only need to check for dimensions where the part of the box
 +             * that is not communicated is smaller than the cut-off.
 +             */
 +            if (d < npbcdim && dd->nc[d] > 1 &&
 +                (dd->nc[d] - npulse[d])*cellsize_min[d] < 2*rc)
 +            {
 +                if (dd->nc[d] == 2)
 +                {
 +                    rcheck[d] = TRUE;
 +                    bRCheckMB = TRUE;
 +                }
 +                /* Check for interactions between two atoms,
 +                 * where we can allow interactions up to the cut-off,
 +                 * instead of up to the smallest cell dimension.
 +                 */
 +                bRCheck2B = TRUE;
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug,
 +                        "dim %d cellmin %f bonded rcheck[%d] = %d, bRCheck2B = %d\n",
 +                        d,cellsize_min[d],d,rcheck[d],bRCheck2B);
 +            }
 +        }
 +        if (dd->reverse_top->bExclRequired)
 +        {
 +            bRCheckExcl = bRCheck2B;
 +        }
 +        else
 +        {
 +            /* If we don't have forces on exclusions,
 +             * we don't care about exclusions being assigned mulitple times.
 +             */
 +            bRCheckExcl = FALSE;
 +        }
 +        if (bRCheckMB || bRCheck2B)
 +        {
 +            /* Distance checks need the local-to-cg mapping and possibly PBC */
 +            make_la2lc(dd);
 +            if (fr->bMolPBC)
 +            {
 +                set_pbc_dd(&pbc,fr->ePBC,dd,TRUE,box);
 +                pbc_null = &pbc;
 +            }
 +            else
 +            {
 +                pbc_null = NULL;
 +            }
 +        }
 +    }
 +        
 +    dd->nbonded_local =
 +        make_local_bondeds_excls(dd,zones,mtop,fr->cginfo,
 +                                 bRCheckMB,rcheck,bRCheck2B,rc,
 +                                 dd->la2lc,
 +                                 pbc_null,cgcm_or_x,
 +                                 &ltop->idef,vsite,
 +                                 &ltop->excls,&nexcl);
 +    
 +    /* The ilist is not sorted yet,
 +     * we can only do this when we have the charge arrays.
 +     */
 +    ltop->idef.ilsort = ilsortUNKNOWN;
 +    
 +    if (dd->reverse_top->bExclRequired)
 +    {
 +        dd->nbonded_local += nexcl;
 +
 +        forcerec_set_excl_load(fr,ltop,NULL);
 +    }
 +    
 +    ltop->atomtypes  = mtop->atomtypes;
 +    
 +    /* For an error message only */
 +    dd->reverse_top->err_top_global = mtop;
 +    dd->reverse_top->err_top_local  = ltop;
 +}
 +
 +/* Sort the local interaction lists for free-energy handling: if the
 + * reverse topology says no free-energy sorting is needed, just mark
 + * the idef; otherwise sort using the A- and B-state charges.
 + */
 +void dd_sort_local_top(gmx_domdec_t *dd,t_mdatoms *mdatoms,
 +                       gmx_localtop_t *ltop)
 +{
 +    if (dd->reverse_top->ilsort == ilsortNO_FE)
 +    {
 +        ltop->idef.ilsort = ilsortNO_FE;
 +    }
 +    else
 +    {
 +        gmx_sort_ilist_fe(&ltop->idef,mdatoms->chargeA,mdatoms->chargeB);
 +    }
 +}
 +
 +/* Allocate a local topology and share the global force-field parameter
 + * data by pointer; the interaction lists start empty and unsorted.
 + * The caller owns the returned structure.
 + */
 +gmx_localtop_t *dd_init_local_top(gmx_mtop_t *top_global)
 +{
 +    gmx_localtop_t *top;
 +    int i;
 +    
 +    snew(top,1);
 +    
 +    /* These are pointers/values shared with the global topology */
 +    top->idef.ntypes   = top_global->ffparams.ntypes;
 +    top->idef.atnr     = top_global->ffparams.atnr;
 +    top->idef.functype = top_global->ffparams.functype;
 +    top->idef.iparams  = top_global->ffparams.iparams;
 +    top->idef.fudgeQQ  = top_global->ffparams.fudgeQQ;
 +    top->idef.cmap_grid= top_global->ffparams.cmap_grid;
 +    
 +    for(i=0; i<F_NRE; i++)
 +    {
 +        top->idef.il[i].iatoms = NULL;
 +        top->idef.il[i].nalloc = 0;
 +    }
 +    top->idef.ilsort   = ilsortUNKNOWN;
 +    
 +    return top;
 +}
 +
 +/* Initialize the local state from the global state: the DD master
 + * broadcasts the state flags and array dimensions, then random-number
 + * generator storage is set up where the state flags require it.
 + */
 +void dd_init_local_state(gmx_domdec_t *dd,
 +                         t_state *state_global,t_state *state_local)
 +{
 +    int buf[NITEM_DD_INIT_LOCAL_STATE];
 +    
 +    if (DDMASTER(dd))
 +    {
 +        buf[0] = state_global->flags;
 +        buf[1] = state_global->ngtc;
 +        buf[2] = state_global->nnhpres;
 +        buf[3] = state_global->nhchainlength;
 +        buf[4] = state_global->dfhist.nlambda;
 +    }
 +    dd_bcast(dd,NITEM_DD_INIT_LOCAL_STATE*sizeof(int),buf);
 +
 +    init_state(state_local,0,buf[1],buf[2],buf[3],buf[4]);
 +    state_local->flags = buf[0];
 +    
 +    /* With Langevin Dynamics we need to make proper storage space
 +     * in the global and local state for the random numbers.
 +     */
 +    if (state_local->flags & (1<<estLD_RNG))
 +    {
 +        if (DDMASTER(dd) && state_global->nrngi > 1)
 +        {
 +            state_global->nrng = dd->nnodes*gmx_rng_n();
 +            srenew(state_global->ld_rng,state_global->nrng);
 +        }
 +        state_local->nrng = gmx_rng_n();
 +        snew(state_local->ld_rng,state_local->nrng);
 +    }
 +    if (state_local->flags & (1<<estLD_RNGI))
 +    {
 +        if (DDMASTER(dd) && state_global->nrngi > 1)
 +        {
 +            state_global->nrngi = dd->nnodes;
 +            srenew(state_global->ld_rngi,state_global->nrngi);
 +        }
 +        snew(state_local->ld_rngi,1);
 +    }
 +}
 +
 +/* Append the charge-group link cg_gl -> cg_gl_j if it is not already
 + * present, growing the link array as needed. Assumes links for cg_gl
 + * are the most recently added entries, so index[cg_gl+1] is the current
 + * end of the list.
 + */
 +static void check_link(t_blocka *link,int cg_gl,int cg_gl_j)
 +{
 +    int  k,aj;
 +    gmx_bool bFound;
 +    
 +    bFound = FALSE;
 +    for(k=link->index[cg_gl]; k<link->index[cg_gl+1]; k++)
 +    {
 +        if (link->a[k] == cg_gl_j)
 +        {
 +            bFound = TRUE;
 +        }
 +    }
 +    if (!bFound)
 +    {
 +        /* Add this charge group link */
 +        if (link->index[cg_gl+1]+1 > link->nalloc_a)
 +        {
 +            link->nalloc_a = over_alloc_large(link->index[cg_gl+1]+1);
 +            srenew(link->a,link->nalloc_a);
 +        }
 +        link->a[link->index[cg_gl+1]] = cg_gl_j;
 +        link->index[cg_gl+1]++;
 +    }
 +}
 +
 +/* Return a newly allocated atom -> charge-group index table for cgs;
 + * the caller is responsible for freeing it (with sfree).
 + */
 +static int *make_at2cg(t_block *cgs)
 +{
 +    int *at2cg,cg,a;
 +    
 +    snew(at2cg,cgs->index[cgs->nr]);
 +    for(cg=0; cg<cgs->nr; cg++)
 +    {
 +        for(a=cgs->index[cg]; a<cgs->index[cg+1]; a++)
 +        {
 +            at2cg[a] = cg;
 +        }
 +    }
 +    
 +    return at2cg;
 +}
 +
 +t_blocka *make_charge_group_links(gmx_mtop_t *mtop,gmx_domdec_t *dd,
 +                                  cginfo_mb_t *cginfo_mb)
 +{
 +    gmx_reverse_top_t *rt;
 +    int  mb,cg_offset,cg,cg_gl,a,aj,i,j,ftype,nral,nlink_mol,mol,ncgi;
 +    gmx_molblock_t *molb;
 +    gmx_moltype_t *molt;
 +    t_block *cgs;
 +    t_blocka *excls;
 +    int *a2c;
 +    gmx_reverse_ilist_t ril;
 +    t_blocka *link;
 +    cginfo_mb_t *cgi_mb;
 +    
 +    /* For each charge group make a list of other charge groups
 +     * in the system that a linked to it via bonded interactions
 +     * which are also stored in reverse_top.
 +     */
 +    
 +    rt = dd->reverse_top;
 +    
 +    snew(link,1);
 +    snew(link->index,ncg_mtop(mtop)+1);
 +    link->nalloc_a = 0;
 +    link->a = NULL;
 +    
 +    link->index[0] = 0;
 +    cg_offset = 0;
 +    ncgi = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        if (molb->nmol == 0)
 +        {
 +            continue;
 +        }
 +        molt = &mtop->moltype[molb->type];
 +        cgs   = &molt->cgs;
 +        excls = &molt->excls;
 +        a2c = make_at2cg(cgs);
 +        /* Make a reverse ilist in which the interactions are linked
 +         * to all atoms, not only the first atom as in gmx_reverse_top.
 +         * The constraints are discarded here.
 +         */
 +        make_reverse_ilist(molt,NULL,FALSE,FALSE,FALSE,TRUE,&ril);
 +
 +        cgi_mb = &cginfo_mb[mb];
 +        
 +        for(cg=0; cg<cgs->nr; cg++)
 +        {
 +            cg_gl = cg_offset + cg;
 +            link->index[cg_gl+1] = link->index[cg_gl];
 +            for(a=cgs->index[cg]; a<cgs->index[cg+1]; a++)
 +            {
 +                i = ril.index[a];
 +                while (i < ril.index[a+1])
 +                {
 +                    ftype = ril.il[i++];
 +                    nral = NRAL(ftype);
 +                    /* Skip the ifunc index */
 +                    i++;
 +                    for(j=0; j<nral; j++)
 +                    {
 +                        aj = ril.il[i+j];
 +                        if (a2c[aj] != cg)
 +                        {
 +                            check_link(link,cg_gl,cg_offset+a2c[aj]);
 +                        }
 +                    }
 +                    i += nral_rt(ftype);
 +                }
 +                if (rt->bExclRequired)
 +                {
 +                    /* Exclusions always go both ways */
 +                    for(j=excls->index[a]; j<excls->index[a+1]; j++)
 +                    {
 +                        aj = excls->a[j];
 +                        if (a2c[aj] != cg)
 +                        {
 +                            check_link(link,cg_gl,cg_offset+a2c[aj]);
 +                        }
 +                    }
 +                }
 +            }
 +            if (link->index[cg_gl+1] - link->index[cg_gl] > 0)
 +            {
 +                SET_CGINFO_BOND_INTER(cgi_mb->cginfo[cg]);
 +                ncgi++;
 +            }
 +        }
 +        nlink_mol = link->index[cg_offset+cgs->nr] - link->index[cg_offset];
 +        
 +        cg_offset += cgs->nr;
 +        
 +        destroy_reverse_ilist(&ril);
 +        sfree(a2c);
 +        
 +        if (debug)
 +        {
 +            fprintf(debug,"molecule type '%s' %d cgs has %d cg links through bonded interac.\n",*molt->name,cgs->nr,nlink_mol);
 +        }
 +        
 +        if (molb->nmol > 1)
 +        {
 +            /* Copy the data for the rest of the molecules in this block */
 +            link->nalloc_a += (molb->nmol - 1)*nlink_mol;
 +            srenew(link->a,link->nalloc_a);
 +            for(mol=1; mol<molb->nmol; mol++)
 +            {
 +                for(cg=0; cg<cgs->nr; cg++)
 +                {
 +                    cg_gl = cg_offset + cg;
 +                    link->index[cg_gl+1] =
 +                        link->index[cg_gl+1-cgs->nr] + nlink_mol;
 +                    for(j=link->index[cg_gl]; j<link->index[cg_gl+1]; j++)
 +                    {
 +                        link->a[j] = link->a[j-nlink_mol] + cgs->nr;
 +                    }
 +                    if (link->index[cg_gl+1] - link->index[cg_gl] > 0 &&
 +                        cg_gl - cgi_mb->cg_start < cgi_mb->cg_mod)
 +                    {
 +                        SET_CGINFO_BOND_INTER(cgi_mb->cginfo[cg_gl - cgi_mb->cg_start]);
 +                        ncgi++;
 +                    }
 +                }
 +                cg_offset += cgs->nr;
 +            }
 +        }
 +    }
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Of the %d charge groups %d are linked via bonded interactions\n",ncg_mtop(mtop),ncgi);
 +    }
 +    
 +    return link;
 +}
 +
 +static void bonded_cg_distance_mol(gmx_moltype_t *molt,int *at2cg,
 +                                   gmx_bool bBCheck,gmx_bool bExcl,rvec *cg_cm,
 +                                   real *r_2b,int *ft2b,int *a2_1,int *a2_2,
 +                                   real *r_mb,int *ftmb,int *am_1,int *am_2)
 +{
 +    int ftype,nral,i,j,ai,aj,cgi,cgj;
 +    t_ilist *il;
 +    t_blocka *excls;
 +    real r2_2b,r2_mb,rij2;
 +    
 +    r2_2b = 0;
 +    r2_mb = 0;
 +    for(ftype=0; ftype<F_NRE; ftype++)
 +    {
 +        if (dd_check_ftype(ftype,bBCheck,FALSE,FALSE))
 +        {
 +            il = &molt->ilist[ftype];
 +            nral = NRAL(ftype);
 +            if (nral > 1)
 +            {
 +                for(i=0; i<il->nr; i+=1+nral)
 +                {
 +                    for(ai=0; ai<nral; ai++)
 +                    {
 +                        cgi = at2cg[il->iatoms[i+1+ai]];
 +                                              for(aj=0; aj<nral; aj++) {
 +                                                      cgj = at2cg[il->iatoms[i+1+aj]];
 +                                                      if (cgi != cgj)
 +                            {
 +                                                              rij2 = distance2(cg_cm[cgi],cg_cm[cgj]);
 +                                                              if (nral == 2 && rij2 > r2_2b)
 +                                {
 +                                                                      r2_2b = rij2;
 +                                    *ft2b = ftype;
 +                                    *a2_1 = il->iatoms[i+1+ai];
 +                                    *a2_2 = il->iatoms[i+1+aj];
 +                                                              }
 +                                                              if (nral >  2 && rij2 > r2_mb)
 +                                {
 +                                                                      r2_mb = rij2;
 +                                    *ftmb = ftype;
 +                                    *am_1 = il->iatoms[i+1+ai];
 +                                    *am_2 = il->iatoms[i+1+aj];
 +                                                              }
 +                                                      }
 +                                              }
 +                                      }
 +                              }
 +                      }
 +              }
 +      }
 +      if (bExcl)
 +    {
 +              excls = &molt->excls;
 +              for(ai=0; ai<excls->nr; ai++)
 +        {
 +                      cgi = at2cg[ai];
 +                      for(j=excls->index[ai]; j<excls->index[ai+1]; j++) {
 +                              cgj = at2cg[excls->a[j]];
 +                              if (cgi != cgj)
 +                {
 +                                      rij2 = distance2(cg_cm[cgi],cg_cm[cgj]);
 +                                      if (rij2 > r2_2b)
 +                    {
 +                                              r2_2b = rij2;
 +                                      }
 +                              }
 +                      }
 +              }
 +      }
 +      
 +      *r_2b = sqrt(r2_2b);
 +      *r_mb = sqrt(r2_mb);
 +}
 +
 +static void get_cgcm_mol(gmx_moltype_t *molt,gmx_ffparams_t *ffparams,
 +                         int ePBC,t_graph *graph,matrix box,
 +                         gmx_vsite_t *vsite,
 +                         rvec *x,rvec *xs,rvec *cg_cm)
 +{
 +    int n,i;
 +
 +    if (ePBC != epbcNONE)
 +    {
 +        mk_mshift(NULL,graph,ePBC,box,x);
 +        
 +        shift_x(graph,box,x,xs);
 +        /* By doing an extra mk_mshift the molecules that are broken
 +         * because they were e.g. imported from another software
 +         * will be made whole again. Such are the healing powers
 +         * of GROMACS.
 +         */  
 +        mk_mshift(NULL,graph,ePBC,box,xs);
 +    }
 +    else
 +    {
 +        /* We copy the coordinates so the original coordinates remain
 +         * unchanged, just to be 100% sure that we do not affect
 +         * binary reproducibility of simulations.
 +         */
 +        n = molt->cgs.index[molt->cgs.nr];
 +        for(i=0; i<n; i++)
 +        {
 +            copy_rvec(x[i],xs[i]);
 +        }
 +    }
 +    
 +    if (vsite)
 +    {
 +        construct_vsites(NULL,vsite,xs,NULL,0.0,NULL,
 +                         ffparams->iparams,molt->ilist,
 +                         epbcNONE,TRUE,NULL,NULL,NULL);
 +    }
 +    
 +    calc_cgcm(NULL,0,molt->cgs.nr,&molt->cgs,xs,cg_cm);
 +}
 +
 +static int have_vsite_molt(gmx_moltype_t *molt)
 +{
 +    int  i;
 +    gmx_bool bVSite;
 +    
 +    bVSite = FALSE;
 +    for(i=0; i<F_NRE; i++)
 +    {
 +        if ((interaction_function[i].flags & IF_VSITE) &&
 +            molt->ilist[i].nr > 0) {
 +            bVSite = TRUE;
 +        }
 +    }
 +    
 +    return bVSite;
 +}
 +
 +void dd_bonded_cg_distance(FILE *fplog,
 +                           gmx_domdec_t *dd,gmx_mtop_t *mtop,
 +                           t_inputrec *ir,rvec *x,matrix box,
 +                           gmx_bool bBCheck,
 +                           real *r_2b,real *r_mb)
 +{
 +    gmx_bool bExclRequired;
 +    int  mb,cg_offset,at_offset,*at2cg,mol;
 +    t_graph graph;
 +    gmx_vsite_t *vsite;
 +    gmx_molblock_t *molb;
 +    gmx_moltype_t *molt;
 +    rvec *xs,*cg_cm;
 +    real rmol_2b,rmol_mb;
 +    int ft2b=-1,a_2b_1=-1,a_2b_2=-1,ftmb=-1,a_mb_1=-1,a_mb_2=-1;
 +    int ftm2b=-1,amol_2b_1=-1,amol_2b_2=-1,ftmmb=-1,amol_mb_1=-1,amol_mb_2=-1;
 +    
 +    bExclRequired = IR_EXCL_FORCES(*ir);
 +    
-     
++    vsite = init_vsite(mtop,NULL,TRUE);
 +    
 +    *r_2b = 0;
 +    *r_mb = 0;
 +    cg_offset = 0;
 +    at_offset = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +        if (molt->cgs.nr == 1 || molb->nmol == 0)
 +        {
 +            cg_offset += molb->nmol*molt->cgs.nr;
 +            at_offset += molb->nmol*molt->atoms.nr;
 +        }
 +        else
 +        {
 +            if (ir->ePBC != epbcNONE)
 +            {
 +                mk_graph_ilist(NULL,molt->ilist,0,molt->atoms.nr,FALSE,FALSE,
 +                               &graph);
 +            }
 +            
 +            at2cg = make_at2cg(&molt->cgs);
 +            snew(xs,molt->atoms.nr);
 +            snew(cg_cm,molt->cgs.nr);
 +            for(mol=0; mol<molb->nmol; mol++)
 +            {
 +                get_cgcm_mol(molt,&mtop->ffparams,ir->ePBC,&graph,box,
 +                             have_vsite_molt(molt) ? vsite : NULL,
 +                             x+at_offset,xs,cg_cm);
 +                
 +                bonded_cg_distance_mol(molt,at2cg,bBCheck,bExclRequired,cg_cm,
 +                                       &rmol_2b,&ftm2b,&amol_2b_1,&amol_2b_2,
 +                                       &rmol_mb,&ftmmb,&amol_mb_1,&amol_mb_2);
 +                if (rmol_2b > *r_2b)
 +                {
 +                    *r_2b  = rmol_2b;
 +                    ft2b   = ftm2b;
 +                    a_2b_1 = at_offset + amol_2b_1;
 +                    a_2b_2 = at_offset + amol_2b_2;
 +                }
 +                if (rmol_mb > *r_mb)
 +                {
 +                    *r_mb  = rmol_mb;
 +                    ftmb   = ftmmb;
 +                    a_mb_1 = at_offset + amol_mb_1;
 +                    a_mb_2 = at_offset + amol_mb_2;
 +                }
 +                
 +                cg_offset += molt->cgs.nr;
 +                at_offset += molt->atoms.nr;
 +            }
 +            sfree(cg_cm);
 +            sfree(xs);
 +            sfree(at2cg);
 +            if (ir->ePBC != epbcNONE)
 +            {
 +                done_graph(&graph);
 +            }
 +        }
 +    }
++
++    /* We should have a vsite free routine, but here we can simply free */
 +    sfree(vsite);
 +
 +    if (fplog && (ft2b >= 0 || ftmb >= 0))
 +    {
 +        fprintf(fplog,
 +                "Initial maximum inter charge-group distances:\n");
 +        if (ft2b >= 0)
 +        {
 +            fprintf(fplog,
 +                    "    two-body bonded interactions: %5.3f nm, %s, atoms %d %d\n",
 +                    *r_2b,interaction_function[ft2b].longname,
 +                    a_2b_1+1,a_2b_2+1);
 +        }
 +        if (ftmb >= 0)
 +        {
 +            fprintf(fplog,
 +                    "  multi-body bonded interactions: %5.3f nm, %s, atoms %d %d\n",
 +                    *r_mb,interaction_function[ftmb].longname,
 +                    a_mb_1+1,a_mb_2+1);
 +        }
 +    }
 +}
index 27e06cb0c79fa9bf4347f04bca30f07d081baca7,0000000000000000000000000000000000000000..2ca0c7766063202eb7bfbc232f3e13e53c193bc2
mode 100644,000000..100644
--- /dev/null
@@@ -1,927 -1,0 +1,922 @@@
-             else
-             {
-                 /* Energies and virial are obtained later from the PME nodes */
-                 /* but values have to be zeroed out here */
-                 Vlr=0.0;
-             }
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include <assert.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "physics.h"
 +#include "force.h"
 +#include "nonbonded.h"
 +#include "names.h"
 +#include "network.h"
 +#include "pbc.h"
 +#include "ns.h"
 +#include "nrnb.h"
 +#include "bondf.h"
 +#include "mshift.h"
 +#include "txtdump.h"
 +#include "coulomb.h"
 +#include "pme.h"
 +#include "mdrun.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "qmmm.h"
 +#include "gmx_omp_nthreads.h"
 +
 +
 +void ns(FILE *fp,
 +        t_forcerec *fr,
 +        rvec       x[],
 +        matrix     box,
 +        gmx_groups_t *groups,
 +        t_grpopts  *opts,
 +        gmx_localtop_t *top,
 +        t_mdatoms  *md,
 +        t_commrec  *cr,
 +        t_nrnb     *nrnb,
 +        real       *lambda,
 +        real       *dvdlambda,
 +        gmx_grppairener_t *grppener,
 +        gmx_bool       bFillGrid,
 +        gmx_bool       bDoLongRange,
 +        gmx_bool       bDoForces,
 +        rvec       *f)
 +{
 +  char   *ptr;
 +  int    nsearch;
 +
 +
 +  if (!fr->ns.nblist_initialized)
 +  {
 +      init_neighbor_list(fp, fr, md->homenr);
 +  }
 +
 +  if (fr->bTwinRange)
 +    fr->nlr=0;
 +
 +    nsearch = search_neighbours(fp,fr,x,box,top,groups,cr,nrnb,md,
 +                                lambda,dvdlambda,grppener,
 +                                bFillGrid,bDoLongRange,
 +                                bDoForces,f);
 +  if (debug)
 +    fprintf(debug,"nsearch = %d\n",nsearch);
 +
 +  /* Check whether we have to do dynamic load balancing */
 +  /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0))
 +    count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr,
 +    &(top->idef),opts->ngener);
 +  */
 +  if (fr->ns.dump_nl > 0)
 +    dump_nblist(fp,cr,fr,fr->ns.dump_nl);
 +}
 +
 +static void reduce_thread_forces(int n,rvec *f,
 +                                 tensor vir,
 +                                 real *Vcorr,
 +                                 int efpt_ind,real *dvdl,
 +                                 int nthreads,f_thread_t *f_t)
 +{
 +    int t,i;
 +
 +    /* This reduction can run over any number of threads */
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntBonded)) private(t) schedule(static)
 +    for(i=0; i<n; i++)
 +    {
 +        for(t=1; t<nthreads; t++)
 +        {
 +            rvec_inc(f[i],f_t[t].f[i]);
 +        }
 +    }
 +    for(t=1; t<nthreads; t++)
 +    {
 +        *Vcorr += f_t[t].Vcorr;
 +        *dvdl  += f_t[t].dvdl[efpt_ind];
 +        m_add(vir,f_t[t].vir,vir);
 +    }
 +}
 +
 +void do_force_lowlevel(FILE       *fplog,   gmx_large_int_t step,
 +                       t_forcerec *fr,      t_inputrec *ir,
 +                       t_idef     *idef,    t_commrec  *cr,
 +                       t_nrnb     *nrnb,    gmx_wallcycle_t wcycle,
 +                       t_mdatoms  *md,
 +                       t_grpopts  *opts,
 +                       rvec       x[],      history_t  *hist,
 +                       rvec       f[],
 +                       gmx_enerdata_t *enerd,
 +                       t_fcdata   *fcd,
 +                       gmx_mtop_t     *mtop,
 +                       gmx_localtop_t *top,
 +                       gmx_genborn_t *born,
 +                       t_atomtypes *atype,
 +                       gmx_bool       bBornRadii,
 +                       matrix     box,
 +                       t_lambda   *fepvals,
 +                       real       *lambda,
 +                       t_graph    *graph,
 +                       t_blocka   *excl,
 +                       rvec       mu_tot[],
 +                       int        flags,
 +                       float      *cycles_pme)
 +{
 +    int     i,j,status;
 +    int     donb_flags;
 +    gmx_bool    bDoEpot,bSepDVDL,bSB;
 +    int     pme_flags;
 +    matrix  boxs;
 +    rvec    box_size;
 +    real    Vsr,Vlr,Vcorr=0;
 +    t_pbc   pbc;
 +    real    dvdgb;
 +    char    buf[22];
 +    gmx_enerdata_t ed_lam;
 +    double  clam_i,vlam_i;
 +    real    dvdl_dum[efptNR], dvdl, dvdl_nb[efptNR], lam_i[efptNR];
 +    real    dvdlsum;
 +
 +#ifdef GMX_MPI
 +    double  t0=0.0,t1,t2,t3; /* time measurement for coarse load balancing */
 +#endif
 +
 +#define PRINT_SEPDVDL(s,v,dvdlambda) if (bSepDVDL) fprintf(fplog,sepdvdlformat,s,v,dvdlambda);
 +
 +
 +    set_pbc(&pbc,fr->ePBC,box);
 +
 +    /* reset free energy components */
 +    for (i=0;i<efptNR;i++)
 +    {
 +        dvdl_nb[i]  = 0;
 +        dvdl_dum[i] = 0;
 +    }
 +
 +    /* Reset box */
 +    for(i=0; (i<DIM); i++)
 +    {
 +        box_size[i]=box[i][i];
 +    }
 +
 +    bSepDVDL=(fr->bSepDVDL && do_per_step(step,ir->nstlog));
 +    debug_gmx();
 +
 +    /* do QMMM first if requested */
 +    if(fr->bQMMM)
 +    {
 +        enerd->term[F_EQM] = calculate_QMMM(cr,x,f,fr,md);
 +    }
 +
 +    if (bSepDVDL)
 +    {
 +        fprintf(fplog,"Step %s: non-bonded V and dVdl for node %d:\n",
 +                gmx_step_str(step,buf),cr->nodeid);
 +    }
 +
 +    /* Call the short range functions all in one go. */
 +
 +#ifdef GMX_MPI
 +    /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/
 +#define TAKETIME FALSE
 +    if (TAKETIME)
 +    {
 +        MPI_Barrier(cr->mpi_comm_mygroup);
 +        t0=MPI_Wtime();
 +    }
 +#endif
 +
 +    if (ir->nwall)
 +    {
 +        /* foreign lambda component for walls */
 +        dvdl = do_walls(ir,fr,box,md,x,f,lambda[efptVDW],
 +                        enerd->grpp.ener[egLJSR],nrnb);
 +        PRINT_SEPDVDL("Walls",0.0,dvdl);
 +        enerd->dvdl_lin[efptVDW] += dvdl;
 +    }
 +
 +      /* If doing GB, reset dvda and calculate the Born radii */
 +      if (ir->implicit_solvent)
 +      {
 +        wallcycle_sub_start(wcycle, ewcsNONBONDED);
 +
 +              for(i=0;i<born->nr;i++)
 +              {
 +                      fr->dvda[i]=0;
 +              }
 +
 +              if(bBornRadii)
 +              {
 +                      calc_gb_rad(cr,fr,ir,top,atype,x,&(fr->gblist),born,md,nrnb);
 +              }
 +
 +        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
 +      }
 +
 +    where();
 +    if (flags & GMX_FORCE_NONBONDED)
 +    {
 +        donb_flags = 0;
 +        if (flags & GMX_FORCE_FORCES)
 +        {
 +            donb_flags |= GMX_DONB_FORCES;
 +        }
 +
 +        wallcycle_sub_start(wcycle, ewcsNONBONDED);
 +        do_nonbonded(cr,fr,x,f,md,excl,
 +                    fr->bBHAM ?
 +                    enerd->grpp.ener[egBHAMSR] :
 +                    enerd->grpp.ener[egLJSR],
 +                    enerd->grpp.ener[egCOULSR],
 +                    enerd->grpp.ener[egGB],box_size,nrnb,
 +                    lambda,dvdl_nb,-1,-1,donb_flags);
 +        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
 +    }
 +
 +    /* If we do foreign lambda and we have soft-core interactions
 +     * we have to recalculate the (non-linear) energies contributions.
 +     */
 +    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsNONBONDED);
 +        init_enerdata(mtop->groups.grps[egcENER].nr,fepvals->n_lambda,&ed_lam);
 +
 +        for(i=0; i<enerd->n_lambda; i++)
 +        {
 +            for (j=0;j<efptNR;j++)
 +            {
 +                lam_i[j] = (i==0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
 +            }
 +            reset_enerdata(&ir->opts,fr,TRUE,&ed_lam,FALSE);
 +            do_nonbonded(cr,fr,x,f,md,excl,
 +                         fr->bBHAM ?
 +                         ed_lam.grpp.ener[egBHAMSR] :
 +                         ed_lam.grpp.ener[egLJSR],
 +                         ed_lam.grpp.ener[egCOULSR],
 +                         enerd->grpp.ener[egGB], box_size,nrnb,
 +                         lam_i,dvdl_dum,-1,-1,
 +                         GMX_DONB_FOREIGNLAMBDA);
 +            sum_epot(&ir->opts,&ed_lam);
 +            enerd->enerpart_lambda[i] += ed_lam.term[F_EPOT];
 +        }
 +        destroy_enerdata(&ed_lam);
 +        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
 +    }
 +    where();
 +
 +      /* If we are doing GB, calculate bonded forces and apply corrections
 +       * to the solvation forces */
 +    /* MRS: Eventually, many need to include free energy contribution here! */
 +      if (ir->implicit_solvent)
 +    {
 +              calc_gb_forces(cr,md,born,top,atype,x,f,fr,idef,
 +                       ir->gb_algorithm,ir->sa_algorithm,nrnb,bBornRadii,&pbc,graph,enerd);
 +        wallcycle_sub_stop(wcycle, ewcsBONDED);
 +    }
 +
 +#ifdef GMX_MPI
 +    if (TAKETIME)
 +    {
 +        t1=MPI_Wtime();
 +        fr->t_fnbf += t1-t0;
 +    }
 +#endif
 +
 +    if (fepvals->sc_alpha!=0)
 +    {
 +        enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
 +    }
 +    else
 +    {
 +        enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
 +    }
 +
 +    if (fepvals->sc_alpha!=0)
 +
 +        /* even though coulomb part is linear, we already added it, beacuse we
 +           need to go through the vdw calculation anyway */
 +    {
 +        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
 +    }
 +    else
 +    {
 +        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
 +    }
 +
 +    Vsr = 0;
 +    if (bSepDVDL)
 +    {
 +        for(i=0; i<enerd->grpp.nener; i++)
 +        {
 +            Vsr +=
 +                (fr->bBHAM ?
 +                 enerd->grpp.ener[egBHAMSR][i] :
 +                 enerd->grpp.ener[egLJSR][i])
 +                + enerd->grpp.ener[egCOULSR][i] + enerd->grpp.ener[egGB][i];
 +        }
 +        dvdlsum = dvdl_nb[efptVDW] + dvdl_nb[efptCOUL];
 +        PRINT_SEPDVDL("VdW and Coulomb SR particle-p.",Vsr,dvdlsum);
 +    }
 +    debug_gmx();
 +
 +
 +    if (debug)
 +    {
 +        pr_rvecs(debug,0,"fshift after SR",fr->fshift,SHIFTS);
 +    }
 +
 +    /* Shift the coordinates. Must be done before bonded forces and PPPM,
 +     * but is also necessary for SHAKE and update, therefore it can NOT
 +     * go when no bonded forces have to be evaluated.
 +     */
 +
 +    /* Here sometimes we would not need to shift with NBFonly,
 +     * but we do so anyhow for consistency of the returned coordinates.
 +     */
 +    if (graph)
 +    {
 +        shift_self(graph,box,x);
 +        if (TRICLINIC(box))
 +        {
 +            inc_nrnb(nrnb,eNR_SHIFTX,2*graph->nnodes);
 +        }
 +        else
 +        {
 +            inc_nrnb(nrnb,eNR_SHIFTX,graph->nnodes);
 +        }
 +    }
 +    /* Check whether we need to do bondeds or correct for exclusions */
 +    if (fr->bMolPBC &&
 +        ((flags & GMX_FORCE_BONDED)
 +         || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype)))
 +    {
 +        /* Since all atoms are in the rectangular or triclinic unit-cell,
 +         * only single box vector shifts (2 in x) are required.
 +         */
 +        set_pbc_dd(&pbc,fr->ePBC,cr->dd,TRUE,box);
 +    }
 +    debug_gmx();
 +
 +    if (flags & GMX_FORCE_BONDED)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsBONDED);
 +        calc_bonds(fplog,cr->ms,
 +                   idef,x,hist,f,fr,&pbc,graph,enerd,nrnb,lambda,md,fcd,
 +                   DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born,
 +                   flags,
 +                   fr->bSepDVDL && do_per_step(step,ir->nstlog),step);
 +
 +        /* Check if we have to determine energy differences
 +         * at foreign lambda's.
 +         */
 +        if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) &&
 +            idef->ilsort != ilsortNO_FE)
 +        {
 +            if (idef->ilsort != ilsortFE_SORTED)
 +            {
 +                gmx_incons("The bonded interactions are not sorted for free energy");
 +            }
 +            init_enerdata(mtop->groups.grps[egcENER].nr,fepvals->n_lambda,&ed_lam);
 +
 +            for(i=0; i<enerd->n_lambda; i++)
 +            {
 +                reset_enerdata(&ir->opts,fr,TRUE,&ed_lam,FALSE);
 +                for (j=0;j<efptNR;j++)
 +                {
 +                    lam_i[j] = (i==0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
 +                }
 +                calc_bonds_lambda(fplog,idef,x,fr,&pbc,graph,&ed_lam,nrnb,lam_i,md,
 +                                  fcd,DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
 +                sum_epot(&ir->opts,&ed_lam);
 +                enerd->enerpart_lambda[i] += ed_lam.term[F_EPOT];
 +            }
 +            destroy_enerdata(&ed_lam);
 +        }
 +        debug_gmx();
 +
 +        wallcycle_sub_stop(wcycle, ewcsBONDED);
 +    }
 +
 +    where();
 +
 +    *cycles_pme = 0;
 +    if (EEL_FULL(fr->eeltype))
 +    {
 +        bSB = (ir->nwall == 2);
 +        if (bSB)
 +        {
 +            copy_mat(box,boxs);
 +            svmul(ir->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
 +            box_size[ZZ] *= ir->wall_ewald_zfac;
 +        }
 +
 +        clear_mat(fr->vir_el_recip);
 +
 +        if (fr->bEwald)
 +        {
 +            Vcorr = 0;
 +            dvdl  = 0;
 +
 +            /* With the Verlet scheme exclusion forces are calculated
 +             * in the non-bonded kernel.
 +             */
 +            /* The TPI molecule does not have exclusions with the rest
 +             * of the system and no intra-molecular PME grid contributions
 +             * will be calculated in gmx_pme_calc_energy.
 +             */
 +            if ((ir->cutoff_scheme == ecutsGROUP && fr->n_tpi == 0) ||
 +                ir->ewald_geometry != eewg3D ||
 +                ir->epsilon_surface != 0)
 +            {
 +                int nthreads,t;
 +
 +                wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION);
 +
 +                if (fr->n_tpi > 0)
 +                {
 +                    gmx_fatal(FARGS,"TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions");
 +                }
 +
 +                nthreads = gmx_omp_nthreads_get(emntBonded);
 +#pragma omp parallel for num_threads(nthreads) schedule(static)
 +                for(t=0; t<nthreads; t++)
 +                {
 +                    int s,e,i;
 +                    rvec *fnv;
 +                    tensor *vir;
 +                    real *Vcorrt,*dvdlt;
 +                    if (t == 0)
 +                    {
 +                        fnv    = fr->f_novirsum;
 +                        vir    = &fr->vir_el_recip;
 +                        Vcorrt = &Vcorr;
 +                        dvdlt  = &dvdl;
 +                    }
 +                    else
 +                    {
 +                        fnv    = fr->f_t[t].f;
 +                        vir    = &fr->f_t[t].vir;
 +                        Vcorrt = &fr->f_t[t].Vcorr;
 +                        dvdlt  = &fr->f_t[t].dvdl[efptCOUL];
 +                        for(i=0; i<fr->natoms_force; i++)
 +                        {
 +                            clear_rvec(fnv[i]);
 +                        }
 +                        clear_mat(*vir);
 +                    }
 +                    *dvdlt = 0;
 +                    *Vcorrt =
 +                        ewald_LRcorrection(fplog,
 +                                           fr->excl_load[t],fr->excl_load[t+1],
 +                                           cr,t,fr,
 +                                           md->chargeA,
 +                                           md->nChargePerturbed ? md->chargeB : NULL,
 +                                           ir->cutoff_scheme != ecutsVERLET,
 +                                           excl,x,bSB ? boxs : box,mu_tot,
 +                                           ir->ewald_geometry,
 +                                           ir->epsilon_surface,
 +                                           fnv,*vir,
 +                                           lambda[efptCOUL],dvdlt);
 +                }
 +                if (nthreads > 1)
 +                {
 +                    reduce_thread_forces(fr->natoms_force,fr->f_novirsum,
 +                                         fr->vir_el_recip,
 +                                         &Vcorr,efptCOUL,&dvdl,
 +                                         nthreads,fr->f_t);
 +                }
 +
 +                wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION);
 +            }
 +
 +            if (fr->n_tpi == 0)
 +            {
 +                Vcorr += ewald_charge_correction(cr,fr,lambda[efptCOUL],box,
 +                                                 &dvdl,fr->vir_el_recip);
 +            }
 +
 +            PRINT_SEPDVDL("Ewald excl./charge/dip. corr.",Vcorr,dvdl);
 +            enerd->dvdl_lin[efptCOUL] += dvdl;
 +        }
 +
 +        status = 0;
++        Vlr  = 0;
 +        dvdl = 0;
 +        switch (fr->eeltype)
 +        {
 +        case eelPME:
 +        case eelPMESWITCH:
 +        case eelPMEUSER:
 +        case eelPMEUSERSWITCH:
 +        case eelP3M_AD:
 +            if (cr->duty & DUTY_PME)
 +            {
 +                assert(fr->n_tpi >= 0);
 +                if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED))
 +                {
 +                    pme_flags = GMX_PME_SPREAD_Q | GMX_PME_SOLVE;
 +                    if (flags & GMX_FORCE_FORCES)
 +                    {
 +                        pme_flags |= GMX_PME_CALC_F;
 +                    }
 +                    if (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY))
 +                    {
 +                        pme_flags |= GMX_PME_CALC_ENER_VIR;
 +                    }
 +                    if (fr->n_tpi > 0)
 +                    {
 +                        /* We don't calculate f, but we do want the potential */
 +                        pme_flags |= GMX_PME_CALC_POT;
 +                    }
 +                    wallcycle_start(wcycle,ewcPMEMESH);
 +                    status = gmx_pme_do(fr->pmedata,
 +                                        md->start,md->homenr - fr->n_tpi,
 +                                        x,fr->f_novirsum,
 +                                        md->chargeA,md->chargeB,
 +                                        bSB ? boxs : box,cr,
 +                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0,
 +                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0,
 +                                        nrnb,wcycle,
 +                                        fr->vir_el_recip,fr->ewaldcoeff,
 +                                        &Vlr,lambda[efptCOUL],&dvdl,
 +                                        pme_flags);
 +                    *cycles_pme = wallcycle_stop(wcycle,ewcPMEMESH);
 +
 +                    /* We should try to do as little computation after
 +                     * this as possible, because parallel PME synchronizes
 +                     * the nodes, so we want all load imbalance of the rest
 +                     * of the force calculation to be before the PME call.
 +                     * DD load balancing is done on the whole time of
 +                     * the force call (without PME).
 +                     */
 +                }
 +                if (fr->n_tpi > 0)
 +                {
 +                    /* Determine the PME grid energy of the test molecule
 +                     * with the PME grid potential of the other charges.
 +                     */
 +                    gmx_pme_calc_energy(fr->pmedata,fr->n_tpi,
 +                                        x + md->homenr - fr->n_tpi,
 +                                        md->chargeA + md->homenr - fr->n_tpi,
 +                                        &Vlr);
 +                }
 +                PRINT_SEPDVDL("PME mesh",Vlr,dvdl);
 +            }
-             Vlr = 0;
 +            break;
 +        case eelEWALD:
 +            Vlr = do_ewald(fplog,FALSE,ir,x,fr->f_novirsum,
 +                           md->chargeA,md->chargeB,
 +                           box_size,cr,md->homenr,
 +                           fr->vir_el_recip,fr->ewaldcoeff,
 +                           lambda[efptCOUL],&dvdl,fr->ewald_table);
 +            PRINT_SEPDVDL("Ewald long-range",Vlr,dvdl);
 +            break;
 +        default:
 +            gmx_fatal(FARGS,"No such electrostatics method implemented %s",
 +                      eel_names[fr->eeltype]);
 +        }
 +        if (status != 0)
 +        {
 +            gmx_fatal(FARGS,"Error %d in long range electrostatics routine %s",
 +                      status,EELTYPE(fr->eeltype));
 +              }
++        /* Note that with separate PME nodes we get the real energies later */
 +        enerd->dvdl_lin[efptCOUL] += dvdl;
 +        enerd->term[F_COUL_RECIP] = Vlr + Vcorr;
 +        if (debug)
 +        {
 +            fprintf(debug,"Vlr = %g, Vcorr = %g, Vlr_corr = %g\n",
 +                    Vlr,Vcorr,enerd->term[F_COUL_RECIP]);
 +            pr_rvecs(debug,0,"vir_el_recip after corr",fr->vir_el_recip,DIM);
 +            pr_rvecs(debug,0,"fshift after LR Corrections",fr->fshift,SHIFTS);
 +        }
 +    }
 +    else
 +    {
 +        if (EEL_RF(fr->eeltype))
 +        {
 +            /* With the Verlet scheme exclusion forces are calculated
 +             * in the non-bonded kernel.
 +             */
 +            if (ir->cutoff_scheme != ecutsVERLET && fr->eeltype != eelRF_NEC)
 +            {
 +                dvdl = 0;
 +                enerd->term[F_RF_EXCL] =
 +                    RF_excl_correction(fplog,fr,graph,md,excl,x,f,
 +                                       fr->fshift,&pbc,lambda[efptCOUL],&dvdl);
 +            }
 +
 +            enerd->dvdl_lin[efptCOUL] += dvdl;
 +            PRINT_SEPDVDL("RF exclusion correction",
 +                          enerd->term[F_RF_EXCL],dvdl);
 +        }
 +    }
 +    where();
 +    debug_gmx();
 +
 +    if (debug)
 +    {
 +        print_nrnb(debug,nrnb);
 +    }
 +    debug_gmx();
 +
 +#ifdef GMX_MPI
 +    if (TAKETIME)
 +    {
 +        t2=MPI_Wtime();
 +        MPI_Barrier(cr->mpi_comm_mygroup);
 +        t3=MPI_Wtime();
 +        fr->t_wait += t3-t2;
 +        if (fr->timesteps == 11)
 +        {
 +            fprintf(stderr,"* PP load balancing info: node %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n",
 +                    cr->nodeid, gmx_step_str(fr->timesteps,buf),
 +                    100*fr->t_wait/(fr->t_wait+fr->t_fnbf),
 +                    (fr->t_fnbf+fr->t_wait)/fr->t_fnbf);
 +        }
 +        fr->timesteps++;
 +    }
 +#endif
 +
 +    if (debug)
 +    {
 +        pr_rvecs(debug,0,"fshift after bondeds",fr->fshift,SHIFTS);
 +    }
 +
 +}
 +
 +void init_enerdata(int ngener,int n_lambda,gmx_enerdata_t *enerd)
 +{
 +    int i,n2;
 +
 +    for(i=0; i<F_NRE; i++)
 +    {
 +        enerd->term[i] = 0;
 +    }
 +
 +
 +    for(i=0; i<efptNR; i++) {
 +        enerd->dvdl_lin[i]  = 0;
 +        enerd->dvdl_nonlin[i]  = 0;
 +    }
 +
 +    n2=ngener*ngener;
 +    if (debug)
 +    {
 +        fprintf(debug,"Creating %d sized group matrix for energies\n",n2);
 +    }
 +    enerd->grpp.nener = n2;
 +    for(i=0; (i<egNR); i++)
 +    {
 +        snew(enerd->grpp.ener[i],n2);
 +    }
 +
 +    if (n_lambda)
 +    {
 +        enerd->n_lambda = 1 + n_lambda;
 +        snew(enerd->enerpart_lambda,enerd->n_lambda);
 +    }
 +    else
 +    {
 +        enerd->n_lambda = 0;
 +    }
 +}
 +
 +void destroy_enerdata(gmx_enerdata_t *enerd)
 +{
 +    int i;
 +
 +    for(i=0; (i<egNR); i++)
 +    {
 +        sfree(enerd->grpp.ener[i]);
 +    }
 +
 +    if (enerd->n_lambda)
 +    {
 +        sfree(enerd->enerpart_lambda);
 +    }
 +}
 +
 +static real sum_v(int n,real v[])
 +{
 +  real t;
 +  int  i;
 +
 +  t = 0.0;
 +  for(i=0; (i<n); i++)
 +    t = t + v[i];
 +
 +  return t;
 +}
 +
 +void sum_epot(t_grpopts *opts,gmx_enerdata_t *enerd)
 +{
 +  gmx_grppairener_t *grpp;
 +  real *epot;
 +  int i;
 +
 +  grpp = &enerd->grpp;
 +  epot = enerd->term;
 +
 +  /* Accumulate energies */
 +  epot[F_COUL_SR]  = sum_v(grpp->nener,grpp->ener[egCOULSR]);
 +  epot[F_LJ]       = sum_v(grpp->nener,grpp->ener[egLJSR]);
 +  epot[F_LJ14]     = sum_v(grpp->nener,grpp->ener[egLJ14]);
 +  epot[F_COUL14]   = sum_v(grpp->nener,grpp->ener[egCOUL14]);
 +  epot[F_COUL_LR]  = sum_v(grpp->nener,grpp->ener[egCOULLR]);
 +  epot[F_LJ_LR]    = sum_v(grpp->nener,grpp->ener[egLJLR]);
 +  /* We have already added 1-2,1-3, and 1-4 terms to F_GBPOL */
 +  epot[F_GBPOL]   += sum_v(grpp->nener,grpp->ener[egGB]);
 +
 +/* lattice part of LR doesn't belong to any group
 + * and has been added earlier
 + */
 +  epot[F_BHAM]     = sum_v(grpp->nener,grpp->ener[egBHAMSR]);
 +  epot[F_BHAM_LR]  = sum_v(grpp->nener,grpp->ener[egBHAMLR]);
 +
 +  epot[F_EPOT] = 0;
 +  for(i=0; (i<F_EPOT); i++)
 +  {
 +      if (i != F_DISRESVIOL && i != F_ORIRESDEV)
 +      {
 +          epot[F_EPOT] += epot[i];
 +      }
 +  }
 +}
 +
 +void sum_dhdl(gmx_enerdata_t *enerd, real *lambda, t_lambda *fepvals)
 +{
 +    int i,j,index;
 +    double dlam;
 +
 +    enerd->dvdl_lin[efptVDW] += enerd->term[F_DVDL_VDW];  /* include dispersion correction */
 +    enerd->term[F_DVDL] = 0.0;
 +    for (i=0;i<efptNR;i++)
 +    {
 +        if (fepvals->separate_dvdl[i])
 +        {
 +            /* could this be done more readably/compactly? */
 +            switch (i) {
 +            case (efptCOUL):
 +                index = F_DVDL_COUL;
 +                break;
 +            case (efptVDW):
 +                index = F_DVDL_VDW;
 +                break;
 +            case (efptBONDED):
 +                index = F_DVDL_BONDED;
 +                break;
 +            case (efptRESTRAINT):
 +                index = F_DVDL_RESTRAINT;
 +                break;
 +            case (efptMASS):
 +                index = F_DKDL;
 +                break;
 +            default:
 +                index = F_DVDL;
 +                break;
 +            }
 +            enerd->term[index] = enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i];
 +            if (debug)
 +            {
 +                fprintf(debug,"dvdl-%s[%2d]: %f: non-linear %f + linear %f\n",
 +                        efpt_names[i],i,enerd->term[index],enerd->dvdl_nonlin[i],enerd->dvdl_lin[i]);
 +            }
 +        }
 +        else
 +        {
 +            enerd->term[F_DVDL] += enerd->dvdl_lin[i] + enerd->dvdl_nonlin[i];
 +            if (debug)
 +            {
 +                fprintf(debug,"dvd-%sl[%2d]: %f: non-linear %f + linear %f\n",
 +                        efpt_names[0],i,enerd->term[F_DVDL],enerd->dvdl_nonlin[i],enerd->dvdl_lin[i]);
 +            }
 +        }
 +    }
 +
 +    /* Notes on the foreign lambda free energy difference evaluation:
 +     * Adding the potential and ekin terms that depend linearly on lambda
 +     * as delta lam * dvdl to the energy differences is exact.
 +     * For the constraints this is not exact, but we have no other option
 +     * without literally changing the lengths and reevaluating the energies at each step.
 +     * (try to remedy this post 4.6 - MRS)
 +     * For the non-bonded LR term we assume that the soft-core (if present)
 +     * no longer affects the energy beyond the short-range cut-off,
 +     * which is a very good approximation (except for exotic settings).
 +     * (investigate how to overcome this post 4.6 - MRS)
 +     */
 +
 +    for(i=0; i<fepvals->n_lambda; i++)
 +    {                                         /* note we are iterating over fepvals here!
 +                                                 For the current lam, dlam = 0 automatically,
 +                                                 so we don't need to add anything to the
 +                                                 enerd->enerpart_lambda[0] */
 +
 +        /* we don't need to worry about dvdl contributions to the current lambda, because
 +           it's automatically zero */
 +
 +        /* first kinetic energy term */
 +        dlam = (fepvals->all_lambda[efptMASS][i] - lambda[efptMASS]);
 +
 +        enerd->enerpart_lambda[i+1] += enerd->term[F_DKDL]*dlam;
 +
 +        for (j=0;j<efptNR;j++)
 +        {
 +            if (j==efptMASS) {continue;} /* no other mass term to worry about */
 +
 +            dlam = (fepvals->all_lambda[j][i]-lambda[j]);
 +            enerd->enerpart_lambda[i+1] += dlam*enerd->dvdl_lin[j];
 +            if (debug)
 +            {
 +                fprintf(debug,"enerdiff lam %g: (%15s), non-linear %f linear %f*%f\n",
 +                        fepvals->all_lambda[j][i],efpt_names[j],
 +                        (enerd->enerpart_lambda[i+1] - enerd->enerpart_lambda[0]),
 +                        dlam,enerd->dvdl_lin[j]);
 +            }
 +        }
 +    }
 +}
 +
 +void reset_enerdata(t_grpopts *opts,
 +                    t_forcerec *fr,gmx_bool bNS,
 +                    gmx_enerdata_t *enerd,
 +                    gmx_bool bMaster)
 +{
 +    gmx_bool bKeepLR;
 +    int  i,j;
 +
 +    /* First reset all energy components, except for the long range terms
 +     * on the master at non neighbor search steps, since the long range
 +     * terms have already been summed at the last neighbor search step.
 +     */
 +    bKeepLR = (fr->bTwinRange && !bNS);
 +    for(i=0; (i<egNR); i++) {
 +        if (!(bKeepLR && bMaster && (i == egCOULLR || i == egLJLR))) {
 +            for(j=0; (j<enerd->grpp.nener); j++)
 +                enerd->grpp.ener[i][j] = 0.0;
 +        }
 +    }
 +    for (i=0;i<efptNR;i++)
 +    {
 +        enerd->dvdl_lin[i]    = 0.0;
 +        enerd->dvdl_nonlin[i] = 0.0;
 +    }
 +
 +    /* Normal potential energy components */
 +    for(i=0; (i<=F_EPOT); i++) {
 +        enerd->term[i] = 0.0;
 +    }
 +    /* Initialize the dVdlambda term with the long range contribution */
 +    /* Initialize the dvdl term with the long range contribution */
 +    enerd->term[F_DVDL]            = 0.0;
 +    enerd->term[F_DVDL_COUL]       = 0.0;
 +    enerd->term[F_DVDL_VDW]        = 0.0;
 +    enerd->term[F_DVDL_BONDED]     = 0.0;
 +    enerd->term[F_DVDL_RESTRAINT]  = 0.0;
 +    enerd->term[F_DKDL]            = 0.0;
 +    if (enerd->n_lambda > 0)
 +    {
 +        for(i=0; i<enerd->n_lambda; i++)
 +        {
 +            enerd->enerpart_lambda[i] = 0.0;
 +        }
 +    }
 +}
index 83f1d6b6d5374f81cd8116c4524d84c2dcf0c461,0000000000000000000000000000000000000000..2dfe5f9ea4cf39ecec6c181f6dbdfc126f2faeb6
mode 100644,000000..100644
--- /dev/null
@@@ -1,2544 -1,0 +1,2545 @@@
-     gmx_nbat_alloc_t *nb_alloc;
-     gmx_nbat_free_t  *nb_free;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include <assert.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "vec.h"
 +#include "maths.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "physics.h"
 +#include "force.h"
 +#include "tables.h"
 +#include "nonbonded.h"
 +#include "invblock.h"
 +#include "names.h"
 +#include "network.h"
 +#include "pbc.h"
 +#include "ns.h"
 +#include "mshift.h"
 +#include "txtdump.h"
 +#include "coulomb.h"
 +#include "md_support.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "qmmm.h"
 +#include "copyrite.h"
 +#include "mtop_util.h"
 +#include "nbnxn_search.h"
++#include "nbnxn_atomdata.h"
 +#include "nbnxn_consts.h"
 +#include "statutil.h"
 +#include "gmx_omp_nthreads.h"
 +
 +#ifdef _MSC_VER
 +/* MSVC definition for __cpuid() */
 +#include <intrin.h>
 +#endif
 +
 +#include "types/nbnxn_cuda_types_ext.h"
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "pmalloc_cuda.h"
 +
 +t_forcerec *mk_forcerec(void)
 +{
 +  t_forcerec *fr;
 +  
 +  snew(fr,1);
 +  
 +  return fr;
 +}
 +
 +#ifdef DEBUG
 +static void pr_nbfp(FILE *fp,real *nbfp,gmx_bool bBHAM,int atnr)
 +{
 +  int i,j;
 +  
 +  for(i=0; (i<atnr); i++) {
 +    for(j=0; (j<atnr); j++) {
 +      fprintf(fp,"%2d - %2d",i,j);
 +      if (bBHAM)
 +      fprintf(fp,"  a=%10g, b=%10g, c=%10g\n",BHAMA(nbfp,atnr,i,j),
 +              BHAMB(nbfp,atnr,i,j),BHAMC(nbfp,atnr,i,j));
 +      else
 +      fprintf(fp,"  c6=%10g, c12=%10g\n",C6(nbfp,atnr,i,j),
 +              C12(nbfp,atnr,i,j));
 +    }
 +  }
 +}
 +#endif
 +
 +static real *mk_nbfp(const gmx_ffparams_t *idef,gmx_bool bBHAM)
 +{
 +  real *nbfp;
 +  int  i,j,k,atnr;
 +  
 +  atnr=idef->atnr;
 +  if (bBHAM) {
 +    snew(nbfp,3*atnr*atnr);
 +    for(i=k=0; (i<atnr); i++) {
 +      for(j=0; (j<atnr); j++,k++) {
 +      BHAMA(nbfp,atnr,i,j) = idef->iparams[k].bham.a;
 +      BHAMB(nbfp,atnr,i,j) = idef->iparams[k].bham.b;
 +      BHAMC(nbfp,atnr,i,j) = idef->iparams[k].bham.c;
 +      }
 +    }
 +  }
 +  else {
 +    snew(nbfp,2*atnr*atnr);
 +    for(i=k=0; (i<atnr); i++) {
 +      for(j=0; (j<atnr); j++,k++) {
 +      C6(nbfp,atnr,i,j)   = idef->iparams[k].lj.c6;
 +      C12(nbfp,atnr,i,j)  = idef->iparams[k].lj.c12;
 +      }
 +    }
 +  }
 +  return nbfp;
 +}
 +
 +/* This routine sets fr->solvent_opt to the most common solvent in the 
 + * system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in 
 + * the fr->solvent_type array with the correct type (or esolNO).
 + *
 + * Charge groups that fulfill the conditions but are not identical to the
 + * most common one will be marked as esolNO in the solvent_type array. 
 + *
 + * TIP3p is identical to SPC for these purposes, so we call it
 + * SPC in the arrays (Apologies to Bill Jorgensen ;-)
 + * 
 + * NOTE: QM particle should not
 + * become an optimized solvent. Not even if there is only one charge
 + * group in the Qm 
 + */
 +
 +typedef struct 
 +{
 +    int    model;          
 +    int    count;
 +    int    vdwtype[4];
 +    real   charge[4];
 +} solvent_parameters_t;
 +
 +static void
 +check_solvent_cg(const gmx_moltype_t   *molt,
 +                 int                   cg0,
 +                 int                   nmol,
 +                 const unsigned char   *qm_grpnr,
 +                 const t_grps          *qm_grps,
 +                 t_forcerec *          fr,
 +                 int                   *n_solvent_parameters,
 +                 solvent_parameters_t  **solvent_parameters_p,
 +                 int                   cginfo,
 +                 int                   *cg_sp)
 +{
 +    const t_blocka *  excl;
 +    t_atom            *atom;
 +    int               j,k;
 +    int               j0,j1,nj;
 +    gmx_bool              perturbed;
 +    gmx_bool              has_vdw[4];
 +    gmx_bool              match;
 +    real              tmp_charge[4];
 +    int               tmp_vdwtype[4];
 +    int               tjA;
 +    gmx_bool              qm;
 +    solvent_parameters_t *solvent_parameters;
 +
 +    /* We use a list with parameters for each solvent type. 
 +     * Every time we discover a new molecule that fulfills the basic 
 +     * conditions for a solvent we compare with the previous entries
 +     * in these lists. If the parameters are the same we just increment
 +     * the counter for that type, and otherwise we create a new type
 +     * based on the current molecule.
 +     *
 +     * Once we've finished going through all molecules we check which
 +     * solvent is most common, and mark all those molecules while we
 +     * clear the flag on all others.
 +     */   
 +
 +    solvent_parameters = *solvent_parameters_p;
 +
 +    /* Mark the cg first as non optimized */
 +    *cg_sp = -1;
 +    
 +    /* Check if this cg has no exclusions with atoms in other charge groups
 +     * and all atoms inside the charge group excluded.
 +     * We only have 3 or 4 atom solvent loops.
 +     */
 +    if (GET_CGINFO_EXCL_INTER(cginfo) ||
 +        !GET_CGINFO_EXCL_INTRA(cginfo))
 +    {
 +        return;
 +    }
 +
 +    /* Get the indices of the first atom in this charge group */
 +    j0     = molt->cgs.index[cg0];
 +    j1     = molt->cgs.index[cg0+1];
 +    
 +    /* Number of atoms in our molecule */
 +    nj     = j1 - j0;
 +
 +    if (debug) {
 +        fprintf(debug,
 +                "Moltype '%s': there are %d atoms in this charge group\n",
 +                *molt->name,nj);
 +    }
 +    
 +    /* Check if it could be an SPC (3 atoms) or TIP4p (4) water,
 +     * otherwise skip it.
 +     */
 +    if (nj<3 || nj>4)
 +    {
 +        return;
 +    }
 +    
 +    /* Check if we are doing QM on this group */
 +    qm = FALSE; 
 +    if (qm_grpnr != NULL)
 +    {
 +        for(j=j0 ; j<j1 && !qm; j++)
 +        {
 +            qm = (qm_grpnr[j] < qm_grps->nr - 1);
 +        }
 +    }
 +    /* Cannot use solvent optimization with QM */
 +    if (qm)
 +    {
 +        return;
 +    }
 +    
 +    atom = molt->atoms.atom;
 +
 +    /* Still looks like a solvent, time to check parameters */
 +    
 +    /* If it is perturbed (free energy) we can't use the solvent loops,
 +     * so then we just skip to the next molecule.
 +     */   
 +    perturbed = FALSE; 
 +    
 +    for(j=j0; j<j1 && !perturbed; j++)
 +    {
 +        perturbed = PERTURBED(atom[j]);
 +    }
 +    
 +    if (perturbed)
 +    {
 +        return;
 +    }
 +    
 +    /* Now it's only a question if the VdW and charge parameters 
 +     * are OK. Before doing the check we compare and see if they are 
 +     * identical to a possible previous solvent type.
 +     * First we assign the current types and charges.    
 +     */
 +    for(j=0; j<nj; j++)
 +    {
 +        tmp_vdwtype[j] = atom[j0+j].type;
 +        tmp_charge[j]  = atom[j0+j].q;
 +    } 
 +    
 +    /* Does it match any previous solvent type? */
 +    for(k=0 ; k<*n_solvent_parameters; k++)
 +    {
 +        match = TRUE;
 +        
 +        
 +        /* We can only match SPC with 3 atoms and TIP4p with 4 atoms */
 +        if( (solvent_parameters[k].model==esolSPC   && nj!=3)  ||
 +            (solvent_parameters[k].model==esolTIP4P && nj!=4) )
 +            match = FALSE;
 +        
 +        /* Check that types & charges match for all atoms in molecule */
 +        for(j=0 ; j<nj && match==TRUE; j++)
 +        {                     
 +            if (tmp_vdwtype[j] != solvent_parameters[k].vdwtype[j])
 +            {
 +                match = FALSE;
 +            }
 +            if(tmp_charge[j] != solvent_parameters[k].charge[j])
 +            {
 +                match = FALSE;
 +            }
 +        }
 +        if (match == TRUE)
 +        {
 +            /* Congratulations! We have a matched solvent.
 +             * Flag it with this type for later processing.
 +             */
 +            *cg_sp = k;
 +            solvent_parameters[k].count += nmol;
 +
 +            /* We are done with this charge group */
 +            return;
 +        }
 +    }
 +    
 +    /* If we get here, we have a tentative new solvent type.
 +     * Before we add it we must check that it fulfills the requirements
 +     * of the solvent optimized loops. First determine which atoms have
 +     * VdW interactions.   
 +     */
 +    for(j=0; j<nj; j++) 
 +    {
 +        has_vdw[j] = FALSE;
 +        tjA        = tmp_vdwtype[j];
 +        
 +        /* Go through all other types and see if any have non-zero
 +         * VdW parameters when combined with this one.
 +         */   
 +        for(k=0; k<fr->ntype && (has_vdw[j]==FALSE); k++)
 +        {
 +            /* We already checked that the atoms weren't perturbed,
 +             * so we only need to check state A now.
 +             */ 
 +            if (fr->bBHAM) 
 +            {
 +                has_vdw[j] = (has_vdw[j] || 
 +                              (BHAMA(fr->nbfp,fr->ntype,tjA,k) != 0.0) ||
 +                              (BHAMB(fr->nbfp,fr->ntype,tjA,k) != 0.0) ||
 +                              (BHAMC(fr->nbfp,fr->ntype,tjA,k) != 0.0));
 +            }
 +            else
 +            {
 +                /* Standard LJ */
 +                has_vdw[j] = (has_vdw[j] || 
 +                              (C6(fr->nbfp,fr->ntype,tjA,k)  != 0.0) ||
 +                              (C12(fr->nbfp,fr->ntype,tjA,k) != 0.0));
 +            }
 +        }
 +    }
 +    
 +    /* Now we know all we need to make the final check and assignment. */
 +    if (nj == 3)
 +    {
 +        /* So, is it an SPC?
 +         * For this we require that all atoms have charge, 
 +         * the charges on atom 2 & 3 should be the same, and only
 +         * atom 1 should have VdW.
 +         */
 +        if (has_vdw[0] == TRUE && 
 +            has_vdw[1] == FALSE &&
 +            has_vdw[2] == FALSE &&
 +            tmp_charge[0]  != 0 &&
 +            tmp_charge[1]  != 0 &&
 +            tmp_charge[2]  == tmp_charge[1])
 +        {
 +            srenew(solvent_parameters,*n_solvent_parameters+1);
 +            solvent_parameters[*n_solvent_parameters].model = esolSPC;
 +            solvent_parameters[*n_solvent_parameters].count = nmol;
 +            for(k=0;k<3;k++)
 +            {
 +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 +            }
 +
 +            *cg_sp = *n_solvent_parameters;
 +            (*n_solvent_parameters)++;
 +        }
 +    }
 +    else if (nj==4)
 +    {
 +        /* Or could it be a TIP4P?
 +         * For this we require that atoms 2,3,4 have charge, but not atom 1. 
 +         * Only atom 1 should have VdW.
 +         */
 +        if(has_vdw[0] == TRUE && 
 +           has_vdw[1] == FALSE &&
 +           has_vdw[2] == FALSE &&
 +           has_vdw[3] == FALSE &&
 +           tmp_charge[0]  == 0 &&
 +           tmp_charge[1]  != 0 &&
 +           tmp_charge[2]  == tmp_charge[1] &&
 +           tmp_charge[3]  != 0)
 +        {
 +            srenew(solvent_parameters,*n_solvent_parameters+1);
 +            solvent_parameters[*n_solvent_parameters].model = esolTIP4P;
 +            solvent_parameters[*n_solvent_parameters].count = nmol;
 +            for(k=0;k<4;k++)
 +            {
 +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 +            }
 +            
 +            *cg_sp = *n_solvent_parameters;
 +            (*n_solvent_parameters)++;
 +        }
 +    }
 +
 +    *solvent_parameters_p = solvent_parameters;
 +}
 +
 +static void
 +check_solvent(FILE *                fp,
 +              const gmx_mtop_t *    mtop,
 +              t_forcerec *          fr,
 +              cginfo_mb_t           *cginfo_mb)
 +{
 +    const t_block *   cgs;
 +    const t_block *   mols;
 +    const gmx_moltype_t *molt;
 +    int               mb,mol,cg_mol,at_offset,cg_offset,am,cgm,i,nmol_ch,nmol;
 +    int               n_solvent_parameters;
 +    solvent_parameters_t *solvent_parameters;
 +    int               **cg_sp;
 +    int               bestsp,bestsol;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Going to determine what solvent types we have.\n");
 +    }
 +
 +    mols = &mtop->mols;
 +
 +    n_solvent_parameters = 0;
 +    solvent_parameters = NULL;
 +    /* Allocate temporary array for solvent type */
 +    snew(cg_sp,mtop->nmolblock);
 +
 +    cg_offset = 0;
 +    at_offset = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molt = &mtop->moltype[mtop->molblock[mb].type];
 +        cgs  = &molt->cgs;
 +        /* Here we have to loop over all individual molecules
 +         * because we need to check for QMMM particles.
 +         */
 +        snew(cg_sp[mb],cginfo_mb[mb].cg_mod);
 +        nmol_ch = cginfo_mb[mb].cg_mod/cgs->nr;
 +        nmol    = mtop->molblock[mb].nmol/nmol_ch;
 +        for(mol=0; mol<nmol_ch; mol++)
 +        {
 +            cgm = mol*cgs->nr;
 +            am  = mol*cgs->index[cgs->nr];
 +            for(cg_mol=0; cg_mol<cgs->nr; cg_mol++)
 +            {
 +                check_solvent_cg(molt,cg_mol,nmol,
 +                                 mtop->groups.grpnr[egcQMMM] ?
 +                                 mtop->groups.grpnr[egcQMMM]+at_offset+am : 0,
 +                                 &mtop->groups.grps[egcQMMM],
 +                                 fr,
 +                                 &n_solvent_parameters,&solvent_parameters,
 +                                 cginfo_mb[mb].cginfo[cgm+cg_mol],
 +                                 &cg_sp[mb][cgm+cg_mol]);
 +            }
 +        }
 +        cg_offset += cgs->nr;
 +        at_offset += cgs->index[cgs->nr];
 +    }
 +
 +    /* Puh! We finished going through all charge groups.
 +     * Now find the most common solvent model.
 +     */   
 +    
 +    /* Most common solvent this far */
 +    bestsp = -2;
 +    for(i=0;i<n_solvent_parameters;i++)
 +    {
 +        if (bestsp == -2 ||
 +            solvent_parameters[i].count > solvent_parameters[bestsp].count)
 +        {
 +            bestsp = i;
 +        }
 +    }
 +    
 +    if (bestsp >= 0)
 +    {
 +        bestsol = solvent_parameters[bestsp].model;
 +    }
 +    else
 +    {
 +        bestsol = esolNO;
 +    }
 +    
 +#ifdef DISABLE_WATER_NLIST
 +      bestsol = esolNO;
 +#endif
 +
 +    fr->nWatMol = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        cgs = &mtop->moltype[mtop->molblock[mb].type].cgs;
 +        nmol = (mtop->molblock[mb].nmol*cgs->nr)/cginfo_mb[mb].cg_mod;
 +        for(i=0; i<cginfo_mb[mb].cg_mod; i++)
 +        {
 +            if (cg_sp[mb][i] == bestsp)
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i],bestsol);
 +                fr->nWatMol += nmol;
 +            }
 +            else
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i],esolNO);
 +            }
 +        }
 +        sfree(cg_sp[mb]);
 +    }
 +    sfree(cg_sp);
 +    
 +    if (bestsol != esolNO && fp!=NULL)
 +    {
 +        fprintf(fp,"\nEnabling %s-like water optimization for %d molecules.\n\n",
 +                esol_names[bestsol],
 +                solvent_parameters[bestsp].count);
 +    }
 +
 +    sfree(solvent_parameters);
 +    fr->solvent_opt = bestsol;
 +}
 +
 +enum { acNONE=0, acCONSTRAINT, acSETTLE };
 +
 +static cginfo_mb_t *init_cginfo_mb(FILE *fplog,const gmx_mtop_t *mtop,
 +                                   t_forcerec *fr,gmx_bool bNoSolvOpt,
 +                                   gmx_bool *bExcl_IntraCGAll_InterCGNone)
 +{
 +    const t_block *cgs;
 +    const t_blocka *excl;
 +    const gmx_moltype_t *molt;
 +    const gmx_molblock_t *molb;
 +    cginfo_mb_t *cginfo_mb;
 +    gmx_bool *type_VDW;
 +    int  *cginfo;
 +    int  cg_offset,a_offset,cgm,am;
 +    int  mb,m,ncg_tot,cg,a0,a1,gid,ai,j,aj,excl_nalloc;
 +    int  *a_con;
 +    int  ftype;
 +    int  ia;
 +    gmx_bool bId,*bExcl,bExclIntraAll,bExclInter,bHaveVDW,bHaveQ;
 +
 +    ncg_tot = ncg_mtop(mtop);
 +    snew(cginfo_mb,mtop->nmolblock);
 +
 +    snew(type_VDW,fr->ntype);
 +    for(ai=0; ai<fr->ntype; ai++)
 +    {
 +        type_VDW[ai] = FALSE;
 +        for(j=0; j<fr->ntype; j++)
 +        {
 +            type_VDW[ai] = type_VDW[ai] ||
 +                fr->bBHAM ||
 +                C6(fr->nbfp,fr->ntype,ai,j) != 0 ||
 +                C12(fr->nbfp,fr->ntype,ai,j) != 0;
 +        }
 +    }
 +
 +    *bExcl_IntraCGAll_InterCGNone = TRUE;
 +
 +    excl_nalloc = 10;
 +    snew(bExcl,excl_nalloc);
 +    cg_offset = 0;
 +    a_offset  = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +        cgs  = &molt->cgs;
 +        excl = &molt->excls;
 +
 +        /* Check if the cginfo is identical for all molecules in this block.
 +         * If so, we only need an array of the size of one molecule.
 +         * Otherwise we make an array of #mol times #cgs per molecule.
 +         */
 +        bId = TRUE;
 +        am = 0;
 +        for(m=0; m<molb->nmol; m++)
 +        {
 +            am = m*cgs->index[cgs->nr];
 +            for(cg=0; cg<cgs->nr; cg++)
 +            {
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                if (ggrpnr(&mtop->groups,egcENER,a_offset+am+a0) !=
 +                    ggrpnr(&mtop->groups,egcENER,a_offset   +a0))
 +                {
 +                    bId = FALSE;
 +                }
 +                if (mtop->groups.grpnr[egcQMMM] != NULL)
 +                {
 +                    for(ai=a0; ai<a1; ai++)
 +                    {
 +                        if (mtop->groups.grpnr[egcQMMM][a_offset+am+ai] !=
 +                            mtop->groups.grpnr[egcQMMM][a_offset   +ai])
 +                        {
 +                            bId = FALSE;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        cginfo_mb[mb].cg_start = cg_offset;
 +        cginfo_mb[mb].cg_end   = cg_offset + molb->nmol*cgs->nr;
 +        cginfo_mb[mb].cg_mod   = (bId ? 1 : molb->nmol)*cgs->nr;
 +        snew(cginfo_mb[mb].cginfo,cginfo_mb[mb].cg_mod);
 +        cginfo = cginfo_mb[mb].cginfo;
 +
 +        /* Set constraints flags for constrained atoms */
 +        snew(a_con,molt->atoms.nr);
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if (interaction_function[ftype].flags & IF_CONSTRAINT)
 +            {
 +                int nral;
 +
 +                nral = NRAL(ftype);
 +                for(ia=0; ia<molt->ilist[ftype].nr; ia+=1+nral)
 +                {
 +                    int a;
 +
 +                    for(a=0; a<nral; a++)
 +                    {
 +                        a_con[molt->ilist[ftype].iatoms[ia+1+a]] =
 +                            (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT);
 +                    }
 +                }
 +            }
 +        }
 +
 +        for(m=0; m<(bId ? 1 : molb->nmol); m++)
 +        {
 +            cgm = m*cgs->nr;
 +            am  = m*cgs->index[cgs->nr];
 +            for(cg=0; cg<cgs->nr; cg++)
 +            {
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +
 +                /* Store the energy group in cginfo */
 +                gid = ggrpnr(&mtop->groups,egcENER,a_offset+am+a0);
 +                SET_CGINFO_GID(cginfo[cgm+cg],gid);
 +                
 +                /* Check the intra/inter charge group exclusions */
 +                if (a1-a0 > excl_nalloc) {
 +                    excl_nalloc = a1 - a0;
 +                    srenew(bExcl,excl_nalloc);
 +                }
 +                /* bExclIntraAll: all intra cg interactions excluded
 +                 * bExclInter:    any inter cg interactions excluded
 +                 */
 +                bExclIntraAll = TRUE;
 +                bExclInter    = FALSE;
 +                bHaveVDW      = FALSE;
 +                bHaveQ        = FALSE;
 +                for(ai=a0; ai<a1; ai++)
 +                {
 +                    /* Check VDW and electrostatic interactions */
 +                    bHaveVDW = bHaveVDW || (type_VDW[molt->atoms.atom[ai].type] ||
 +                                            type_VDW[molt->atoms.atom[ai].typeB]);
 +                    bHaveQ  = bHaveQ    || (molt->atoms.atom[ai].q != 0 ||
 +                                            molt->atoms.atom[ai].qB != 0);
 +
 +                    /* Clear the exclusion list for atom ai */
 +                    for(aj=a0; aj<a1; aj++)
 +                    {
 +                        bExcl[aj-a0] = FALSE;
 +                    }
 +                    /* Loop over all the exclusions of atom ai */
 +                    for(j=excl->index[ai]; j<excl->index[ai+1]; j++)
 +                    {
 +                        aj = excl->a[j];
 +                        if (aj < a0 || aj >= a1)
 +                        {
 +                            bExclInter = TRUE;
 +                        }
 +                        else
 +                        {
 +                            bExcl[aj-a0] = TRUE;
 +                        }
 +                    }
 +                    /* Check if ai excludes a0 to a1 */
 +                    for(aj=a0; aj<a1; aj++)
 +                    {
 +                        if (!bExcl[aj-a0])
 +                        {
 +                            bExclIntraAll = FALSE;
 +                        }
 +                    }
 +
 +                    switch (a_con[ai])
 +                    {
 +                    case acCONSTRAINT:
 +                        SET_CGINFO_CONSTR(cginfo[cgm+cg]);
 +                        break;
 +                    case acSETTLE:
 +                        SET_CGINFO_SETTLE(cginfo[cgm+cg]);
 +                        break;
 +                    default:
 +                        break;
 +                    }
 +                }
 +                if (bExclIntraAll)
 +                {
 +                    SET_CGINFO_EXCL_INTRA(cginfo[cgm+cg]);
 +                }
 +                if (bExclInter)
 +                {
 +                    SET_CGINFO_EXCL_INTER(cginfo[cgm+cg]);
 +                }
 +                if (a1 - a0 > MAX_CHARGEGROUP_SIZE)
 +                {
 +                    /* The size in cginfo is currently only read with DD */
 +                    gmx_fatal(FARGS,"A charge group has size %d which is larger than the limit of %d atoms",a1-a0,MAX_CHARGEGROUP_SIZE);
 +                }
 +                if (bHaveVDW)
 +                {
 +                    SET_CGINFO_HAS_VDW(cginfo[cgm+cg]);
 +                }
 +                if (bHaveQ)
 +                {
 +                    SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
 +                }
 +                /* Store the charge group size */
 +                SET_CGINFO_NATOMS(cginfo[cgm+cg],a1-a0);
 +
 +                if (!bExclIntraAll || bExclInter)
 +                {
 +                    *bExcl_IntraCGAll_InterCGNone = FALSE;
 +                }
 +            }
 +        }
 +
 +        sfree(a_con);
 +
 +        cg_offset += molb->nmol*cgs->nr;
 +        a_offset  += molb->nmol*cgs->index[cgs->nr];
 +    }
 +    sfree(bExcl);
 +    
 +    /* the solvent optimizer is called after the QM is initialized,
 +     * because we don't want to have the QM subsystemto become an
 +     * optimized solvent
 +     */
 +
 +    check_solvent(fplog,mtop,fr,cginfo_mb);
 +    
 +    if (getenv("GMX_NO_SOLV_OPT"))
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Found environment variable GMX_NO_SOLV_OPT.\n"
 +                    "Disabling all solvent optimization\n");
 +        }
 +        fr->solvent_opt = esolNO;
 +    }
 +    if (bNoSolvOpt)
 +    {
 +        fr->solvent_opt = esolNO;
 +    }
 +    if (!fr->solvent_opt)
 +    {
 +        for(mb=0; mb<mtop->nmolblock; mb++)
 +        {
 +            for(cg=0; cg<cginfo_mb[mb].cg_mod; cg++)
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[cg],esolNO);
 +            }
 +        }
 +    }
 +    
 +    return cginfo_mb;
 +}
 +
 +static int *cginfo_expand(int nmb,cginfo_mb_t *cgi_mb)
 +{
 +    int ncg,mb,cg;
 +    int *cginfo;
 +
 +    ncg = cgi_mb[nmb-1].cg_end;
 +    snew(cginfo,ncg);
 +    mb = 0;
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        while (cg >= cgi_mb[mb].cg_end)
 +        {
 +            mb++;
 +        }
 +        cginfo[cg] =
 +            cgi_mb[mb].cginfo[(cg - cgi_mb[mb].cg_start) % cgi_mb[mb].cg_mod];
 +    }
 +
 +    return cginfo;
 +}
 +
 +static void set_chargesum(FILE *log,t_forcerec *fr,const gmx_mtop_t *mtop)
 +{
 +    double qsum,q2sum,q;
 +    int    mb,nmol,i;
 +    const t_atoms *atoms;
 +    
 +    qsum  = 0;
 +    q2sum = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        nmol  = mtop->molblock[mb].nmol;
 +        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +        for(i=0; i<atoms->nr; i++)
 +        {
 +            q = atoms->atom[i].q;
 +            qsum  += nmol*q;
 +            q2sum += nmol*q*q;
 +        }
 +    }
 +    fr->qsum[0]  = qsum;
 +    fr->q2sum[0] = q2sum;
 +    if (fr->efep != efepNO)
 +    {
 +        qsum  = 0;
 +        q2sum = 0;
 +        for(mb=0; mb<mtop->nmolblock; mb++)
 +        {
 +            nmol  = mtop->molblock[mb].nmol;
 +            atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +            for(i=0; i<atoms->nr; i++)
 +            {
 +                q = atoms->atom[i].qB;
 +                qsum  += nmol*q;
 +                q2sum += nmol*q*q;
 +            }
 +            fr->qsum[1]  = qsum;
 +            fr->q2sum[1] = q2sum;
 +        }
 +    }
 +    else
 +    {
 +        fr->qsum[1]  = fr->qsum[0];
 +        fr->q2sum[1] = fr->q2sum[0];
 +    }
 +    if (log) {
 +        if (fr->efep == efepNO)
 +            fprintf(log,"System total charge: %.3f\n",fr->qsum[0]);
 +        else
 +            fprintf(log,"System total charge, top. A: %.3f top. B: %.3f\n",
 +                    fr->qsum[0],fr->qsum[1]);
 +    }
 +}
 +
 +void update_forcerec(FILE *log,t_forcerec *fr,matrix box)
 +{
 +    if (fr->eeltype == eelGRF)
 +    {
 +        calc_rffac(NULL,fr->eeltype,fr->epsilon_r,fr->epsilon_rf,
 +                   fr->rcoulomb,fr->temp,fr->zsquare,box,
 +                   &fr->kappa,&fr->k_rf,&fr->c_rf);
 +    }
 +}
 +
 +void set_avcsixtwelve(FILE *fplog,t_forcerec *fr,const gmx_mtop_t *mtop)
 +{
 +    const t_atoms *atoms,*atoms_tpi;
 +    const t_blocka *excl;
 +    int    mb,nmol,nmolc,i,j,tpi,tpj,j1,j2,k,n,nexcl,q;
 +#if (defined SIZEOF_LONG_LONG_INT) && (SIZEOF_LONG_LONG_INT >= 8)    
 +    long long int  npair,npair_ij,tmpi,tmpj;
 +#else
 +    double npair, npair_ij,tmpi,tmpj;
 +#endif
 +    double csix,ctwelve;
 +    int    ntp,*typecount;
 +    gmx_bool   bBHAM;
 +    real   *nbfp;
 +
 +    ntp = fr->ntype;
 +    bBHAM = fr->bBHAM;
 +    nbfp = fr->nbfp;
 +    
 +    for(q=0; q<(fr->efep==efepNO ? 1 : 2); q++) {
 +        csix = 0;
 +        ctwelve = 0;
 +        npair = 0;
 +        nexcl = 0;
 +        if (!fr->n_tpi) {
 +            /* Count the types so we avoid natoms^2 operations */
 +            snew(typecount,ntp);
 +            for(mb=0; mb<mtop->nmolblock; mb++) {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                for(i=0; i<atoms->nr; i++) {
 +                    if (q == 0)
 +                    {
 +                        tpi = atoms->atom[i].type;
 +                    }
 +                    else
 +                    {
 +                        tpi = atoms->atom[i].typeB;
 +                    }
 +                    typecount[tpi] += nmol;
 +                }
 +            }
 +            for(tpi=0; tpi<ntp; tpi++) {
 +                for(tpj=tpi; tpj<ntp; tpj++) {
 +                    tmpi = typecount[tpi];
 +                    tmpj = typecount[tpj];
 +                    if (tpi != tpj)
 +                    {
 +                        npair_ij = tmpi*tmpj;
 +                    }
 +                    else
 +                    {
 +                        npair_ij = tmpi*(tmpi - 1)/2;
 +                    }
 +                    if (bBHAM) {
 +                        csix    += npair_ij*BHAMC(nbfp,ntp,tpi,tpj);
 +                    } else {
 +                        csix    += npair_ij*   C6(nbfp,ntp,tpi,tpj);
 +                        ctwelve += npair_ij*  C12(nbfp,ntp,tpi,tpj);
 +                    }
 +                    npair += npair_ij;
 +                }
 +            }
 +            sfree(typecount);
 +            /* Subtract the excluded pairs.
 +             * The main reason for substracting exclusions is that in some cases
 +             * some combinations might never occur and the parameters could have
 +             * any value. These unused values should not influence the dispersion
 +             * correction.
 +             */
 +            for(mb=0; mb<mtop->nmolblock; mb++) {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                excl  = &mtop->moltype[mtop->molblock[mb].type].excls;
 +                for(i=0; (i<atoms->nr); i++) {
 +                    if (q == 0)
 +                    {
 +                        tpi = atoms->atom[i].type;
 +                    }
 +                    else
 +                    {
 +                        tpi = atoms->atom[i].typeB;
 +                    }
 +                    j1  = excl->index[i];
 +                    j2  = excl->index[i+1];
 +                    for(j=j1; j<j2; j++) {
 +                        k = excl->a[j];
 +                        if (k > i)
 +                        {
 +                            if (q == 0)
 +                            {
 +                                tpj = atoms->atom[k].type;
 +                            }
 +                            else
 +                            {
 +                                tpj = atoms->atom[k].typeB;
 +                            }
 +                            if (bBHAM) {
 +                               csix -= nmol*BHAMC(nbfp,ntp,tpi,tpj);
 +                            } else {
 +                                csix    -= nmol*C6 (nbfp,ntp,tpi,tpj);
 +                                ctwelve -= nmol*C12(nbfp,ntp,tpi,tpj);
 +                            }
 +                            nexcl += nmol;
 +                        }
 +                    }
 +                }
 +            }
 +        } else {
 +            /* Only correct for the interaction of the test particle
 +             * with the rest of the system.
 +             */
 +            atoms_tpi =
 +                &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].atoms;
 +
 +            npair = 0;
 +            for(mb=0; mb<mtop->nmolblock; mb++) {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                for(j=0; j<atoms->nr; j++) {
 +                    nmolc = nmol;
 +                    /* Remove the interaction of the test charge group
 +                     * with itself.
 +                     */
 +                    if (mb == mtop->nmolblock-1)
 +                    {
 +                        nmolc--;
 +                        
 +                        if (mb == 0 && nmol == 1)
 +                        {
 +                            gmx_fatal(FARGS,"Old format tpr with TPI, please generate a new tpr file");
 +                        }
 +                    }
 +                    if (q == 0)
 +                    {
 +                        tpj = atoms->atom[j].type;
 +                    }
 +                    else
 +                    {
 +                        tpj = atoms->atom[j].typeB;
 +                    }
 +                    for(i=0; i<fr->n_tpi; i++)
 +                    {
 +                        if (q == 0)
 +                        {
 +                            tpi = atoms_tpi->atom[i].type;
 +                        }
 +                        else
 +                        {
 +                            tpi = atoms_tpi->atom[i].typeB;
 +                        }
 +                        if (bBHAM)
 +                        {
 +                            csix    += nmolc*BHAMC(nbfp,ntp,tpi,tpj);
 +                        }
 +                        else
 +                        {
 +                            csix    += nmolc*C6 (nbfp,ntp,tpi,tpj);
 +                            ctwelve += nmolc*C12(nbfp,ntp,tpi,tpj);
 +                        }
 +                        npair += nmolc;
 +                    }
 +                }
 +            }
 +        }
 +        if (npair - nexcl <= 0 && fplog) {
 +            fprintf(fplog,"\nWARNING: There are no atom pairs for dispersion correction\n\n");
 +            csix     = 0;
 +            ctwelve  = 0;
 +        } else {
 +            csix    /= npair - nexcl;
 +            ctwelve /= npair - nexcl;
 +        }
 +        if (debug) {
 +            fprintf(debug,"Counted %d exclusions\n",nexcl);
 +            fprintf(debug,"Average C6 parameter is: %10g\n",(double)csix);
 +            fprintf(debug,"Average C12 parameter is: %10g\n",(double)ctwelve);
 +        }
 +        fr->avcsix[q]    = csix;
 +        fr->avctwelve[q] = ctwelve;
 +    }
 +    if (fplog != NULL)
 +    {
 +        if (fr->eDispCorr == edispcAllEner ||
 +            fr->eDispCorr == edispcAllEnerPres)
 +        {
 +            fprintf(fplog,"Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
 +                    fr->avcsix[0],fr->avctwelve[0]);
 +        }
 +        else
 +        {
 +            fprintf(fplog,"Long Range LJ corr.: <C6> %10.4e\n",fr->avcsix[0]);
 +        }
 +    }
 +}
 +
 +
 +static void set_bham_b_max(FILE *fplog,t_forcerec *fr,
 +                           const gmx_mtop_t *mtop)
 +{
 +    const t_atoms *at1,*at2;
 +    int  mt1,mt2,i,j,tpi,tpj,ntypes;
 +    real b,bmin;
 +    real *nbfp;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Determining largest Buckingham b parameter for table\n");
 +    }
 +    nbfp   = fr->nbfp;
 +    ntypes = fr->ntype;
 +    
 +    bmin           = -1;
 +    fr->bham_b_max = 0;
 +    for(mt1=0; mt1<mtop->nmoltype; mt1++)
 +    {
 +        at1 = &mtop->moltype[mt1].atoms;
 +        for(i=0; (i<at1->nr); i++)
 +        {
 +            tpi = at1->atom[i].type;
 +            if (tpi >= ntypes)
 +                gmx_fatal(FARGS,"Atomtype[%d] = %d, maximum = %d",i,tpi,ntypes);
 +            
 +            for(mt2=mt1; mt2<mtop->nmoltype; mt2++)
 +            {
 +                at2 = &mtop->moltype[mt2].atoms;
 +                for(j=0; (j<at2->nr); j++) {
 +                    tpj = at2->atom[j].type;
 +                    if (tpj >= ntypes)
 +                    {
 +                        gmx_fatal(FARGS,"Atomtype[%d] = %d, maximum = %d",j,tpj,ntypes);
 +                    }
 +                    b = BHAMB(nbfp,ntypes,tpi,tpj);
 +                    if (b > fr->bham_b_max)
 +                    {
 +                        fr->bham_b_max = b;
 +                    }
 +                    if ((b < bmin) || (bmin==-1))
 +                    {
 +                        bmin = b;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Buckingham b parameters, min: %g, max: %g\n",
 +                bmin,fr->bham_b_max);
 +    }
 +}
 +
 +static void make_nbf_tables(FILE *fp,const output_env_t oenv,
 +                            t_forcerec *fr,real rtab,
 +                          const t_commrec *cr,
 +                          const char *tabfn,char *eg1,char *eg2,
 +                          t_nblists *nbl)
 +{
 +  char buf[STRLEN];
 +  int i,j;
 +
 +  if (tabfn == NULL) {
 +    if (debug)
 +      fprintf(debug,"No table file name passed, can not read table, can not do non-bonded interactions\n");
 +    return;
 +  }
 +    
 +  sprintf(buf,"%s",tabfn);
 +  if (eg1 && eg2)
 +    /* Append the two energy group names */
 +    sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1,"_%s_%s.%s",
 +          eg1,eg2,ftp2ext(efXVG));
 +  nbl->tab = make_tables(fp,oenv,fr,MASTER(cr),buf,rtab,0);
 +  /* Copy the contents of the table to separate coulomb and LJ tables too,
 +   * to improve cache performance.
 +   */
 +
 +  /* For performance reasons we want
 +   * the table data to be aligned to 16-byte. The pointer could be freed
 +   * but currently isn't.
 +   */
 +  snew_aligned(nbl->vdwtab,8*(nbl->tab.n+1),16);
 +  snew_aligned(nbl->coultab,4*(nbl->tab.n+1),16);
 +  
 +  for(i=0; i<=nbl->tab.n; i++) {
 +    for(j=0; j<4; j++)
 +      nbl->coultab[4*i+j] = nbl->tab.tab[12*i+j];
 +    for(j=0; j<8; j++)
 +      nbl->vdwtab [8*i+j] = nbl->tab.tab[12*i+4+j];
 +  }
 +}
 +
 +static void count_tables(int ftype1,int ftype2,const gmx_mtop_t *mtop,
 +                         int *ncount,int **count)
 +{
 +    const gmx_moltype_t *molt;
 +    const t_ilist *il;
 +    int mt,ftype,stride,i,j,tabnr;
 +    
 +    for(mt=0; mt<mtop->nmoltype; mt++)
 +    {
 +        molt = &mtop->moltype[mt];
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if (ftype == ftype1 || ftype == ftype2) {
 +                il = &molt->ilist[ftype];
 +                stride = 1 + NRAL(ftype);
 +                for(i=0; i<il->nr; i+=stride) {
 +                    tabnr = mtop->ffparams.iparams[il->iatoms[i]].tab.table;
 +                    if (tabnr < 0)
 +                        gmx_fatal(FARGS,"A bonded table number is smaller than 0: %d\n",tabnr);
 +                    if (tabnr >= *ncount) {
 +                        srenew(*count,tabnr+1);
 +                        for(j=*ncount; j<tabnr+1; j++)
 +                            (*count)[j] = 0;
 +                        *ncount = tabnr+1;
 +                    }
 +                    (*count)[tabnr]++;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +static bondedtable_t *make_bonded_tables(FILE *fplog,
 +                                         int ftype1,int ftype2,
 +                                         const gmx_mtop_t *mtop,
 +                                         const char *basefn,const char *tabext)
 +{
 +    int  i,ncount,*count;
 +    char tabfn[STRLEN];
 +    bondedtable_t *tab;
 +    
 +    tab = NULL;
 +    
 +    ncount = 0;
 +    count = NULL;
 +    count_tables(ftype1,ftype2,mtop,&ncount,&count);
 +    
 +    if (ncount > 0) {
 +        snew(tab,ncount);
 +        for(i=0; i<ncount; i++) {
 +            if (count[i] > 0) {
 +                sprintf(tabfn,"%s",basefn);
 +                sprintf(tabfn + strlen(basefn) - strlen(ftp2ext(efXVG)) - 1,"_%s%d.%s",
 +                        tabext,i,ftp2ext(efXVG));
 +                tab[i] = make_bonded_table(fplog,tabfn,NRAL(ftype1)-2);
 +            }
 +        }
 +        sfree(count);
 +    }
 +  
 +    return tab;
 +}
 +
 +void forcerec_set_ranges(t_forcerec *fr,
 +                         int ncg_home,int ncg_force,
 +                         int natoms_force,
 +                         int natoms_force_constr,int natoms_f_novirsum)
 +{
 +    fr->cg0 = 0;
 +    fr->hcg = ncg_home;
 +
 +    /* fr->ncg_force is unused in the standard code,
 +     * but it can be useful for modified code dealing with charge groups.
 +     */
 +    fr->ncg_force           = ncg_force;
 +    fr->natoms_force        = natoms_force;
 +    fr->natoms_force_constr = natoms_force_constr;
 +
 +    if (fr->natoms_force_constr > fr->nalloc_force)
 +    {
 +        fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
 +
 +        if (fr->bTwinRange)
 +        {
 +            srenew(fr->f_twin,fr->nalloc_force);
 +        }
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        fr->f_novirsum_n = natoms_f_novirsum;
 +        if (fr->f_novirsum_n > fr->f_novirsum_nalloc)
 +        {
 +            fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
 +            srenew(fr->f_novirsum_alloc,fr->f_novirsum_nalloc);
 +        }
 +    }
 +    else
 +    {
 +        fr->f_novirsum_n = 0;
 +    }
 +}
 +
 +static real cutoff_inf(real cutoff)
 +{
 +    if (cutoff == 0)
 +    {
 +        cutoff = GMX_CUTOFF_INF;
 +    }
 +
 +    return cutoff;
 +}
 +
 +static void make_adress_tf_tables(FILE *fp,const output_env_t oenv,
 +                            t_forcerec *fr,const t_inputrec *ir,
 +                          const char *tabfn, const gmx_mtop_t *mtop,
 +                            matrix     box)
 +{
 +  char buf[STRLEN];
 +  int i,j;
 +
 +  if (tabfn == NULL) {
 +        gmx_fatal(FARGS,"No thermoforce table file given. Use -tabletf to specify a file\n");
 +    return;
 +  }
 +
 +  snew(fr->atf_tabs, ir->adress->n_tf_grps);
 +
 +  for (i=0; i<ir->adress->n_tf_grps; i++){
 +    j = ir->adress->tf_table_index[i]; /* get energy group index */
 +    sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1,"tf_%s.%s",
 +        *(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[j]]) ,ftp2ext(efXVG));
 +    printf("loading tf table for energygrp index %d from %s\n", ir->adress->tf_table_index[j], buf);
 +    fr->atf_tabs[i] = make_atf_table(fp,oenv,fr,buf, box);
 +  }
 +
 +}
 +
 +gmx_bool can_use_allvsall(const t_inputrec *ir, const gmx_mtop_t *mtop,
 +                      gmx_bool bPrintNote,t_commrec *cr,FILE *fp)
 +{
 +    gmx_bool bAllvsAll;
 +
 +    bAllvsAll =
 +        (
 +         ir->rlist==0            &&
 +         ir->rcoulomb==0         &&
 +         ir->rvdw==0             &&
 +         ir->ePBC==epbcNONE      &&
 +         ir->vdwtype==evdwCUT    &&
 +         ir->coulombtype==eelCUT &&
 +         ir->efep==efepNO        &&
 +         (ir->implicit_solvent == eisNO || 
 +          (ir->implicit_solvent==eisGBSA && (ir->gb_algorithm==egbSTILL || 
 +                                             ir->gb_algorithm==egbHCT   || 
 +                                             ir->gb_algorithm==egbOBC))) &&
 +         getenv("GMX_NO_ALLVSALL") == NULL
 +            );
 +    
 +    if (bAllvsAll && ir->opts.ngener > 1)
 +    {
 +        const char *note="NOTE: Can not use all-vs-all force loops, because there are multiple energy monitor groups; you might get significantly higher performance when using only a single energy monitor group.\n";
 +
 +        if (bPrintNote)
 +        {
 +            if (MASTER(cr))
 +            {
 +                fprintf(stderr,"\n%s\n",note);
 +            }
 +            if (fp != NULL)
 +            {
 +                fprintf(fp,"\n%s\n",note);
 +            }
 +        }
 +        bAllvsAll = FALSE;
 +    }
 +
 +    if(bAllvsAll && fp && MASTER(cr))
 +    {
 +        fprintf(fp,"\nUsing accelerated all-vs-all kernels.\n\n");
 +    }
 +    
 +    return bAllvsAll;
 +}
 +
 +
 +static void init_forcerec_f_threads(t_forcerec *fr,int grpp_nener)
 +{
 +    int t,i;
 +
 +    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
 +
 +    if (fr->nthreads > 1)
 +    {
 +        snew(fr->f_t,fr->nthreads);
 +        /* Thread 0 uses the global force and energy arrays */
 +        for(t=1; t<fr->nthreads; t++)
 +        {
 +            fr->f_t[t].f = NULL;
 +            fr->f_t[t].f_nalloc = 0;
 +            snew(fr->f_t[t].fshift,SHIFTS);
 +            /* snew(fr->f_t[t].ener,F_NRE); */
 +            fr->f_t[t].grpp.nener = grpp_nener;
 +            for(i=0; i<egNR; i++)
 +            {
 +                snew(fr->f_t[t].grpp.ener[i],grpp_nener);
 +            }
 +        }
 +    }
 +}
 +
 +
/* Pick the CPU flavor of the nbnxn non-bonded kernels.
 * Defaults to the plain C 4x4 kernel; in SSE2-enabled builds a 128- or
 * 256-bit x86 SIMD kernel is selected based on the detected CPU, with
 * GMX_NBNXN_AVX128 / GMX_NBNXN_AVX256 environment-variable overrides.
 * The result is written to *kernel_type; fp and cr are currently unused.
 */
static void pick_nbnxn_kernel_cpu(FILE *fp,
                                  const t_commrec *cr,
                                  const gmx_cpuid_t cpuid_info,
                                  int *kernel_type)
{
    *kernel_type = nbk4x4_PlainC;

#ifdef GMX_X86_SSE2
    {
        /* On Intel Sandy-Bridge AVX-256 kernels are always faster.
         * On AMD Bulldozer AVX-256 is much slower than AVX-128.
         */
        if(gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX) == 1 &&
           gmx_cpuid_vendor(cpuid_info) != GMX_CPUID_VENDOR_AMD)
        {
#ifdef GMX_X86_AVX_256
            *kernel_type = nbk4xN_X86_SIMD256;
#else
            /* CPU supports AVX, but this binary was built without AVX-256 */
            *kernel_type = nbk4xN_X86_SIMD128;
#endif
        }
        else
        {
            *kernel_type = nbk4xN_X86_SIMD128;
        }

        /* Environment-variable overrides (e.g. for benchmarking);
         * AVX256 takes precedence when both are set.
         */
        if (getenv("GMX_NBNXN_AVX128") != NULL)
        {
            *kernel_type = nbk4xN_X86_SIMD128;
        }
        if (getenv("GMX_NBNXN_AVX256") != NULL)
        {
#ifdef GMX_X86_AVX_256
            *kernel_type = nbk4xN_X86_SIMD256;
#else
            gmx_fatal(FARGS,"You requested AVX-256 nbnxn kernels, but GROMACS was built without AVX support");
#endif
        }
    }
#endif /* GMX_X86_SSE2 */
}
 +
 +
/* Human-readable names for the nbnxn kernel types, indexed by the same
 * enum as *kernel_type in pick_nbnxn_kernel (nbkNotSet, nbk4x4_PlainC,
 * the two x86 SIMD entries, nbk8x8x8_CUDA, nbk8x8x8_PlainC). The SIMD
 * entry names are resolved at compile time from the build's SIMD flags.
 * Note that _mm_... intrinsics can be converted to either SSE or AVX
 * depending on compiler flags.
 * For gcc we check for __AVX__
 * At least a check for icc should be added (if there is a macro)
 */
static const char *nbk_name[] =
  { "not set", "plain C 4x4",
#if !(defined GMX_X86_AVX_256 || defined GMX_X86_AVX128_FMA || defined __AVX__)
#ifndef GMX_X86_SSE4_1
#ifndef GMX_DOUBLE
    "SSE2 4x4",
#else
    "SSE2 4x2",
#endif
#else
#ifndef GMX_DOUBLE
    "SSE4.1 4x4",
#else
    "SSE4.1 4x2",
#endif
#endif
#else
#ifndef GMX_DOUBLE
    "AVX-128 4x4",
#else
    "AVX-128 4x2",
#endif
#endif
#ifndef GMX_DOUBLE
    "AVX-256 4x8",
#else
    "AVX-256 4x4",
#endif
    "CUDA 8x8x8", "plain C 8x8x8" };
 +
/* Select the nbnxn non-bonded kernel type and, when requested and
 * possible, initialize a GPU for it.
 *
 * Selection order: GPU emulation (GMX_EMULATE_GPU or GMX_NO_NONBONDED
 * set) forces the plain C 8x8x8 kernel; otherwise a usable GPU gives
 * the CUDA kernel; otherwise a CPU kernel is picked (SIMD-accelerated
 * when use_cpu_acceleration is set, plain C 4x4 otherwise).
 *
 * bUseGPU may be NULL when the caller does not want a GPU (e.g. hybrid
 * mode kernel selection). NOTE(review): when a GPU is considered,
 * *bUseGPU is set to the outcome, but when bUseGPU is non-NULL and
 * hwinfo->bCanUseGPU is FALSE it is left untouched — callers
 * presumably pre-initialize it; confirm at the call sites.
 */
static void pick_nbnxn_kernel(FILE *fp,
                              const t_commrec *cr,
                              const gmx_hw_info_t *hwinfo,
                              gmx_bool use_cpu_acceleration,
                              gmx_bool *bUseGPU,
                              int *kernel_type)
{
    gmx_bool bEmulateGPU, bGPU;
    char gpu_err_str[STRLEN];

    assert(kernel_type);

    *kernel_type = nbkNotSet;
    /* if bUseGPU == NULL we don't want a GPU (e.g. hybrid mode kernel selection) */
    bGPU = (bUseGPU != NULL) && hwinfo->bCanUseGPU;

    /* Run GPU emulation mode if GMX_EMULATE_GPU is defined or in case if nobonded
       calculations are turned off via GMX_NO_NONBONDED -- this is the simple way
       to turn off GPU/CUDA initializations as well.. */
    bEmulateGPU = ((getenv("GMX_EMULATE_GPU") != NULL) ||
                   (getenv("GMX_NO_NONBONDED") != NULL));

    if (bGPU)
    {
        if (bEmulateGPU)
        {
            /* Emulation overrides a real GPU: skip device initialization */
            bGPU = FALSE;
        }
        else
        {
            /* Each PP node will use the intra-node id-th device from the
             * list of detected/selected GPUs. */ 
            if (!init_gpu(cr->nodeid_group_intra, gpu_err_str, &hwinfo->gpu_info))
            {
                /* At this point the init should never fail as we made sure that 
                 * we have all the GPUs we need. If it still does, we'll bail. */
                gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
                          cr->nodeid,
                          get_gpu_device_id(&hwinfo->gpu_info, cr->nodeid_group_intra),
                          gpu_err_str);
            }
        }
        *bUseGPU = bGPU;
    }

    if (bEmulateGPU)
    {
        *kernel_type = nbk8x8x8_PlainC;

        md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
    }
    else if (bGPU)
    {
        *kernel_type = nbk8x8x8_CUDA;
    }

    /* No GPU and no emulation: fall back to a CPU kernel */
    if (*kernel_type == nbkNotSet)
    {
        if (use_cpu_acceleration)
        {
            pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,kernel_type);
        }
        else
        {
            *kernel_type = nbk4x4_PlainC;
        }
    }

    /* Report the choice on stderr (master only) and in the log file */
    if (fp != NULL)
    {
        if (MASTER(cr))
        {
            fprintf(stderr,"Using %s non-bonded kernels\n",
                    nbk_name[*kernel_type]);
        }
        fprintf(fp,"\nUsing %s non-bonded kernels\n\n",
                nbk_name[*kernel_type]);
    }
}
 +
 +
 +static void init_verlet_ewald_f_table(interaction_const_t *ic,
 +                                      int verlet_kernel_type)
 +{
 +    if (nbnxn_kernel_pairlist_simple(verlet_kernel_type))
 +    {
 +        /* With a spacing of 0.0005 we are at the force summation accuracy
 +         * for the SSE kernels for "normal" atomistic simulations.
 +         */
 +        ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff,
 +                                                   ic->rcoulomb);
 +        ic->tabq_size  = (int)(ic->rcoulomb*ic->tabq_scale) + 2;
 +#ifndef GMX_DOUBLE
 +        ic->tabq_format = tableformatFDV0;
 +#else
 +        ic->tabq_format = tableformatF;
 +#endif
 +    }
 +    else
 +    {
 +        ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
 +        /* Subtract 2 iso 1 to avoid access out of range due to rounding */
 +        ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
 +        if (verlet_kernel_type == nbk8x8x8_CUDA)
 +        {
 +            /* This case is handled in the nbnxn CUDA module */
 +            ic->tabq_format = tableformatNONE;
 +        }
 +        else
 +        {
 +            ic->tabq_format = tableformatF;
 +        }
 +    }
 +
 +    switch (ic->tabq_format)
 +    {
 +    case tableformatNONE:
 +        break;
 +    case tableformatF:
 +        sfree_aligned(ic->tabq_coul_F);
 +        sfree_aligned(ic->tabq_coul_V);
 +        snew_aligned(ic->tabq_coul_F,ic->tabq_size,16);
 +        snew_aligned(ic->tabq_coul_V,ic->tabq_size,16);
 +        table_spline3_fill_ewald_lr(ic->tabq_coul_F,ic->tabq_coul_V,
 +                                    ic->tabq_size,ic->tabq_format,
 +                                    1/ic->tabq_scale,ic->ewaldcoeff);
 +        break;
 +    case tableformatFDV0:
 +        sfree_aligned(ic->tabq_coul_F);
 +        snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,16);
 +        table_spline3_fill_ewald_lr(ic->tabq_coul_FDV0,NULL,
 +                                    ic->tabq_size,ic->tabq_format,
 +                                    1/ic->tabq_scale,ic->ewaldcoeff);
 +        break;
 +    default:
 +        gmx_incons("Unknown table format");
 +    }
 +}
 +
 +void init_interaction_const_tables(FILE *fp, 
 +                                   interaction_const_t *ic,
 +                                   int verlet_kernel_type)
 +{
 +    real spacing;
 +
 +    if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
 +    {
 +        init_verlet_ewald_f_table(ic,verlet_kernel_type);
 +
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
 +                    1/ic->tabq_scale,ic->tabq_size);
 +        }
 +    }
 +}
 +
 +void init_interaction_const(FILE *fp, 
 +                            interaction_const_t **interaction_const,
 +                            const t_forcerec *fr)
 +{
 +    interaction_const_t *ic;
 +
 +    snew(ic, 1);
 +
 +    ic->rlist       = fr->rlist;
 +
 +    /* Lennard-Jones */
 +    ic->rvdw        = fr->rvdw;
 +    if (fr->vdw_pot_shift)
 +    {
 +        ic->sh_invrc6 = pow(ic->rvdw,-6.0);
 +    }
 +    else
 +    {
 +        ic->sh_invrc6 = 0;
 +    }
 +
 +    /* Electrostatics */
 +    ic->eeltype     = fr->eeltype;
 +    ic->rcoulomb    = fr->rcoulomb;
 +    ic->epsilon_r   = fr->epsilon_r;
 +    ic->epsfac      = fr->epsfac;
 +
 +    /* Ewald */
 +    ic->ewaldcoeff  = fr->ewaldcoeff;
 +    if (fr->coul_pot_shift)
 +    {
 +        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff*ic->rcoulomb);
 +    }
 +    else
 +    {
 +        ic->sh_ewald = 0;
 +    }
 +
 +    /* Reaction-field */
 +    if (EEL_RF(ic->eeltype))
 +    {
 +        ic->epsilon_rf = fr->epsilon_rf;
 +        ic->k_rf       = fr->k_rf;
 +        ic->c_rf       = fr->c_rf;
 +    }
 +    else
 +    {
 +        /* For plain cut-off we might use the reaction-field kernels */
 +        ic->epsilon_rf = ic->epsilon_r;
 +        ic->k_rf       = 0;
 +        if (fr->coul_pot_shift)
 +        {
 +            ic->c_rf   = 1/ic->rcoulomb;
 +        }
 +        else
 +        {
 +            ic->c_rf   = 0;
 +        }
 +    }
 +
 +    if (fp != NULL)
 +    {
 +        fprintf(fp,"Potential shift: LJ r^-12: %.3f r^-6 %.3f",
 +                sqr(ic->sh_invrc6),ic->sh_invrc6);
 +        if (ic->eeltype == eelCUT)
 +        {
 +            fprintf(fp,", Coulomb %.3f",ic->c_rf);
 +        }
 +        else if (EEL_PME(ic->eeltype))
 +        {
 +            fprintf(fp,", Ewald %.3e",ic->sh_ewald);
 +        }
 +        fprintf(fp,"\n");
 +    }
 +
 +    *interaction_const = ic;
 +
 +    if (fr->nbv != NULL && fr->nbv->bUseGPU)
 +    {
 +        nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv);
 +    }
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        assert(fr->nbv != NULL && fr->nbv->grp != NULL);
 +        init_interaction_const_tables(fp,ic,fr->nbv->grp[fr->nbv->ngrp-1].kernel_type);
 +    }
 +}
 +
 +static void init_nb_verlet(FILE *fp,
 +                           nonbonded_verlet_t **nb_verlet,
 +                           const t_inputrec *ir,
 +                           const t_forcerec *fr,
 +                           const t_commrec *cr,
 +                           const char *nbpu_opt)
 +{
 +    nonbonded_verlet_t *nbv;
 +    int  i;
 +    char *env;
 +    gmx_bool bHybridGPURun = FALSE;
 +
++    nbnxn_alloc_t *nb_alloc;
++    nbnxn_free_t  *nb_free;
 +
 +    snew(nbv, 1);
 +
 +    nbv->nbs = NULL;
 +
 +    nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
 +    for(i=0; i<nbv->ngrp; i++)
 +    {
 +        nbv->grp[i].nbl_lists.nnbl = 0;
 +        nbv->grp[i].nbat           = NULL;
 +        nbv->grp[i].kernel_type    = nbkNotSet;
 +
 +        if (i == 0) /* local */
 +        {
 +            pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
 +                              &nbv->bUseGPU,
 +                              &nbv->grp[i].kernel_type);
 +        }
 +        else /* non-local */
 +        {
 +            if (nbpu_opt != NULL && strcmp(nbpu_opt,"gpu_cpu") == 0)
 +            {
 +                /* Use GPU for local, select a CPU kernel for non-local */
 +                pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
 +                                  NULL,
 +                                  &nbv->grp[i].kernel_type);
 +
 +                bHybridGPURun = TRUE;
 +            }
 +            else
 +            {
 +                /* Use the same kernel for local and non-local interactions */
 +                nbv->grp[i].kernel_type = nbv->grp[0].kernel_type;
 +            }
 +        }
 +    }
 +
 +    if (nbv->bUseGPU)
 +    {
 +        /* init the NxN GPU data; the last argument tells whether we'll have
 +         * both local and non-local NB calculation on GPU */
 +        nbnxn_cuda_init(fp, &nbv->cu_nbv,
 +                        &fr->hwinfo->gpu_info, cr->nodeid_group_intra,
 +                        (nbv->ngrp > 1) && !bHybridGPURun);
 +
 +        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
 +        {
 +            char *end;
 +
 +            nbv->min_ci_balanced = strtol(env, &end, 10);
 +            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
 +            {
 +                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
 +            }
 +
 +            if (debug)
 +            {
 +                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n", 
 +                        nbv->min_ci_balanced);
 +            }
 +        }
 +        else
 +        {
 +            nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
 +            if (debug)
 +            {
 +                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
 +                        nbv->min_ci_balanced);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nbv->min_ci_balanced = 0;
 +    }
 +
 +    *nb_verlet = nbv;
 +
 +    nbnxn_init_search(&nbv->nbs,
 +                      DOMAINDECOMP(cr) ? & cr->dd->nc : NULL,
 +                      DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
 +                      gmx_omp_nthreads_get(emntNonbonded));
 +
 +    for(i=0; i<nbv->ngrp; i++)
 +    {
 +        if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
 +        {
 +            nb_alloc = &pmalloc;
 +            nb_free  = &pfree;
 +        }
 +        else
 +        {
 +            nb_alloc = NULL;
 +            nb_free  = NULL;
 +        }
 +
 +        nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
 +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
 +                                /* 8x8x8 "non-simple" lists are ATM always combined */
 +                                !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
 +                                nb_alloc, nb_free);
 +
 +        if (i == 0 ||
 +            nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
 +        {
 +            snew(nbv->grp[i].nbat,1);
 +            nbnxn_atomdata_init(fp,
 +                                nbv->grp[i].nbat,
 +                                nbv->grp[i].kernel_type,
 +                                fr->ntype,fr->nbfp,
 +                                ir->opts.ngener,
 +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? gmx_omp_nthreads_get(emntNonbonded) : 1,
 +                                nb_alloc, nb_free);
 +        }
 +        else
 +        {
 +            nbv->grp[i].nbat = nbv->grp[0].nbat;
 +        }
 +    }
 +}
 +
 +void init_forcerec(FILE *fp,
 +                   const output_env_t oenv,
 +                   t_forcerec *fr,
 +                   t_fcdata   *fcd,
 +                   const t_inputrec *ir,
 +                   const gmx_mtop_t *mtop,
 +                   const t_commrec  *cr,
 +                   matrix     box,
 +                   gmx_bool       bMolEpot,
 +                   const char *tabfn,
 +                   const char *tabafn,
 +                   const char *tabpfn,
 +                   const char *tabbfn,
 +                   const char *nbpu_opt,
 +                   gmx_bool   bNoSolvOpt,
 +                   real       print_force)
 +{
 +    int     i,j,m,natoms,ngrp,negp_pp,negptable,egi,egj;
 +    real    rtab;
 +    char    *env;
 +    double  dbl;
 +    rvec    box_size;
 +    const t_block *cgs;
 +    gmx_bool    bGenericKernelOnly;
 +    gmx_bool    bTab,bSep14tab,bNormalnblists;
 +    t_nblists *nbl;
 +    int     *nm_ind,egp_flags;
 +    
 +    /* By default we turn acceleration on, but it might be turned off further down... */
 +    fr->use_cpu_acceleration = TRUE;
 +
 +    fr->bDomDec = DOMAINDECOMP(cr);
 +
 +    natoms = mtop->natoms;
 +
 +    if (check_box(ir->ePBC,box))
 +    {
 +        gmx_fatal(FARGS,check_box(ir->ePBC,box));
 +    }
 +    
 +    /* Test particle insertion ? */
 +    if (EI_TPI(ir->eI)) {
 +        /* Set to the size of the molecule to be inserted (the last one) */
 +        /* Because of old style topologies, we have to use the last cg
 +         * instead of the last molecule type.
 +         */
 +        cgs = &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].cgs;
 +        fr->n_tpi = cgs->index[cgs->nr] - cgs->index[cgs->nr-1];
 +        if (fr->n_tpi != mtop->mols.index[mtop->mols.nr] - mtop->mols.index[mtop->mols.nr-1]) {
 +            gmx_fatal(FARGS,"The molecule to insert can not consist of multiple charge groups.\nMake it a single charge group.");
 +        }
 +    } else {
 +        fr->n_tpi = 0;
 +    }
 +    
 +    /* Copy AdResS parameters */
 +    if (ir->bAdress) {
 +      fr->adress_type     = ir->adress->type;
 +      fr->adress_const_wf = ir->adress->const_wf;
 +      fr->adress_ex_width = ir->adress->ex_width;
 +      fr->adress_hy_width = ir->adress->hy_width;
 +      fr->adress_icor     = ir->adress->icor;
 +      fr->adress_site     = ir->adress->site;
 +      fr->adress_ex_forcecap = ir->adress->ex_forcecap;
 +      fr->adress_do_hybridpairs = ir->adress->do_hybridpairs;
 +
 +
 +      snew(fr->adress_group_explicit , ir->adress->n_energy_grps);
 +      for (i=0; i< ir->adress->n_energy_grps; i++){
 +          fr->adress_group_explicit[i]= ir->adress->group_explicit[i];
 +      }
 +
 +      fr->n_adress_tf_grps = ir->adress->n_tf_grps;
 +      snew(fr->adress_tf_table_index, fr->n_adress_tf_grps);
 +      for (i=0; i< fr->n_adress_tf_grps; i++){
 +          fr->adress_tf_table_index[i]= ir->adress->tf_table_index[i];
 +      }
 +      copy_rvec(ir->adress->refs,fr->adress_refs);
 +    } else {
 +      fr->adress_type = eAdressOff;
 +      fr->adress_do_hybridpairs = FALSE;
 +    }
 +    
 +    /* Copy the user determined parameters */
 +    fr->userint1 = ir->userint1;
 +    fr->userint2 = ir->userint2;
 +    fr->userint3 = ir->userint3;
 +    fr->userint4 = ir->userint4;
 +    fr->userreal1 = ir->userreal1;
 +    fr->userreal2 = ir->userreal2;
 +    fr->userreal3 = ir->userreal3;
 +    fr->userreal4 = ir->userreal4;
 +    
 +    /* Shell stuff */
 +    fr->fc_stepsize = ir->fc_stepsize;
 +    
 +    /* Free energy */
 +    fr->efep       = ir->efep;
 +    fr->sc_alphavdw = ir->fepvals->sc_alpha;
 +    if (ir->fepvals->bScCoul)
 +    {
 +        fr->sc_alphacoul = ir->fepvals->sc_alpha;
 +        fr->sc_sigma6_min = pow(ir->fepvals->sc_sigma_min,6);
 +    }
 +    else
 +    {
 +        fr->sc_alphacoul = 0;
 +        fr->sc_sigma6_min = 0; /* only needed when bScCoul is on */
 +    }
 +    fr->sc_power   = ir->fepvals->sc_power;
 +    fr->sc_r_power   = ir->fepvals->sc_r_power;
 +    fr->sc_sigma6_def = pow(ir->fepvals->sc_sigma,6);
 +
 +    env = getenv("GMX_SCSIGMA_MIN");
 +    if (env != NULL)
 +    {
 +        dbl = 0;
 +        sscanf(env,"%lf",&dbl);
 +        fr->sc_sigma6_min = pow(dbl,6);
 +        if (fp)
 +        {
 +            fprintf(fp,"Setting the minimum soft core sigma to %g nm\n",dbl);
 +        }
 +    }
 +
 +    fr->bNonbonded = TRUE;
 +    if (getenv("GMX_NO_NONBONDED") != NULL)
 +    {
 +        /* turn off non-bonded calculations */
 +        fr->bNonbonded = FALSE;
 +        md_print_warn(cr,fp,
 +                      "Found environment variable GMX_NO_NONBONDED.\n"
 +                      "Disabling nonbonded calculations.\n");
 +    }
 +
 +    bGenericKernelOnly = FALSE;
 +    if (getenv("GMX_NB_GENERIC") != NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,
 +                    "Found environment variable GMX_NB_GENERIC.\n"
 +                    "Disabling interaction-specific nonbonded kernels.\n\n");
 +        }
 +        bGenericKernelOnly = TRUE;
 +        bNoSolvOpt         = TRUE;
 +    }
 +
 +    if( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
 +    {
 +        fr->use_cpu_acceleration = FALSE;
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,
 +                    "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n"
 +                    "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
 +        }
 +    }
 +
 +    /* Check if we can/should do all-vs-all kernels */
 +    fr->bAllvsAll       = can_use_allvsall(ir,mtop,FALSE,NULL,NULL);
 +    fr->AllvsAll_work   = NULL;
 +    fr->AllvsAll_workgb = NULL;
 +
 +
 +    /* Neighbour searching stuff */
 +    fr->cutoff_scheme = ir->cutoff_scheme;
 +    fr->bGrid         = (ir->ns_type == ensGRID);
 +    fr->ePBC          = ir->ePBC;
 +
 +    /* Determine if we will do PBC for distances in bonded interactions */
 +    if (fr->ePBC == epbcNONE)
 +    {
 +        fr->bMolPBC = FALSE;
 +    }
 +    else
 +    {
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            /* The group cut-off scheme and SHAKE assume charge groups
 +             * are whole, but not using molpbc is faster in most cases.
 +             */
 +            if (fr->cutoff_scheme == ecutsGROUP ||
 +                (ir->eConstrAlg == econtSHAKE &&
 +                 (gmx_mtop_ftype_count(mtop,F_CONSTR) > 0 ||
 +                  gmx_mtop_ftype_count(mtop,F_CONSTRNC) > 0)))
 +            {
 +                fr->bMolPBC = ir->bPeriodicMols;
 +            }
 +            else
 +            {
 +                fr->bMolPBC = TRUE;
 +                if (getenv("GMX_USE_GRAPH") != NULL)
 +                {
 +                    fr->bMolPBC = FALSE;
 +                    if (fp)
 +                    {
 +                        fprintf(fp,"\nGMX_MOLPBC is set, using the graph for bonded interactions\n\n");
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            fr->bMolPBC = dd_bonded_molpbc(cr->dd,fr->ePBC);
 +        }
 +    }
 +    fr->rc_scaling = ir->refcoord_scaling;
 +    copy_rvec(ir->posres_com,fr->posres_com);
 +    copy_rvec(ir->posres_comB,fr->posres_comB);
 +    fr->rlist      = cutoff_inf(ir->rlist);
 +    fr->rlistlong  = cutoff_inf(ir->rlistlong);
 +    fr->eeltype    = ir->coulombtype;
 +    fr->vdwtype    = ir->vdwtype;
 +
 +    fr->coul_pot_shift = (ir->coulomb_modifier == eintmodPOTSHIFT);
 +    fr->vdw_pot_shift  = (ir->vdw_modifier     == eintmodPOTSHIFT);
 +    
 +    fr->bTwinRange = fr->rlistlong > fr->rlist;
 +    fr->bEwald     = (EEL_PME(fr->eeltype) || fr->eeltype==eelEWALD);
 +    
 +    fr->reppow     = mtop->ffparams.reppow;
 +
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        fr->bvdwtab    = (fr->vdwtype != evdwCUT ||
 +                          !gmx_within_tol(fr->reppow,12.0,10*GMX_DOUBLE_EPS));
 +        fr->bcoultab   = (!(fr->eeltype == eelCUT || EEL_RF(fr->eeltype)) ||
 +                          fr->eeltype == eelRF_ZERO);
 +
 +        if (getenv("GMX_REQUIRE_TABLES"))
 +        {
 +            fr->bvdwtab  = TRUE;
 +            fr->bcoultab = TRUE;
 +        }
 +
 +        if (fp)
 +        {
 +            fprintf(fp,"Table routines are used for coulomb: %s\n",bool_names[fr->bcoultab]);
 +            fprintf(fp,"Table routines are used for vdw:     %s\n",bool_names[fr->bvdwtab ]);
 +        }
 +    }
 +
 +    if (ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        if (!gmx_within_tol(fr->reppow,12.0,10*GMX_DOUBLE_EPS))
 +        {
 +            gmx_fatal(FARGS,"Cut-off scheme %S only supports LJ repulsion power 12",ecutscheme_names[ir->cutoff_scheme]);
 +        }
 +        fr->bvdwtab  = FALSE;
 +        fr->bcoultab = FALSE;
 +    }
 +    
 +    /* Tables are used for direct ewald sum */
 +    if(fr->bEwald)
 +    {
 +        if (EEL_PME(ir->coulombtype))
 +        {
 +            if (fp)
 +                fprintf(fp,"Will do PME sum in reciprocal space.\n");
 +            if (ir->coulombtype == eelP3M_AD)
 +            {
 +                please_cite(fp,"Hockney1988");
 +                please_cite(fp,"Ballenegger2012");
 +            }
 +            else
 +            {
 +                please_cite(fp,"Essmann95a");
 +            }
 +            
 +            if (ir->ewald_geometry == eewg3DC)
 +            {
 +                if (fp)
 +                {
 +                    fprintf(fp,"Using the Ewald3DC correction for systems with a slab geometry.\n");
 +                }
 +                please_cite(fp,"In-Chul99a");
 +            }
 +        }
 +        fr->ewaldcoeff=calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
 +        init_ewald_tab(&(fr->ewald_table), cr, ir, fp);
 +        if (fp)
 +        {
 +            fprintf(fp,"Using a Gaussian width (1/beta) of %g nm for Ewald\n",
 +                    1/fr->ewaldcoeff);
 +        }
 +    }
 +    
 +    /* Electrostatics */
 +    fr->epsilon_r  = ir->epsilon_r;
 +    fr->epsilon_rf = ir->epsilon_rf;
 +    fr->fudgeQQ    = mtop->ffparams.fudgeQQ;
 +    fr->rcoulomb_switch = ir->rcoulomb_switch;
 +    fr->rcoulomb        = cutoff_inf(ir->rcoulomb);
 +    
 +    /* Parameters for generalized RF */
 +    fr->zsquare = 0.0;
 +    fr->temp    = 0.0;
 +    
 +    if (fr->eeltype == eelGRF)
 +    {
 +        init_generalized_rf(fp,mtop,ir,fr);
 +    }
 +    else if (fr->eeltype == eelSHIFT)
 +    {
 +        for(m=0; (m<DIM); m++)
 +            box_size[m]=box[m][m];
 +        
 +        if ((fr->eeltype == eelSHIFT && fr->rcoulomb > fr->rcoulomb_switch))
 +            set_shift_consts(fp,fr->rcoulomb_switch,fr->rcoulomb,box_size,fr);
 +    }
 +    
 +    fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) ||
 +                       gmx_mtop_ftype_count(mtop,F_POSRES) > 0 ||
 +                       gmx_mtop_ftype_count(mtop,F_FBPOSRES) > 0 ||
 +                       IR_ELEC_FIELD(*ir) ||
 +                       (fr->adress_icor != eAdressICOff)
 +                      );
 +    
 +    if (fr->cutoff_scheme == ecutsGROUP &&
 +        ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr)) {
 +        /* Count the total number of charge groups */
 +        fr->cg_nalloc = ncg_mtop(mtop);
 +        srenew(fr->cg_cm,fr->cg_nalloc);
 +    }
 +    if (fr->shift_vec == NULL)
 +        snew(fr->shift_vec,SHIFTS);
 +    
 +    if (fr->fshift == NULL)
 +        snew(fr->fshift,SHIFTS);
 +    
 +    if (fr->nbfp == NULL) {
 +        fr->ntype = mtop->ffparams.atnr;
 +        fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
 +        fr->nbfp  = mk_nbfp(&mtop->ffparams,fr->bBHAM);
 +    }
 +    
 +    /* Copy the energy group exclusions */
 +    fr->egp_flags = ir->opts.egp_flags;
 +    
 +    /* Van der Waals stuff */
 +    fr->rvdw        = cutoff_inf(ir->rvdw);
 +    fr->rvdw_switch = ir->rvdw_switch;
 +    if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM) {
 +        if (fr->rvdw_switch >= fr->rvdw)
 +            gmx_fatal(FARGS,"rvdw_switch (%f) must be < rvdw (%f)",
 +                      fr->rvdw_switch,fr->rvdw);
 +        if (fp)
 +            fprintf(fp,"Using %s Lennard-Jones, switch between %g and %g nm\n",
 +                    (fr->eeltype==eelSWITCH) ? "switched":"shifted",
 +                    fr->rvdw_switch,fr->rvdw);
 +    } 
 +    
 +    if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
 +        gmx_fatal(FARGS,"Switch/shift interaction not supported with Buckingham");
 +    
 +    if (fp)
 +        fprintf(fp,"Cut-off's:   NS: %g   Coulomb: %g   %s: %g\n",
 +                fr->rlist,fr->rcoulomb,fr->bBHAM ? "BHAM":"LJ",fr->rvdw);
 +    
 +    fr->eDispCorr = ir->eDispCorr;
 +    if (ir->eDispCorr != edispcNO)
 +    {
 +        set_avcsixtwelve(fp,fr,mtop);
 +    }
 +    
 +    if (fr->bBHAM)
 +    {
 +        set_bham_b_max(fp,fr,mtop);
 +    }
 +
 +    fr->bGB = (ir->implicit_solvent == eisGBSA);
 +      fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
 +
 +    /* Copy the GBSA data (radius, volume and surftens for each
 +     * atomtype) from the topology atomtype section to forcerec.
 +     */
 +    snew(fr->atype_radius,fr->ntype);
 +    snew(fr->atype_vol,fr->ntype);
 +    snew(fr->atype_surftens,fr->ntype);
 +    snew(fr->atype_gb_radius,fr->ntype);
 +    snew(fr->atype_S_hct,fr->ntype);
 +
 +    if (mtop->atomtypes.nr > 0)
 +    {
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_radius[i] =mtop->atomtypes.radius[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_vol[i] = mtop->atomtypes.vol[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
 +    }  
 +      
 +      /* Generate the GB table if needed */
 +      if(fr->bGB)
 +      {
 +#ifdef GMX_DOUBLE
 +              fr->gbtabscale=2000;
 +#else
 +              fr->gbtabscale=500;
 +#endif
 +              
 +              fr->gbtabr=100;
 +              fr->gbtab=make_gb_table(fp,oenv,fr,tabpfn,fr->gbtabscale);
 +
 +        init_gb(&fr->born,cr,fr,ir,mtop,ir->rgbradii,ir->gb_algorithm);
 +
 +        /* Copy local gb data (for dd, this is done in dd_partition_system) */
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            make_local_gb(cr,fr->born,ir->gb_algorithm);
 +        }
 +    }
 +
 +    /* Set the charge scaling */
 +    if (fr->epsilon_r != 0)
 +        fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
 +    else
 +        /* eps = 0 is infinite dieletric: no coulomb interactions */
 +        fr->epsfac = 0;
 +    
 +    /* Reaction field constants */
 +    if (EEL_RF(fr->eeltype))
 +        calc_rffac(fp,fr->eeltype,fr->epsilon_r,fr->epsilon_rf,
 +                   fr->rcoulomb,fr->temp,fr->zsquare,box,
 +                   &fr->kappa,&fr->k_rf,&fr->c_rf);
 +    
 +    set_chargesum(fp,fr,mtop);
 +    
 +    /* if we are using LR electrostatics, and they are tabulated,
 +     * the tables will contain modified coulomb interactions.
 +     * Since we want to use the non-shifted ones for 1-4
 +     * coulombic interactions, we must have an extra set of tables.
 +     */
 +    
 +    /* Construct tables.
 +     * A little unnecessary to make both vdw and coul tables sometimes,
 +     * but what the heck... */
 +    
 +    bTab = fr->bcoultab || fr->bvdwtab;
 +
 +    bSep14tab = ((!bTab || fr->eeltype!=eelCUT || fr->vdwtype!=evdwCUT ||
 +                  fr->bBHAM) &&
 +                 (gmx_mtop_ftype_count(mtop,F_LJ14) > 0 ||
 +                  gmx_mtop_ftype_count(mtop,F_LJC14_Q) > 0 ||
 +                  gmx_mtop_ftype_count(mtop,F_LJC_PAIRS_NB) > 0));
 +
 +    negp_pp = ir->opts.ngener - ir->nwall;
 +    negptable = 0;
 +    if (!bTab) {
 +        bNormalnblists = TRUE;
 +        fr->nnblists = 1;
 +    } else {
 +        bNormalnblists = (ir->eDispCorr != edispcNO);
 +        for(egi=0; egi<negp_pp; egi++) {
 +            for(egj=egi;  egj<negp_pp; egj++) {
 +                egp_flags = ir->opts.egp_flags[GID(egi,egj,ir->opts.ngener)];
 +                if (!(egp_flags & EGP_EXCL)) {
 +                    if (egp_flags & EGP_TABLE) {
 +                        negptable++;
 +                    } else {
 +                        bNormalnblists = TRUE;
 +                    }
 +                }
 +            }
 +        }
 +        if (bNormalnblists) {
 +            fr->nnblists = negptable + 1;
 +        } else {
 +            fr->nnblists = negptable;
 +        }
 +        if (fr->nnblists > 1)
 +            snew(fr->gid2nblists,ir->opts.ngener*ir->opts.ngener);
 +    }
 +    snew(fr->nblists,fr->nnblists);
 +    
 +    /* This code automatically gives table length tabext without cut-off's,
 +     * in that case grompp should already have checked that we do not need
 +     * normal tables and we only generate tables for 1-4 interactions.
 +     */
 +    rtab = ir->rlistlong + ir->tabext;
 +
 +    if (bTab) {
 +        /* make tables for ordinary interactions */
 +        if (bNormalnblists) {
 +            make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,NULL,NULL,&fr->nblists[0]);
 +            if (!bSep14tab)
 +                fr->tab14 = fr->nblists[0].tab;
 +            m = 1;
 +        } else {
 +            m = 0;
 +        }
 +        if (negptable > 0) {
 +            /* Read the special tables for certain energy group pairs */
 +            nm_ind = mtop->groups.grps[egcENER].nm_ind;
 +            for(egi=0; egi<negp_pp; egi++) {
 +                for(egj=egi;  egj<negp_pp; egj++) {
 +                    egp_flags = ir->opts.egp_flags[GID(egi,egj,ir->opts.ngener)];
 +                    if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL)) {
 +                        nbl = &(fr->nblists[m]);
 +                        if (fr->nnblists > 1) {
 +                            fr->gid2nblists[GID(egi,egj,ir->opts.ngener)] = m;
 +                        }
 +                        /* Read the table file with the two energy groups names appended */
 +                        make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,
 +                                        *mtop->groups.grpname[nm_ind[egi]],
 +                                        *mtop->groups.grpname[nm_ind[egj]],
 +                                        &fr->nblists[m]);
 +                        m++;
 +                    } else if (fr->nnblists > 1) {
 +                        fr->gid2nblists[GID(egi,egj,ir->opts.ngener)] = 0;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    if (bSep14tab)
 +    {
 +        /* generate extra tables with plain Coulomb for 1-4 interactions only */
 +        fr->tab14 = make_tables(fp,oenv,fr,MASTER(cr),tabpfn,rtab,
 +                                GMX_MAKETABLES_14ONLY);
 +    }
 +
 +    /* Read AdResS Thermo Force table if needed */
 +    if(fr->adress_icor == eAdressICThermoForce)
 +    {
 +        /* old todo replace */ 
 +        
 +        if (ir->adress->n_tf_grps > 0){
 +            make_adress_tf_tables(fp,oenv,fr,ir,tabfn, mtop, box);
 +
 +        }else{
 +            /* load the default table */
 +            snew(fr->atf_tabs, 1);
 +            fr->atf_tabs[DEFAULT_TF_TABLE] = make_atf_table(fp,oenv,fr,tabafn, box);
 +        }
 +    }
 +    
 +    /* Wall stuff */
 +    fr->nwall = ir->nwall;
 +    if (ir->nwall && ir->wall_type==ewtTABLE)
 +    {
 +        make_wall_tables(fp,oenv,ir,tabfn,&mtop->groups,fr);
 +    }
 +    
 +    if (fcd && tabbfn) {
 +        fcd->bondtab  = make_bonded_tables(fp,
 +                                           F_TABBONDS,F_TABBONDSNC,
 +                                           mtop,tabbfn,"b");
 +        fcd->angletab = make_bonded_tables(fp,
 +                                           F_TABANGLES,-1,
 +                                           mtop,tabbfn,"a");
 +        fcd->dihtab   = make_bonded_tables(fp,
 +                                           F_TABDIHS,-1,
 +                                           mtop,tabbfn,"d");
 +    } else {
 +        if (debug)
 +            fprintf(debug,"No fcdata or table file name passed, can not read table, can not do bonded interactions\n");
 +    }
 +    
 +    /* QM/MM initialization if requested
 +     */
 +    if (ir->bQMMM)
 +    {
 +        fprintf(stderr,"QM/MM calculation requested.\n");
 +    }
 +    
 +    fr->bQMMM      = ir->bQMMM;   
 +    fr->qr         = mk_QMMMrec();
 +    
 +    /* Set all the static charge group info */
 +    fr->cginfo_mb = init_cginfo_mb(fp,mtop,fr,bNoSolvOpt,
 +                                   &fr->bExcl_IntraCGAll_InterCGNone);
 +    if (DOMAINDECOMP(cr)) {
 +        fr->cginfo = NULL;
 +    } else {
 +        fr->cginfo = cginfo_expand(mtop->nmolblock,fr->cginfo_mb);
 +    }
 +    
 +    if (!DOMAINDECOMP(cr))
 +    {
 +        /* When using particle decomposition, the effect of the second argument,
 +         * which sets fr->hcg, is corrected later in do_md and init_em.
 +         */
 +        forcerec_set_ranges(fr,ncg_mtop(mtop),ncg_mtop(mtop),
 +                            mtop->natoms,mtop->natoms,mtop->natoms);
 +    }
 +    
 +    fr->print_force = print_force;
 +
 +
 +    /* coarse load balancing vars */
 +    fr->t_fnbf=0.;
 +    fr->t_wait=0.;
 +    fr->timesteps=0;
 +    
 +    /* Initialize neighbor search */
 +    init_ns(fp,cr,&fr->ns,fr,mtop,box);
 +    
 +    if (cr->duty & DUTY_PP)
 +    {
 +        gmx_setup_kernels(fp,fr,bGenericKernelOnly);
 +        if (ir->bAdress)
 +        {
 +            gmx_setup_adress_kernels(fp,bGenericKernelOnly);
 +        }
 +    }
 +
 +    /* Initialize the thread working data for bonded interactions */
 +    init_forcerec_f_threads(fr,mtop->groups.grps[egcENER].nr);
 +    
 +    snew(fr->excl_load,fr->nthreads+1);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        if (ir->rcoulomb != ir->rvdw)
 +        {
 +            gmx_fatal(FARGS,"With Verlet lists rcoulomb and rvdw should be identical");
 +        }
 +
 +        init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
 +
 +        /* initialize interaction constants
 +         * TODO should be moved out during modularization.
 +         */
 +        init_interaction_const(fp, &fr->ic, fr);
 +    }
 +
 +    if (ir->eDispCorr != edispcNO)
 +    {
 +        calc_enervirdiff(fp,ir->eDispCorr,fr);
 +    }
 +}
 +
 +#define pr_real(fp,r) fprintf(fp,"%s: %e\n",#r,r)
 +#define pr_int(fp,i)  fprintf((fp),"%s: %d\n",#i,i)
 +#define pr_bool(fp,b) fprintf((fp),"%s: %s\n",#b,bool_names[b])
 +
 +void pr_forcerec(FILE *fp,t_forcerec *fr,t_commrec *cr)
 +{
 +  int i;
 +
 +  pr_real(fp,fr->rlist);
 +  pr_real(fp,fr->rcoulomb);
 +  pr_real(fp,fr->fudgeQQ);
 +  pr_bool(fp,fr->bGrid);
 +  pr_bool(fp,fr->bTwinRange);
 +  /*pr_int(fp,fr->cg0);
 +    pr_int(fp,fr->hcg);*/
 +  for(i=0; i<fr->nnblists; i++)
 +    pr_int(fp,fr->nblists[i].tab.n);
 +  pr_real(fp,fr->rcoulomb_switch);
 +  pr_real(fp,fr->rcoulomb);
 +  
 +  fflush(fp);
 +}
 +
 +void forcerec_set_excl_load(t_forcerec *fr,
 +                            const gmx_localtop_t *top,const t_commrec *cr)
 +{
 +    const int *ind,*a;
 +    int t,i,j,ntot,n,ntarget;
 +
 +    if (cr != NULL && PARTDECOMP(cr))
 +    {
 +        /* No OpenMP with particle decomposition */
 +        pd_at_range(cr,
 +                    &fr->excl_load[0],
 +                    &fr->excl_load[1]);
 +
 +        return;
 +    }
 +
 +    ind = top->excls.index;
 +    a   = top->excls.a;
 +
 +    ntot = 0;
 +    for(i=0; i<top->excls.nr; i++)
 +    {
 +        for(j=ind[i]; j<ind[i+1]; j++)
 +        {
 +            if (a[j] > i)
 +            {
 +                ntot++;
 +            }
 +        }
 +    }
 +
 +    fr->excl_load[0] = 0;
 +    n = 0;
 +    i = 0;
 +    for(t=1; t<=fr->nthreads; t++)
 +    {
 +        ntarget = (ntot*t)/fr->nthreads;
 +        while(i < top->excls.nr && n < ntarget)
 +        {
 +            for(j=ind[i]; j<ind[i+1]; j++)
 +            {
 +                if (a[j] > i)
 +                {
 +                    n++;
 +                }
 +            }
 +            i++;
 +        }
 +        fr->excl_load[t] = i;
 +    }
 +}
 +
Simple merge
index 0000000000000000000000000000000000000000,bd83a96589c3315c37d7458bb4d8a5c4923ec3f2..bd83a96589c3315c37d7458bb4d8a5c4923ec3f2
mode 000000,100644..100644
--- /dev/null
index 0000000000000000000000000000000000000000,f3bd1b7fa2b34ac6969b8942e0eab4fc9dcc9308..f3bd1b7fa2b34ac6969b8942e0eab4fc9dcc9308
mode 000000,100644..100644
--- /dev/null
Simple merge
index 0000000000000000000000000000000000000000,289d5abb004a5096aac95dbd5c8b4b4f10c17433..289d5abb004a5096aac95dbd5c8b4b4f10c17433
mode 000000,100644..100644
--- /dev/null
Simple merge
Simple merge
Simple merge
index 3f70f25392de8e9ae5a63dde8af5d3aed2359902,0000000000000000000000000000000000000000..01a1e3b2e8525fc135e9166dd807ac1446ce57de
mode 100644,000000..100644
--- /dev/null
@@@ -1,2614 -1,0 +1,2615 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_CRAY_XT3
 +#include<catamount/dclock.h>
 +#endif
 +
 +
 +#include <stdio.h>
 +#include <time.h>
 +#ifdef HAVE_SYS_TIME_H
 +#include <sys/time.h>
 +#endif
 +#include <math.h>
 +#include "typedefs.h"
 +#include "string2.h"
 +#include "gmxfio.h"
 +#include "smalloc.h"
 +#include "names.h"
 +#include "confio.h"
 +#include "mvdata.h"
 +#include "txtdump.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "vec.h"
 +#include <time.h>
 +#include "nrnb.h"
 +#include "mshift.h"
 +#include "mdrun.h"
 +#include "sim_util.h"
 +#include "update.h"
 +#include "physics.h"
 +#include "main.h"
 +#include "mdatoms.h"
 +#include "force.h"
 +#include "bondf.h"
 +#include "pme.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "network.h"
 +#include "calcmu.h"
 +#include "constr.h"
 +#include "xvgr.h"
 +#include "trnio.h"
 +#include "xtcio.h"
 +#include "copyrite.h"
 +#include "pull_rotation.h"
 +#include "gmx_random.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "gmx_wallcycle.h"
 +#include "genborn.h"
++#include "nbnxn_atomdata.h"
 +#include "nbnxn_search.h"
 +#include "nbnxn_kernels/nbnxn_kernel_ref.h"
 +#include "nbnxn_kernels/nbnxn_kernel_x86_simd128.h"
 +#include "nbnxn_kernels/nbnxn_kernel_x86_simd256.h"
 +#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#include "adress.h"
 +#include "qmmm.h"
 +
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "nbnxn_cuda/nbnxn_cuda.h"
 +
 +#if 0
 +typedef struct gmx_timeprint {
 +
 +} t_gmx_timeprint;
 +#endif
 +
 +/* Portable version of ctime_r implemented in src/gmxlib/string2.c, but we do not want it declared in public installed headers */
 +char *
 +gmx_ctime_r(const time_t *clock,char *buf, int n);
 +
 +
 +double
 +gmx_gettime()
 +{
 +#ifdef HAVE_GETTIMEOFDAY
 +      struct timeval t;
 +      double seconds;
 +
 +      gettimeofday(&t,NULL);
 +
 +      seconds = (double) t.tv_sec + 1e-6*(double)t.tv_usec;
 +
 +      return seconds;
 +#else
 +      double  seconds;
 +
 +      seconds = time(NULL);
 +
 +      return seconds;
 +#endif
 +}
 +
 +
 +#define difftime(end,start) ((double)(end)-(double)(start))
 +
 +void print_time(FILE *out,gmx_runtime_t *runtime,gmx_large_int_t step,
 +                t_inputrec *ir, t_commrec *cr)
 +{
 +    time_t finish;
 +    char   timebuf[STRLEN];
 +    double dt;
 +    char buf[48];
 +
 +#ifndef GMX_THREAD_MPI
 +    if (!PAR(cr))
 +#endif
 +    {
 +        fprintf(out,"\r");
 +    }
 +    fprintf(out,"step %s",gmx_step_str(step,buf));
 +    if ((step >= ir->nstlist))
 +    {
 +        runtime->last = gmx_gettime();
 +        dt = difftime(runtime->last,runtime->real);
 +        runtime->time_per_step = dt/(step - ir->init_step + 1);
 +
 +        dt = (ir->nsteps + ir->init_step - step)*runtime->time_per_step;
 +
 +        if (ir->nsteps >= 0)
 +        {
 +            if (dt >= 300)
 +            {
 +                finish = (time_t) (runtime->last + dt);
 +                gmx_ctime_r(&finish,timebuf,STRLEN);
 +                sprintf(buf,"%s",timebuf);
 +                buf[strlen(buf)-1]='\0';
 +                fprintf(out,", will finish %s",buf);
 +            }
 +            else
 +                fprintf(out,", remaining runtime: %5d s          ",(int)dt);
 +        }
 +        else
 +        {
 +            fprintf(out," performance: %.1f ns/day    ",
 +                    ir->delta_t/1000*24*60*60/runtime->time_per_step);
 +        }
 +    }
 +#ifndef GMX_THREAD_MPI
 +    if (PAR(cr))
 +    {
 +        fprintf(out,"\n");
 +    }
 +#endif
 +
 +    fflush(out);
 +}
 +
 +#ifdef NO_CLOCK
 +#define clock() -1
 +#endif
 +
 +static double set_proctime(gmx_runtime_t *runtime)
 +{
 +    double diff;
 +#ifdef GMX_CRAY_XT3
 +    double prev;
 +
 +    prev = runtime->proc;
 +    runtime->proc = dclock();
 +
 +    diff = runtime->proc - prev;
 +#else
 +    clock_t prev;
 +
 +    prev = runtime->proc;
 +    runtime->proc = clock();
 +
 +    diff = (double)(runtime->proc - prev)/(double)CLOCKS_PER_SEC;
 +#endif
 +    if (diff < 0)
 +    {
 +        /* The counter has probably looped, ignore this data */
 +        diff = 0;
 +    }
 +
 +    return diff;
 +}
 +
 +void runtime_start(gmx_runtime_t *runtime)
 +{
 +    runtime->real = gmx_gettime();
 +    runtime->proc          = 0;
 +    set_proctime(runtime);
 +    runtime->realtime      = 0;
 +    runtime->proctime      = 0;
 +    runtime->last          = 0;
 +    runtime->time_per_step = 0;
 +}
 +
 +void runtime_end(gmx_runtime_t *runtime)
 +{
 +    double now;
 +
 +    now = gmx_gettime();
 +
 +    runtime->proctime += set_proctime(runtime);
 +    runtime->realtime  = now - runtime->real;
 +    runtime->real      = now;
 +}
 +
 +void runtime_upd_proc(gmx_runtime_t *runtime)
 +{
 +    runtime->proctime += set_proctime(runtime);
 +}
 +
 +void print_date_and_time(FILE *fplog,int nodeid,const char *title,
 +                         const gmx_runtime_t *runtime)
 +{
 +    int i;
 +    char timebuf[STRLEN];
 +    char time_string[STRLEN];
 +    time_t tmptime;
 +
 +    if (fplog)
 +    {
 +        if (runtime != NULL)
 +        {
 +            tmptime = (time_t) runtime->real;
 +            gmx_ctime_r(&tmptime,timebuf,STRLEN);
 +        }
 +        else
 +        {
 +            tmptime = (time_t) gmx_gettime();
 +            gmx_ctime_r(&tmptime,timebuf,STRLEN);
 +        }
 +        for(i=0; timebuf[i]>=' '; i++)
 +        {
 +            time_string[i]=timebuf[i];
 +        }
 +        time_string[i]='\0';
 +
 +        fprintf(fplog,"%s on node %d %s\n",title,nodeid,time_string);
 +    }
 +}
 +
 +static void sum_forces(int start,int end,rvec f[],rvec flr[])
 +{
 +  int i;
 +
 +  if (gmx_debug_at) {
 +    pr_rvecs(debug,0,"fsr",f+start,end-start);
 +    pr_rvecs(debug,0,"flr",flr+start,end-start);
 +  }
 +  for(i=start; (i<end); i++)
 +    rvec_inc(f[i],flr[i]);
 +}
 +
 +/*
 + * calc_f_el calculates forces due to an electric field.
 + *
 + * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
 + *
 + * Et[] contains the parameters for the time dependent
 + * part of the field (not yet used).
 + * Ex[] contains the parameters for
 + * the spatial dependent part of the field. You can have cool periodic
 + * fields in principle, but only a constant field is supported
 + * now.
 + * The function should return the energy due to the electric field
 + * (if any) but for now returns 0.
 + *
 + * WARNING:
 + * There can be problems with the virial.
 + * Since the field is not self-consistent this is unavoidable.
 + * For neutral molecules the virial is correct within this approximation.
 + * For neutral systems with many charged molecules the error is small.
 + * But for systems with a net charge or a few charged molecules
 + * the error can be significant when the field is high.
 + * Solution: implement a self-consitent electric field into PME.
 + */
 +static void calc_f_el(FILE *fp,int  start,int homenr,
 +                      real charge[],rvec x[],rvec f[],
 +                      t_cosines Ex[],t_cosines Et[],double t)
 +{
 +    rvec Ext;
 +    real t0;
 +    int  i,m;
 +
 +    for(m=0; (m<DIM); m++)
 +    {
 +        if (Et[m].n > 0)
 +        {
 +            if (Et[m].n == 3)
 +            {
 +                t0 = Et[m].a[1];
 +                Ext[m] = cos(Et[m].a[0]*(t-t0))*exp(-sqr(t-t0)/(2.0*sqr(Et[m].a[2])));
 +            }
 +            else
 +            {
 +                Ext[m] = cos(Et[m].a[0]*t);
 +            }
 +        }
 +        else
 +        {
 +            Ext[m] = 1.0;
 +        }
 +        if (Ex[m].n > 0)
 +        {
 +            /* Convert the field strength from V/nm to MD-units */
 +            Ext[m] *= Ex[m].a[0]*FIELDFAC;
 +            for(i=start; (i<start+homenr); i++)
 +                f[i][m] += charge[i]*Ext[m];
 +        }
 +        else
 +        {
 +            Ext[m] = 0;
 +        }
 +    }
 +    if (fp != NULL)
 +    {
 +        fprintf(fp,"%10g  %10g  %10g  %10g #FIELD\n",t,
 +                Ext[XX]/FIELDFAC,Ext[YY]/FIELDFAC,Ext[ZZ]/FIELDFAC);
 +    }
 +}
 +
 +static void calc_virial(FILE *fplog,int start,int homenr,rvec x[],rvec f[],
 +                      tensor vir_part,t_graph *graph,matrix box,
 +                      t_nrnb *nrnb,const t_forcerec *fr,int ePBC)
 +{
 +  int i,j;
 +  tensor virtest;
 +
 +  /* The short-range virial from surrounding boxes */
 +  clear_mat(vir_part);
 +  calc_vir(fplog,SHIFTS,fr->shift_vec,fr->fshift,vir_part,ePBC==epbcSCREW,box);
 +  inc_nrnb(nrnb,eNR_VIRIAL,SHIFTS);
 +
 +  /* Calculate partial virial, for local atoms only, based on short range.
 +   * Total virial is computed in global_stat, called from do_md
 +   */
 +  f_calc_vir(fplog,start,start+homenr,x,f,vir_part,graph,box);
 +  inc_nrnb(nrnb,eNR_VIRIAL,homenr);
 +
 +  /* Add position restraint contribution */
 +  for(i=0; i<DIM; i++) {
 +    vir_part[i][i] += fr->vir_diag_posres[i];
 +  }
 +
 +  /* Add wall contribution */
 +  for(i=0; i<DIM; i++) {
 +    vir_part[i][ZZ] += fr->vir_wall_z[i];
 +  }
 +
 +  if (debug)
 +    pr_rvecs(debug,0,"vir_part",vir_part,DIM);
 +}
 +
 +static void posres_wrapper(FILE *fplog,
 +                           int flags,
 +                           gmx_bool bSepDVDL,
 +                           t_inputrec *ir,
 +                           t_nrnb *nrnb,
 +                           gmx_localtop_t *top,
 +                           matrix box,rvec x[],
 +                           rvec f[],
 +                           gmx_enerdata_t *enerd,
 +                           real *lambda,
 +                           t_forcerec *fr)
 +{
 +    t_pbc pbc;
 +    real  v,dvdl;
 +    int   i;
 +
 +    /* Position restraints always require full pbc */
 +    set_pbc(&pbc,ir->ePBC,box);
 +    dvdl = 0;
 +    v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
 +               top->idef.iparams_posres,
 +               (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres,
 +               ir->ePBC==epbcNONE ? NULL : &pbc,
 +               lambda[efptRESTRAINT],&dvdl,
 +               fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
 +    if (bSepDVDL)
 +    {
 +        fprintf(fplog,sepdvdlformat,
 +                interaction_function[F_POSRES].longname,v,dvdl);
 +    }
 +    enerd->term[F_POSRES] += v;
 +    /* If just the force constant changes, the FEP term is linear,
 +     * but if k changes, it is not.
 +     */
 +    enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
 +    inc_nrnb(nrnb,eNR_POSRES,top->idef.il[F_POSRES].nr/2);
 +
 +    if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
 +    {
 +        for(i=0; i<enerd->n_lambda; i++)
 +        {
 +            real dvdl_dum,lambda_dum;
 +
 +            lambda_dum = (i==0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
 +            v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
 +                       top->idef.iparams_posres,
 +                       (const rvec*)x,NULL,NULL,
 +                       ir->ePBC==epbcNONE ? NULL : &pbc,lambda_dum,&dvdl,
 +                       fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
 +            enerd->enerpart_lambda[i] += v;
 +        }
 +    }
 +}
 +
 +static void pull_potential_wrapper(FILE *fplog,
 +                                   gmx_bool bSepDVDL,
 +                                   t_commrec *cr,
 +                                   t_inputrec *ir,
 +                                   matrix box,rvec x[],
 +                                   rvec f[],
 +                                   tensor vir_force,
 +                                   t_mdatoms *mdatoms,
 +                                   gmx_enerdata_t *enerd,
 +                                   real *lambda,
 +                                   double t)
 +{
 +    t_pbc  pbc;
 +    real   dvdl;
 +
 +    /* Calculate the center of mass forces, this requires communication,
 +     * which is why pull_potential is called close to other communication.
 +     * The virial contribution is calculated directly,
 +     * which is why we call pull_potential after calc_virial.
 +     */
 +    set_pbc(&pbc,ir->ePBC,box);
 +    dvdl = 0; 
 +    enerd->term[F_COM_PULL] +=
 +        pull_potential(ir->ePull,ir->pull,mdatoms,&pbc,
 +                       cr,t,lambda[efptRESTRAINT],x,f,vir_force,&dvdl);
 +    if (bSepDVDL)
 +    {
 +        fprintf(fplog,sepdvdlformat,"Com pull",enerd->term[F_COM_PULL],dvdl);
 +    }
 +    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
 +}
 +
 +static void pme_receive_force_ener(FILE *fplog,
 +                                   gmx_bool bSepDVDL,
 +                                   t_commrec *cr,
 +                                   gmx_wallcycle_t wcycle,
 +                                   gmx_enerdata_t *enerd,
 +                                   t_forcerec *fr)
 +{
 +    real   e,v,dvdl;    
 +    float  cycles_ppdpme,cycles_seppme;
 +
 +    cycles_ppdpme = wallcycle_stop(wcycle,ewcPPDURINGPME);
 +    dd_cycles_add(cr->dd,cycles_ppdpme,ddCyclPPduringPME);
 +
 +    /* In case of node-splitting, the PP nodes receive the long-range 
 +     * forces, virial and energy from the PME nodes here.
 +     */    
 +    wallcycle_start(wcycle,ewcPP_PMEWAITRECVF);
 +    dvdl = 0;
 +    gmx_pme_receive_f(cr,fr->f_novirsum,fr->vir_el_recip,&e,&dvdl,
 +                      &cycles_seppme);
 +    if (bSepDVDL)
 +    {
 +        fprintf(fplog,sepdvdlformat,"PME mesh",e,dvdl);
 +    }
 +    enerd->term[F_COUL_RECIP] += e;
 +    enerd->dvdl_lin[efptCOUL] += dvdl;
 +    if (wcycle)
 +    {
 +        dd_cycles_add(cr->dd,cycles_seppme,ddCyclPME);
 +    }
 +    wallcycle_stop(wcycle,ewcPP_PMEWAITRECVF);
 +}
 +
 +static void print_large_forces(FILE *fp,t_mdatoms *md,t_commrec *cr,
 +                             gmx_large_int_t step,real pforce,rvec *x,rvec *f)
 +{
 +  int  i;
 +  real pf2,fn2;
 +  char buf[STEPSTRSIZE];
 +
 +  pf2 = sqr(pforce);
 +  for(i=md->start; i<md->start+md->homenr; i++) {
 +    fn2 = norm2(f[i]);
 +    /* We also catch NAN, if the compiler does not optimize this away. */
 +    if (fn2 >= pf2 || fn2 != fn2) {
 +      fprintf(fp,"step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
 +            gmx_step_str(step,buf),
 +            ddglatnr(cr->dd,i),x[i][XX],x[i][YY],x[i][ZZ],sqrt(fn2));
 +    }
 +  }
 +}
 +
 +static void post_process_forces(FILE *fplog,
 +                                t_commrec *cr,
 +                                gmx_large_int_t step,
 +                                t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                                gmx_localtop_t *top,
 +                                matrix box,rvec x[],
 +                                rvec f[],
 +                                tensor vir_force,
 +                                t_mdatoms *mdatoms,
 +                                t_graph *graph,
 +                                t_forcerec *fr,gmx_vsite_t *vsite,
 +                                int flags)
 +{
 +    if (fr->bF_NoVirSum)
 +    {
 +        if (vsite)
 +        {
 +            /* Spread the mesh force on virtual sites to the other particles... 
 +             * This is parallellized. MPI communication is performed
 +             * if the constructing atoms aren't local.
 +             */
 +            wallcycle_start(wcycle,ewcVSITESPREAD);
 +            spread_vsite_f(fplog,vsite,x,fr->f_novirsum,NULL,
 +                           (flags & GMX_FORCE_VIRIAL),fr->vir_el_recip,
 +                           nrnb,
 +                           &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +            wallcycle_stop(wcycle,ewcVSITESPREAD);
 +        }
 +        if (flags & GMX_FORCE_VIRIAL)
 +        {
 +            /* Now add the forces, this is local */
 +            if (fr->bDomDec)
 +            {
 +                sum_forces(0,fr->f_novirsum_n,f,fr->f_novirsum);
 +            }
 +            else
 +            {
 +                sum_forces(mdatoms->start,mdatoms->start+mdatoms->homenr,
 +                           f,fr->f_novirsum);
 +            }
 +            if (EEL_FULL(fr->eeltype))
 +            {
 +                /* Add the mesh contribution to the virial */
 +                m_add(vir_force,fr->vir_el_recip,vir_force);
 +            }
 +            if (debug)
 +            {
 +                pr_rvecs(debug,0,"vir_force",vir_force,DIM);
 +            }
 +        }
 +    }
 +    
 +    if (fr->print_force >= 0)
 +    {
 +        print_large_forces(stderr,mdatoms,cr,step,fr->print_force,x,f);
 +    }
 +}
 +
 +static void do_nb_verlet(t_forcerec *fr,
 +                         interaction_const_t *ic,
 +                         gmx_enerdata_t *enerd,
 +                         int flags, int ilocality,
 +                         int clearF,
 +                         t_nrnb *nrnb,
 +                         gmx_wallcycle_t wcycle)
 +{
 +    int     nnbl, kernel_type, sh_e;
 +    char    *env;
 +    nonbonded_verlet_group_t  *nbvg;
 +
 +    if (!(flags & GMX_FORCE_NONBONDED))
 +    {
 +        /* skip non-bonded calculation */
 +        return;
 +    }
 +
 +    nbvg = &fr->nbv->grp[ilocality];
 +
 +    /* CUDA kernel launch overhead is already timed separately */
 +    if (fr->cutoff_scheme != ecutsVERLET)
 +    {
 +        gmx_incons("Invalid cut-off scheme passed!");
 +    }
 +
 +    if (nbvg->kernel_type != nbk8x8x8_CUDA)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsNONBONDED);
 +    }
 +    switch (nbvg->kernel_type)
 +    {
 +        case nbk4x4_PlainC:
 +            nbnxn_kernel_ref(&nbvg->nbl_lists,
 +                             nbvg->nbat, ic,
 +                             fr->shift_vec,
 +                             flags,
 +                             clearF,
 +                             fr->fshift[0],
 +                             enerd->grpp.ener[egCOULSR],
 +                             fr->bBHAM ?
 +                             enerd->grpp.ener[egBHAMSR] :
 +                             enerd->grpp.ener[egLJSR]);
 +            break;
 +        
 +        case nbk4xN_X86_SIMD128:
 +            nbnxn_kernel_x86_simd128(&nbvg->nbl_lists,
 +                                     nbvg->nbat, ic,
 +                                     fr->shift_vec,
 +                                     flags,
 +                                     clearF,
 +                                     fr->fshift[0],
 +                                     enerd->grpp.ener[egCOULSR],
 +                                     fr->bBHAM ?
 +                                     enerd->grpp.ener[egBHAMSR] :
 +                                     enerd->grpp.ener[egLJSR]);
 +            break;
 +        case nbk4xN_X86_SIMD256:
 +            nbnxn_kernel_x86_simd256(&nbvg->nbl_lists,
 +                                     nbvg->nbat, ic,
 +                                     fr->shift_vec,
 +                                     flags,
 +                                     clearF,
 +                                     fr->fshift[0],
 +                                     enerd->grpp.ener[egCOULSR],
 +                                     fr->bBHAM ?
 +                                     enerd->grpp.ener[egBHAMSR] :
 +                                     enerd->grpp.ener[egLJSR]);
 +            break;
 +
 +        case nbk8x8x8_CUDA:
 +            nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
 +            break;
 +
 +        case nbk8x8x8_PlainC:
 +            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
 +                                 nbvg->nbat, ic,
 +                                 fr->shift_vec,
 +                                 flags,
 +                                 clearF,
 +                                 nbvg->nbat->out[0].f,
 +                                 fr->fshift[0],
 +                                 enerd->grpp.ener[egCOULSR],
 +                                 fr->bBHAM ?
 +                                 enerd->grpp.ener[egBHAMSR] :
 +                                 enerd->grpp.ener[egLJSR]);
 +            break;
 +
 +        default:
 +            gmx_incons("Invalid nonbonded kernel type passed!");
 +
 +    }
 +    if (nbvg->kernel_type != nbk8x8x8_CUDA)
 +    {
 +        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
 +    }
 +
 +    /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
 +    sh_e = ((flags & GMX_FORCE_ENERGY) ? 1 : 0);
 +    inc_nrnb(nrnb,
 +             ((EEL_RF(ic->eeltype) || ic->eeltype == eelCUT) ?
 +              eNR_NBNXN_LJ_RF : eNR_NBNXN_LJ_TAB) + sh_e,
 +             nbvg->nbl_lists.natpair_ljq);
 +    inc_nrnb(nrnb,eNR_NBNXN_LJ+sh_e,nbvg->nbl_lists.natpair_lj);
 +    inc_nrnb(nrnb,
 +             ((EEL_RF(ic->eeltype) || ic->eeltype == eelCUT) ?
 +              eNR_NBNXN_RF : eNR_NBNXN_TAB)+sh_e,
 +             nbvg->nbl_lists.natpair_q);
 +}
 +
 +void do_force_cutsVERLET(FILE *fplog,t_commrec *cr,
 +              t_inputrec *inputrec,
 +              gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +              gmx_localtop_t *top,
 +              gmx_mtop_t *mtop,
 +              gmx_groups_t *groups,
 +              matrix box,rvec x[],history_t *hist,
 +              rvec f[],
 +              tensor vir_force,
 +              t_mdatoms *mdatoms,
 +              gmx_enerdata_t *enerd,t_fcdata *fcd,
 +              real *lambda,t_graph *graph,
 +              t_forcerec *fr, interaction_const_t *ic,
 +              gmx_vsite_t *vsite,rvec mu_tot,
 +              double t,FILE *field,gmx_edsam_t ed,
 +              gmx_bool bBornRadii,
 +              int flags)
 +{
 +    int     cg0,cg1,i,j;
 +    int     start,homenr;
 +    int     nb_kernel_type;
 +    double  mu[2*DIM];
 +    gmx_bool   bSepDVDL,bStateChanged,bNS,bFillGrid,bCalcCGCM,bBS;
 +    gmx_bool   bDoLongRange,bDoForces,bSepLRF,bUseGPU,bUseOrEmulGPU;
 +    gmx_bool   bDiffKernels=FALSE;
 +    matrix  boxs;
 +    rvec    vzero,box_diag;
 +    real    e,v,dvdl;
 +    float  cycles_pme,cycles_force;
 +    nonbonded_verlet_t *nbv;
 +
 +    cycles_force = 0;
 +    nbv = fr->nbv;
 +    nb_kernel_type = fr->nbv->grp[0].kernel_type;
 +
 +    start  = mdatoms->start;
 +    homenr = mdatoms->homenr;
 +
 +    bSepDVDL = (fr->bSepDVDL && do_per_step(step,inputrec->nstlog));
 +
 +    clear_mat(vir_force);
 +
 +    cg0 = 0;
 +    if (DOMAINDECOMP(cr))
 +    {
 +        cg1 = cr->dd->ncg_tot;
 +    }
 +    else
 +    {
 +        cg1 = top->cgs.nr;
 +    }
 +    if (fr->n_tpi > 0)
 +    {
 +        cg1--;
 +    }
 +
 +    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
 +    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll==FALSE); 
 +    bFillGrid     = (bNS && bStateChanged);
 +    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
 +    bDoLongRange  = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DOLR));
 +    bDoForces     = (flags & GMX_FORCE_FORCES);
 +    bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
 +    bUseGPU       = fr->nbv->bUseGPU;
 +    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbk8x8x8_PlainC);
 +
 +    if (bStateChanged)
 +    {
 +        update_forcerec(fplog,fr,box);
 +
 +        if (NEED_MUTOT(*inputrec))
 +        {
 +            /* Calculate total (local) dipole moment in a temporary common array.
 +             * This makes it possible to sum them over nodes faster.
 +             */
 +            calc_mu(start,homenr,
 +                    x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed,
 +                    mu,mu+DIM);
 +        }
 +    }
 +
 +    if (fr->ePBC != epbcNONE) { 
 +        /* Compute shift vectors every step,
 +         * because of pressure coupling or box deformation!
 +         */
 +        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
 +            calc_shifts(box,fr->shift_vec);
 +
 +        if (bCalcCGCM) { 
 +            put_atoms_in_box_omp(fr->ePBC,box,homenr,x);
 +            inc_nrnb(nrnb,eNR_SHIFTX,homenr);
 +        } 
 +        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
 +            unshift_self(graph,box,x);
 +        }
 +    } 
 +
 +    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
 +                                  fr->shift_vec,nbv->grp[0].nbat);
 +
 +#ifdef GMX_MPI
 +    if (!(cr->duty & DUTY_PME)) {
 +        /* Send particle coordinates to the pme nodes.
 +         * Since this is only implemented for domain decomposition
 +         * and domain decomposition does not use the graph,
 +         * we do not need to worry about shifting.
 +         */    
 +
 +        wallcycle_start(wcycle,ewcPP_PMESENDX);
 +
 +        bBS = (inputrec->nwall == 2);
 +        if (bBS) {
 +            copy_mat(box,boxs);
 +            svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
 +        }
 +
 +        gmx_pme_send_x(cr,bBS ? boxs : box,x,
 +                       mdatoms->nChargePerturbed,lambda[efptCOUL],
 +                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),step);
 +
 +        wallcycle_stop(wcycle,ewcPP_PMESENDX);
 +    }
 +#endif /* GMX_MPI */
 +
 +    /* do gridding for pair search */
 +    if (bNS)
 +    {
 +        if (graph && bStateChanged)
 +        {
 +            /* Calculate intramolecular shift vectors to make molecules whole */
 +            mk_mshift(fplog,graph,fr->ePBC,box,x);
 +        }
 +
 +        clear_rvec(vzero);
 +        box_diag[XX] = box[XX][XX];
 +        box_diag[YY] = box[YY][YY];
 +        box_diag[ZZ] = box[ZZ][ZZ];
 +
 +        wallcycle_start(wcycle,ewcNS);
 +        if (!fr->bDomDec)
 +        {
 +            wallcycle_sub_start(wcycle,ewcsNBS_GRID_LOCAL);
 +            nbnxn_put_on_grid(nbv->nbs,fr->ePBC,box,
 +                              0,vzero,box_diag,
 +                              0,mdatoms->homenr,-1,fr->cginfo,x,
 +                              0,NULL,
 +                              nbv->grp[eintLocal].kernel_type,
 +                              nbv->grp[eintLocal].nbat);
 +            wallcycle_sub_stop(wcycle,ewcsNBS_GRID_LOCAL);
 +        }
 +        else
 +        {
 +            wallcycle_sub_start(wcycle,ewcsNBS_GRID_NONLOCAL);
 +            nbnxn_put_on_grid_nonlocal(nbv->nbs,domdec_zones(cr->dd),
 +                                       fr->cginfo,x,
 +                                       nbv->grp[eintNonlocal].kernel_type,
 +                                       nbv->grp[eintNonlocal].nbat);
 +            wallcycle_sub_stop(wcycle,ewcsNBS_GRID_NONLOCAL);
 +        }
 +
 +        if (nbv->ngrp == 1 ||
 +            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
 +        {
 +            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat,eatAll,
 +                                nbv->nbs,mdatoms,fr->cginfo);
 +        }
 +        else
 +        {
 +            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat,eatLocal,
 +                                nbv->nbs,mdatoms,fr->cginfo);
 +            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat,eatAll,
 +                                nbv->nbs,mdatoms,fr->cginfo);
 +        }
 +        wallcycle_stop(wcycle, ewcNS);
 +    }
 +
 +    /* initialize the GPU atom data and copy shift vector */
 +    if (bUseGPU)
 +    {
 +        if (bNS)
 +        {
 +            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
 +            nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
 +            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
 +        }
 +
 +        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
 +        nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
 +        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
 +    }
 +
 +    /* do local pair search */
 +    if (bNS)
 +    {
 +        wallcycle_start_nocount(wcycle,ewcNS);
 +        wallcycle_sub_start(wcycle,ewcsNBS_SEARCH_LOCAL);
 +        nbnxn_make_pairlist(nbv->nbs,nbv->grp[eintLocal].nbat,
 +                            &top->excls,
 +                            ic->rlist,
 +                            nbv->min_ci_balanced,
 +                            &nbv->grp[eintLocal].nbl_lists,
 +                            eintLocal,
 +                            nbv->grp[eintLocal].kernel_type,
 +                            nrnb);
 +        wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_LOCAL);
 +
 +        if (bUseGPU)
 +        {
 +            /* initialize local pair-list on the GPU */
 +            nbnxn_cuda_init_pairlist(nbv->cu_nbv,
 +                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
 +                                     eintLocal);
 +        }
 +        wallcycle_stop(wcycle, ewcNS);
 +    }
 +    else
 +    {
 +        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
 +        wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
 +        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatLocal,FALSE,x,
 +                                        nbv->grp[eintLocal].nbat);
 +        wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
 +        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 +    }
 +
 +    if (bUseGPU)
 +    {
 +        wallcycle_start(wcycle,ewcLAUNCH_GPU_NB);
 +        /* launch local nonbonded F on GPU */
 +        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
 +                     nrnb, wcycle);
 +        wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
 +    }
 +
 +    /* Communicate coordinates and sum dipole if necessary + 
 +       do non-local pair search */
 +    if (DOMAINDECOMP(cr))
 +    {
 +        bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
 +                        nbv->grp[eintLocal].kernel_type);
 +
 +        if (bDiffKernels)
 +        {
 +            /* With GPU+CPU non-bonded calculations we need to copy
 +             * the local coordinates to the non-local nbat struct
 +             * (in CPU format) as the non-local kernel call also
 +             * calculates the local - non-local interactions.
 +             */
 +            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
 +            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
 +            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatLocal,TRUE,x,
 +                                             nbv->grp[eintNonlocal].nbat);
 +            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
 +            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 +        }
 +
 +        if (bNS)
 +        {
 +            wallcycle_start_nocount(wcycle,ewcNS);
 +            wallcycle_sub_start(wcycle,ewcsNBS_SEARCH_NONLOCAL);
 +
 +            if (bDiffKernels)
 +            {
 +                nbnxn_grid_add_simple(nbv->nbs,nbv->grp[eintNonlocal].nbat);
 +            }
 +
 +            nbnxn_make_pairlist(nbv->nbs,nbv->grp[eintNonlocal].nbat,
 +                                &top->excls,
 +                                ic->rlist,
 +                                nbv->min_ci_balanced,
 +                                &nbv->grp[eintNonlocal].nbl_lists,
 +                                eintNonlocal,
 +                                nbv->grp[eintNonlocal].kernel_type,
 +                                nrnb);
 +
 +            wallcycle_sub_stop(wcycle,ewcsNBS_SEARCH_NONLOCAL);
 +
 +            if (nbv->grp[eintNonlocal].kernel_type == nbk8x8x8_CUDA)
 +            {
 +                /* initialize non-local pair-list on the GPU */
 +                nbnxn_cuda_init_pairlist(nbv->cu_nbv,
 +                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
 +                                         eintNonlocal);
 +            }
 +            wallcycle_stop(wcycle,ewcNS);
 +        } 
 +        else
 +        {
 +            wallcycle_start(wcycle,ewcMOVEX);
 +            dd_move_x(cr->dd,box,x);
 +
 +            /* When we don't need the total dipole we sum it in global_stat */
 +            if (bStateChanged && NEED_MUTOT(*inputrec))
 +            {
 +                gmx_sumd(2*DIM,mu,cr);
 +            }
 +            wallcycle_stop(wcycle,ewcMOVEX);
 +
 +            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
 +            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
 +            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs,eatNonlocal,FALSE,x,
 +                                            nbv->grp[eintNonlocal].nbat);
 +            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
 +            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 +        }
 +
 +        if (bUseGPU && !bDiffKernels)
 +        { 
 +            wallcycle_start(wcycle,ewcLAUNCH_GPU_NB);
 +            /* launch non-local nonbonded F on GPU */
 +            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
 +                         nrnb, wcycle);
 +            cycles_force += wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
 +        }
 +    }
 +
 +    if (bUseGPU)
 +    {
 +        /* launch D2H copy-back F */
 +        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
 +        if (DOMAINDECOMP(cr) && !bDiffKernels)
 +        {
 +            nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
 +                                      flags, eatNonlocal);
 +        }
 +        nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
 +                                  flags, eatLocal);
 +        cycles_force += wallcycle_stop(wcycle,ewcLAUNCH_GPU_NB);
 +    }
 +
 +    if (bStateChanged && NEED_MUTOT(*inputrec))
 +    {
 +        if (PAR(cr))
 +        {
 +            gmx_sumd(2*DIM,mu,cr);
 +        } 
 +
 +        for(i=0; i<2; i++)
 +        {
 +            for(j=0;j<DIM;j++)
 +            {
 +                fr->mu_tot[i][j] = mu[i*DIM + j];
 +            }
 +        }
 +    }
 +    if (fr->efep == efepNO)
 +    {
 +        copy_rvec(fr->mu_tot[0],mu_tot);
 +    }
 +    else
 +    {
 +        for(j=0; j<DIM; j++)
 +        {
 +            mu_tot[j] =
 +                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
 +                lambda[efptCOUL]*fr->mu_tot[1][j];
 +        }
 +    }
 +
 +    /* Reset energies */
 +    reset_enerdata(&(inputrec->opts),fr,bNS,enerd,MASTER(cr));
 +    clear_rvecs(SHIFTS,fr->fshift);
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        if (!(cr->duty & DUTY_PME))
 +        {
 +            wallcycle_start(wcycle,ewcPPDURINGPME);
 +            dd_force_flop_start(cr->dd,nrnb);
 +        }
 +    }
 +    
 +    /* Start the force cycle counter.
 +     * This counter is stopped in do_forcelow_level.
 +     * No parallel communication should occur while this counter is running,
 +     * since that will interfere with the dynamic load balancing.
 +     */
 +    wallcycle_start(wcycle,ewcFORCE);
 +    if (bDoForces)
 +    {
 +        /* Reset forces for which the virial is calculated separately:
 +         * PME/Ewald forces if necessary */
 +        if (fr->bF_NoVirSum) 
 +        {
 +            if (flags & GMX_FORCE_VIRIAL)
 +            {
 +                fr->f_novirsum = fr->f_novirsum_alloc;
 +                if (fr->bDomDec)
 +                {
 +                    clear_rvecs(fr->f_novirsum_n,fr->f_novirsum);
 +                }
 +                else
 +                {
 +                    clear_rvecs(homenr,fr->f_novirsum+start);
 +                }
 +            }
 +            else
 +            {
 +                /* We are not calculating the pressure so we do not need
 +                 * a separate array for forces that do not contribute
 +                 * to the pressure.
 +                 */
 +                fr->f_novirsum = f;
 +            }
 +        }
 +
 +        if (bSepLRF)
 +        {
 +            /* Add the long range forces to the short range forces */
 +            for(i=0; i<fr->natoms_force_constr; i++)
 +            {
 +                copy_rvec(fr->f_twin[i],f[i]);
 +            }
 +        }
 +        else if (!(fr->bTwinRange && bNS))
 +        {
 +            /* Clear the short-range forces */
 +            clear_rvecs(fr->natoms_force_constr,f);
 +        }
 +
 +        clear_rvec(fr->vir_diag_posres);
 +    }
 +    if (inputrec->ePull == epullCONSTRAINT)
 +    {
 +        clear_pull_forces(inputrec->pull);
 +    }
 +
 +    /* update QMMMrec, if necessary */
 +    if(fr->bQMMM)
 +    {
 +        update_QMMMrec(cr,fr,x,mdatoms,box,top);
 +    }
 +
 +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
 +    {
 +        posres_wrapper(fplog,flags,bSepDVDL,inputrec,nrnb,top,box,x,
 +                       f,enerd,lambda,fr);
 +    }
 +
 +    /* Compute the bonded and non-bonded energies and optionally forces */    
 +    /* if we use the GPU turn off the nonbonded */
 +    do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef),
 +                      cr,nrnb,wcycle,mdatoms,&(inputrec->opts),
 +                      x,hist,f,enerd,fcd,mtop,top,fr->born,
 +                      &(top->atomtypes),bBornRadii,box,
 +                      inputrec->fepvals,lambda,graph,&(top->excls),fr->mu_tot,
 +                      ((nb_kernel_type == nbk8x8x8_CUDA || nb_kernel_type == nbk8x8x8_PlainC) 
 +                        ? flags&~GMX_FORCE_NONBONDED : flags),
 +                      &cycles_pme);
 +
 +    if (!bUseOrEmulGPU)
 +    {
 +        /* Maybe we should move this into do_force_lowlevel */
 +        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
 +                     nrnb, wcycle);
 +    }
 +        
 +
 +    if (!bUseOrEmulGPU || bDiffKernels)
 +    {
 +        int aloc;
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
 +                         bDiffKernels ? enbvClearFYes : enbvClearFNo,
 +                         nrnb, wcycle);
 +        }
 +
 +        if (!bUseOrEmulGPU)
 +        {
 +            aloc = eintLocal;
 +        }
 +        else
 +        {
 +            aloc = eintNonlocal;
 +        }
 +
 +        /* Add all the non-bonded force to the normal force array.
 +         * This can be split into a local a non-local part when overlapping
 +         * communication with calculation with domain decomposition.
 +         */
 +        cycles_force += wallcycle_stop(wcycle,ewcFORCE);
 +        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
 +        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
 +        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatAll,nbv->grp[aloc].nbat,f);
 +        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
 +        cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 +        wallcycle_start_nocount(wcycle,ewcFORCE);
 +
 +        /* if there are multiple fshift output buffers reduce them */
 +        if ((flags & GMX_FORCE_VIRIAL) &&
 +            nbv->grp[aloc].nbl_lists.nnbl > 1)
 +        {
 +            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
 +                                                      fr->fshift);
 +        }
 +    }
 +    
 +    cycles_force += wallcycle_stop(wcycle,ewcFORCE);
 +    
 +    if (ed)
 +    {
 +        do_flood(fplog,cr,x,f,ed,box,step,bNS);
 +    }
 +
 +    if (bUseOrEmulGPU && !bDiffKernels)
 +    {
 +        /* wait for non-local forces (or calculate in emulation mode) */
 +        if (DOMAINDECOMP(cr))
 +        {
 +            if (bUseGPU)
 +            {
 +                wallcycle_start(wcycle,ewcWAIT_GPU_NB_NL);
 +                nbnxn_cuda_wait_gpu(nbv->cu_nbv,
 +                                    nbv->grp[eintNonlocal].nbat,
 +                                    flags, eatNonlocal,
 +                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 +                                    fr->fshift);
 +                cycles_force += wallcycle_stop(wcycle,ewcWAIT_GPU_NB_NL);
 +            }
 +            else
 +            {
 +                wallcycle_start_nocount(wcycle,ewcFORCE);
 +                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
 +                             nrnb, wcycle);
 +                cycles_force += wallcycle_stop(wcycle,ewcFORCE);
 +            }            
 +            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
 +            wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
 +            /* skip the reduction if there was no non-local work to do */
 +            if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
 +            {
 +                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatNonlocal,
 +                                               nbv->grp[eintNonlocal].nbat,f);
 +            }
 +            wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
 +            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 +        }
 +    }
 +
 +    if (bDoForces)
 +    {
 +        /* Communicate the forces */
 +        if (PAR(cr))
 +        {
 +            wallcycle_start(wcycle,ewcMOVEF);
 +            if (DOMAINDECOMP(cr))
 +            {
 +                dd_move_f(cr->dd,f,fr->fshift);
 +                /* Do we need to communicate the separate force array
 +                 * for terms that do not contribute to the single sum virial?
 +                 * Position restraints and electric fields do not introduce
 +                 * inter-cg forces, only full electrostatics methods do.
 +                 * When we do not calculate the virial, fr->f_novirsum = f,
 +                 * so we have already communicated these forces.
 +                 */
 +                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
 +                    (flags & GMX_FORCE_VIRIAL))
 +                {
 +                    dd_move_f(cr->dd,fr->f_novirsum,NULL);
 +                }
 +                if (bSepLRF)
 +                {
 +                    /* We should not update the shift forces here,
 +                     * since f_twin is already included in f.
 +                     */
 +                    dd_move_f(cr->dd,fr->f_twin,NULL);
 +                }
 +            }
 +            wallcycle_stop(wcycle,ewcMOVEF);
 +        }
 +    }
 + 
 +    if (bUseOrEmulGPU)
 +    {
 +        /* wait for local forces (or calculate in emulation mode) */
 +        if (bUseGPU)
 +        {
 +            wallcycle_start(wcycle,ewcWAIT_GPU_NB_L);
 +            nbnxn_cuda_wait_gpu(nbv->cu_nbv,
 +                                nbv->grp[eintLocal].nbat,
 +                                flags, eatLocal,
 +                                enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
 +                                fr->fshift);
 +            wallcycle_stop(wcycle,ewcWAIT_GPU_NB_L);
 +
 +            /* now clear the GPU outputs while we finish the step on the CPU */
 +            nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
 +        }
 +        else
 +        {            
 +            wallcycle_start_nocount(wcycle,ewcFORCE);
 +            do_nb_verlet(fr, ic, enerd, flags, eintLocal,
 +                         DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
 +                         nrnb, wcycle);
 +            wallcycle_stop(wcycle,ewcFORCE);
 +        }
 +        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
 +        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
 +        if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
 +        {
 +            /* skip the reduction if there was no non-local work to do */
 +            nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs,eatLocal,
 +                                           nbv->grp[eintLocal].nbat,f);
 +        }
 +        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
 +        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 +    }
 +    
 +    if (DOMAINDECOMP(cr))
 +    {
 +        dd_force_flop_stop(cr->dd,nrnb);
 +        if (wcycle)
 +        {
 +            dd_cycles_add(cr->dd,cycles_force-cycles_pme,ddCyclF);
 +        }
 +    }
 +
 +    if (bDoForces)
 +    {
 +        if (IR_ELEC_FIELD(*inputrec))
 +        {
 +            /* Compute forces due to electric field */
 +            calc_f_el(MASTER(cr) ? field : NULL,
 +                      start,homenr,mdatoms->chargeA,x,fr->f_novirsum,
 +                      inputrec->ex,inputrec->et,t);
 +        }
 +
 +        /* If we have NoVirSum forces, but we do not calculate the virial,
 +         * we sum fr->f_novirum=f later.
 +         */
 +        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
 +        {
 +            wallcycle_start(wcycle,ewcVSITESPREAD);
 +            spread_vsite_f(fplog,vsite,x,f,fr->fshift,FALSE,NULL,nrnb,
 +                           &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +            wallcycle_stop(wcycle,ewcVSITESPREAD);
 +
 +            if (bSepLRF)
 +            {
 +                wallcycle_start(wcycle,ewcVSITESPREAD);
 +                spread_vsite_f(fplog,vsite,x,fr->f_twin,NULL,FALSE,NULL,
 +                               nrnb,
 +                               &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +                wallcycle_stop(wcycle,ewcVSITESPREAD);
 +            }
 +        }
 +
 +        if (flags & GMX_FORCE_VIRIAL)
 +        {
 +            /* Calculation of the virial must be done after vsites! */
 +            calc_virial(fplog,mdatoms->start,mdatoms->homenr,x,f,
 +                        vir_force,graph,box,nrnb,fr,inputrec->ePBC);
 +        }
 +    }
 +
 +    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
 +    {
 +        pull_potential_wrapper(fplog,bSepDVDL,cr,inputrec,box,x,
 +                               f,vir_force,mdatoms,enerd,lambda,t);
 +    }
 +
 +    if (PAR(cr) && !(cr->duty & DUTY_PME))
 +    {
 +        /* In case of node-splitting, the PP nodes receive the long-range 
 +         * forces, virial and energy from the PME nodes here.
 +         */    
 +        pme_receive_force_ener(fplog,bSepDVDL,cr,wcycle,enerd,fr);
 +    }
 +
 +    if (bDoForces)
 +    {
 +        post_process_forces(fplog,cr,step,nrnb,wcycle,
 +                            top,box,x,f,vir_force,mdatoms,graph,fr,vsite,
 +                            flags);
 +    }
 +    
 +    /* Sum the potential energy terms from group contributions */
 +    sum_epot(&(inputrec->opts),enerd);
 +}
 +
 +void do_force_cutsGROUP(FILE *fplog,t_commrec *cr,
 +              t_inputrec *inputrec,
 +              gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +              gmx_localtop_t *top,
 +              gmx_mtop_t *mtop,
 +              gmx_groups_t *groups,
 +              matrix box,rvec x[],history_t *hist,
 +              rvec f[],
 +              tensor vir_force,
 +              t_mdatoms *mdatoms,
 +              gmx_enerdata_t *enerd,t_fcdata *fcd,
 +              real *lambda,t_graph *graph,
 +              t_forcerec *fr,gmx_vsite_t *vsite,rvec mu_tot,
 +              double t,FILE *field,gmx_edsam_t ed,
 +              gmx_bool bBornRadii,
 +              int flags)
 +{
 +    int    cg0,cg1,i,j;
 +    int    start,homenr;
 +    double mu[2*DIM];
 +    gmx_bool   bSepDVDL,bStateChanged,bNS,bFillGrid,bCalcCGCM,bBS;
 +    gmx_bool   bDoLongRange,bDoForces,bSepLRF;
 +    gmx_bool   bDoAdressWF;
 +    matrix boxs;
 +    rvec   vzero,box_diag;
 +    real   e,v,dvdlambda[efptNR];
 +    t_pbc  pbc;
 +    float  cycles_pme,cycles_force;
 +
 +    start  = mdatoms->start;
 +    homenr = mdatoms->homenr;
 +
 +    bSepDVDL = (fr->bSepDVDL && do_per_step(step,inputrec->nstlog));
 +
 +    clear_mat(vir_force);
 +
 +    if (PARTDECOMP(cr))
 +    {
 +        pd_cg_range(cr,&cg0,&cg1);
 +    }
 +    else
 +    {
 +        cg0 = 0;
 +        if (DOMAINDECOMP(cr))
 +        {
 +            cg1 = cr->dd->ncg_tot;
 +        }
 +        else
 +        {
 +            cg1 = top->cgs.nr;
 +        }
 +        if (fr->n_tpi > 0)
 +        {
 +            cg1--;
 +        }
 +    }
 +
 +    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
 +    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll==FALSE);
 +    bFillGrid     = (bNS && bStateChanged);
 +    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
 +    bDoLongRange  = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DOLR));
 +    bDoForces     = (flags & GMX_FORCE_FORCES);
 +    bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
 +    /* should probably move this to the forcerec since it doesn't change */
 +    bDoAdressWF   = ((fr->adress_type!=eAdressOff));
 +
 +    if (bStateChanged)
 +    {
 +        update_forcerec(fplog,fr,box);
 +
 +        if (NEED_MUTOT(*inputrec))
 +        {
 +            /* Calculate total (local) dipole moment in a temporary common array.
 +             * This makes it possible to sum them over nodes faster.
 +             */
 +            calc_mu(start,homenr,
 +                    x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed,
 +                    mu,mu+DIM);
 +        }
 +    }
 +
 +    if (fr->ePBC != epbcNONE) { 
 +        /* Compute shift vectors every step,
 +         * because of pressure coupling or box deformation!
 +         */
 +        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
 +            calc_shifts(box,fr->shift_vec);
 +
 +        if (bCalcCGCM) { 
 +            put_charge_groups_in_box(fplog,cg0,cg1,fr->ePBC,box,
 +                    &(top->cgs),x,fr->cg_cm);
 +            inc_nrnb(nrnb,eNR_CGCM,homenr);
 +            inc_nrnb(nrnb,eNR_RESETX,cg1-cg0);
 +        } 
 +        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
 +            unshift_self(graph,box,x);
 +        }
 +    } 
 +    else if (bCalcCGCM) {
 +        calc_cgcm(fplog,cg0,cg1,&(top->cgs),x,fr->cg_cm);
 +        inc_nrnb(nrnb,eNR_CGCM,homenr);
 +    }
 +
 +    if (bCalcCGCM) {
 +        if (PAR(cr)) {
 +            move_cgcm(fplog,cr,fr->cg_cm);
 +        }
 +        if (gmx_debug_at)
 +            pr_rvecs(debug,0,"cgcm",fr->cg_cm,top->cgs.nr);
 +    }
 +
 +#ifdef GMX_MPI
 +    if (!(cr->duty & DUTY_PME)) {
 +        /* Send particle coordinates to the pme nodes.
 +         * Since this is only implemented for domain decomposition
 +         * and domain decomposition does not use the graph,
 +         * we do not need to worry about shifting.
 +         */    
 +
 +        wallcycle_start(wcycle,ewcPP_PMESENDX);
 +
 +        bBS = (inputrec->nwall == 2);
 +        if (bBS) {
 +            copy_mat(box,boxs);
 +            svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
 +        }
 +
 +        gmx_pme_send_x(cr,bBS ? boxs : box,x,
 +                       mdatoms->nChargePerturbed,lambda[efptCOUL],
 +                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),step);
 +
 +        wallcycle_stop(wcycle,ewcPP_PMESENDX);
 +    }
 +#endif /* GMX_MPI */
 +
 +    /* Communicate coordinates and sum dipole if necessary */
 +    if (PAR(cr))
 +    {
 +        wallcycle_start(wcycle,ewcMOVEX);
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_move_x(cr->dd,box,x);
 +        }
 +        else
 +        {
 +            move_x(fplog,cr,GMX_LEFT,GMX_RIGHT,x,nrnb);
 +        }
 +        wallcycle_stop(wcycle,ewcMOVEX);
 +    }
 +
 +    /* update adress weight beforehand */
 +    if(bStateChanged && bDoAdressWF)
 +    {
 +        /* need pbc for adress weight calculation with pbc_dx */
 +        set_pbc(&pbc,inputrec->ePBC,box);
 +        if(fr->adress_site == eAdressSITEcog)
 +        {
 +            update_adress_weights_cog(top->idef.iparams,top->idef.il,x,fr,mdatoms,
 +                                      inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +        }
 +        else if (fr->adress_site == eAdressSITEcom)
 +        {
 +            update_adress_weights_com(fplog,cg0,cg1,&(top->cgs),x,fr,mdatoms,
 +                                      inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +        }
 +        else if (fr->adress_site == eAdressSITEatomatom){
 +            update_adress_weights_atom_per_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
 +                                                inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +        }
 +        else
 +        {
 +            update_adress_weights_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
 +                                       inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +        }
 +    }
 +
 +    if (NEED_MUTOT(*inputrec))
 +    {
 +
 +        if (bStateChanged)
 +        {
 +            if (PAR(cr))
 +            {
 +                gmx_sumd(2*DIM,mu,cr);
 +            }
 +            for(i=0; i<2; i++)
 +            {
 +                for(j=0;j<DIM;j++)
 +                {
 +                    fr->mu_tot[i][j] = mu[i*DIM + j];
 +                }
 +            }
 +        }
 +        if (fr->efep == efepNO)
 +        {
 +            copy_rvec(fr->mu_tot[0],mu_tot);
 +        }
 +        else
 +        {
 +            for(j=0; j<DIM; j++)
 +            {
 +                mu_tot[j] =
 +                    (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
 +            }
 +        }
 +    }
 +
 +    /* Reset energies */
 +    reset_enerdata(&(inputrec->opts),fr,bNS,enerd,MASTER(cr));
 +    clear_rvecs(SHIFTS,fr->fshift);
 +
 +    if (bNS)
 +    {
 +        wallcycle_start(wcycle,ewcNS);
 +
 +        if (graph && bStateChanged)
 +        {
 +            /* Calculate intramolecular shift vectors to make molecules whole */
 +            mk_mshift(fplog,graph,fr->ePBC,box,x);
 +        }
 +
 +        /* Reset long range forces if necessary */
 +        if (fr->bTwinRange)
 +        {
 +            /* Reset the (long-range) forces if necessary */
 +            clear_rvecs(fr->natoms_force_constr,bSepLRF ? fr->f_twin : f);
 +        }
 +
 +        /* Do the actual neighbour searching and if twin range electrostatics
 +         * also do the calculation of long range forces and energies.
 +         */
 +        for (i=0;i<efptNR;i++) {dvdlambda[i] = 0;}
 +        ns(fplog,fr,x,box,
 +           groups,&(inputrec->opts),top,mdatoms,
 +           cr,nrnb,lambda,dvdlambda,&enerd->grpp,bFillGrid,
 +           bDoLongRange,bDoForces,bSepLRF ? fr->f_twin : f);
 +        if (bSepDVDL)
 +        {
 +            fprintf(fplog,sepdvdlformat,"LR non-bonded",0.0,dvdlambda);
 +        }
 +        enerd->dvdl_lin[efptVDW] += dvdlambda[efptVDW];
 +        enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL];
 +
 +        wallcycle_stop(wcycle,ewcNS);
 +    }
 +
 +    if (inputrec->implicit_solvent && bNS)
 +    {
 +        make_gb_nblist(cr,inputrec->gb_algorithm,inputrec->rlist,
 +                       x,box,fr,&top->idef,graph,fr->born);
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        if (!(cr->duty & DUTY_PME))
 +        {
 +            wallcycle_start(wcycle,ewcPPDURINGPME);
 +            dd_force_flop_start(cr->dd,nrnb);
 +        }
 +    }
 +
 +    if (inputrec->bRot)
 +    {
 +        /* Enforced rotation has its own cycle counter that starts after the collective
 +         * coordinates have been communicated. It is added to ddCyclF to allow
 +         * for proper load-balancing */
 +        wallcycle_start(wcycle,ewcROT);
 +        do_rotation(cr,inputrec,box,x,t,step,wcycle,bNS);
 +        wallcycle_stop(wcycle,ewcROT);
 +    }
 +
 +    /* Start the force cycle counter.
 +     * This counter is stopped in do_forcelow_level.
 +     * No parallel communication should occur while this counter is running,
 +     * since that will interfere with the dynamic load balancing.
 +     */
 +    wallcycle_start(wcycle,ewcFORCE);
 +    
 +    if (bDoForces)
 +    {
 +        /* Reset forces for which the virial is calculated separately:
 +         * PME/Ewald forces if necessary */
 +        if (fr->bF_NoVirSum)
 +        {
 +            if (flags & GMX_FORCE_VIRIAL)
 +            {
 +                fr->f_novirsum = fr->f_novirsum_alloc;
 +                if (fr->bDomDec)
 +                {
 +                    clear_rvecs(fr->f_novirsum_n,fr->f_novirsum);
 +                }
 +                else
 +                {
 +                    clear_rvecs(homenr,fr->f_novirsum+start);
 +                }
 +            }
 +            else
 +            {
 +                /* We are not calculating the pressure so we do not need
 +                 * a separate array for forces that do not contribute
 +                 * to the pressure.
 +                 */
 +                fr->f_novirsum = f;
 +            }
 +        }
 +
 +        if (bSepLRF)
 +        {
 +            /* Add the long range forces to the short range forces */
 +            for(i=0; i<fr->natoms_force_constr; i++)
 +            {
 +                copy_rvec(fr->f_twin[i],f[i]);
 +            }
 +        }
 +        else if (!(fr->bTwinRange && bNS))
 +        {
 +            /* Clear the short-range forces */
 +            clear_rvecs(fr->natoms_force_constr,f);
 +        }
 +
 +        clear_rvec(fr->vir_diag_posres);
 +    }
 +    if (inputrec->ePull == epullCONSTRAINT)
 +    {
 +        clear_pull_forces(inputrec->pull);
 +    }
 +
 +    /* update QMMMrec, if necessary */
 +    if(fr->bQMMM)
 +    {
 +        update_QMMMrec(cr,fr,x,mdatoms,box,top);
 +    }
 +
 +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
 +    {
 +        posres_wrapper(fplog,flags,bSepDVDL,inputrec,nrnb,top,box,x,
 +                       f,enerd,lambda,fr);
 +    }
 +
 +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
 +    {
 +        /* Flat-bottomed position restraints always require full pbc */
 +        if(!(bStateChanged && bDoAdressWF))
 +        {
 +            set_pbc(&pbc,inputrec->ePBC,box);
 +        }
 +        v = fbposres(top->idef.il[F_FBPOSRES].nr,top->idef.il[F_FBPOSRES].iatoms,
 +                     top->idef.iparams_fbposres,
 +                     (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres,
 +                     inputrec->ePBC==epbcNONE ? NULL : &pbc,
 +                     fr->rc_scaling,fr->ePBC,fr->posres_com);
 +        enerd->term[F_FBPOSRES] += v;
 +        inc_nrnb(nrnb,eNR_FBPOSRES,top->idef.il[F_FBPOSRES].nr/2);
 +    }
 +
 +    /* Compute the bonded and non-bonded energies and optionally forces */
 +    do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef),
 +                      cr,nrnb,wcycle,mdatoms,&(inputrec->opts),
 +                      x,hist,f,enerd,fcd,mtop,top,fr->born,
 +                      &(top->atomtypes),bBornRadii,box,
 +                      inputrec->fepvals,lambda,
 +                      graph,&(top->excls),fr->mu_tot,
 +                      flags,
 +                      &cycles_pme);
 +
 +    cycles_force = wallcycle_stop(wcycle,ewcFORCE);
 +
 +    if (ed)
 +    {
 +        do_flood(fplog,cr,x,f,ed,box,step,bNS);
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        dd_force_flop_stop(cr->dd,nrnb);
 +        if (wcycle)
 +        {
 +            dd_cycles_add(cr->dd,cycles_force-cycles_pme,ddCyclF);
 +        }
 +    }
 +
 +    if (bDoForces)
 +    {
 +        if (IR_ELEC_FIELD(*inputrec))
 +        {
 +            /* Compute forces due to electric field */
 +            calc_f_el(MASTER(cr) ? field : NULL,
 +                      start,homenr,mdatoms->chargeA,x,fr->f_novirsum,
 +                      inputrec->ex,inputrec->et,t);
 +        }
 +
 +        if (bDoAdressWF && fr->adress_icor == eAdressICThermoForce)
 +        {
 +            /* Compute thermodynamic force in hybrid AdResS region */
 +            adress_thermo_force(start,homenr,&(top->cgs),x,fr->f_novirsum,fr,mdatoms,
 +                                inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +        }
 +
 +        /* Communicate the forces */
 +        if (PAR(cr))
 +        {
 +            wallcycle_start(wcycle,ewcMOVEF);
 +            if (DOMAINDECOMP(cr))
 +            {
 +                dd_move_f(cr->dd,f,fr->fshift);
 +                /* Do we need to communicate the separate force array
 +                 * for terms that do not contribute to the single sum virial?
 +                 * Position restraints and electric fields do not introduce
 +                 * inter-cg forces, only full electrostatics methods do.
 +                 * When we do not calculate the virial, fr->f_novirsum = f,
 +                 * so we have already communicated these forces.
 +                 */
 +                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
 +                    (flags & GMX_FORCE_VIRIAL))
 +                {
 +                    dd_move_f(cr->dd,fr->f_novirsum,NULL);
 +                }
 +                if (bSepLRF)
 +                {
 +                    /* We should not update the shift forces here,
 +                     * since f_twin is already included in f.
 +                     */
 +                    dd_move_f(cr->dd,fr->f_twin,NULL);
 +                }
 +            }
 +            else
 +            {
 +                pd_move_f(cr,f,nrnb);
 +                if (bSepLRF)
 +                {
 +                    pd_move_f(cr,fr->f_twin,nrnb);
 +                }
 +            }
 +            wallcycle_stop(wcycle,ewcMOVEF);
 +        }
 +
 +        /* If we have NoVirSum forces, but we do not calculate the virial,
 +         * we sum fr->f_novirum=f later.
 +         */
 +        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
 +        {
 +            wallcycle_start(wcycle,ewcVSITESPREAD);
 +            spread_vsite_f(fplog,vsite,x,f,fr->fshift,FALSE,NULL,nrnb,
 +                           &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +            wallcycle_stop(wcycle,ewcVSITESPREAD);
 +
 +            if (bSepLRF)
 +            {
 +                wallcycle_start(wcycle,ewcVSITESPREAD);
 +                spread_vsite_f(fplog,vsite,x,fr->f_twin,NULL,FALSE,NULL,
 +                               nrnb,
 +                               &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +                wallcycle_stop(wcycle,ewcVSITESPREAD);
 +            }
 +        }
 +
 +        if (flags & GMX_FORCE_VIRIAL)
 +        {
 +            /* Calculation of the virial must be done after vsites! */
 +            calc_virial(fplog,mdatoms->start,mdatoms->homenr,x,f,
 +                        vir_force,graph,box,nrnb,fr,inputrec->ePBC);
 +        }
 +    }
 +
 +    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
 +    {
 +        pull_potential_wrapper(fplog,bSepDVDL,cr,inputrec,box,x,
 +                               f,vir_force,mdatoms,enerd,lambda,t);
 +    }
 +
 +    /* Add the forces from enforced rotation potentials (if any) */
 +    if (inputrec->bRot)
 +    {
 +        wallcycle_start(wcycle,ewcROTadd);
 +        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr,step,t);
 +        wallcycle_stop(wcycle,ewcROTadd);
 +    }
 +
 +    if (PAR(cr) && !(cr->duty & DUTY_PME))
 +    {
 +        /* In case of node-splitting, the PP nodes receive the long-range 
 +         * forces, virial and energy from the PME nodes here.
 +         */
 +        pme_receive_force_ener(fplog,bSepDVDL,cr,wcycle,enerd,fr);
 +    }
 +
 +    if (bDoForces)
 +    {
 +        post_process_forces(fplog,cr,step,nrnb,wcycle,
 +                            top,box,x,f,vir_force,mdatoms,graph,fr,vsite,
 +                            flags);
 +    }
 +
 +    /* Sum the potential energy terms from group contributions */
 +    sum_epot(&(inputrec->opts),enerd);
 +}
 +
 +/* Top-level force-calculation entry point for one MD step.
 + *
 + * Clears the GMX_FORCE_NONBONDED bit in flags when non-bonded
 + * interactions are disabled in the force record, then dispatches to
 + * the implementation matching inputrec->cutoff_scheme (Verlet or
 + * group scheme); all remaining arguments are forwarded unchanged.
 + * Aborts via gmx_incons() on an unknown cut-off scheme.
 + */
 +void do_force(FILE *fplog,t_commrec *cr,
 +              t_inputrec *inputrec,
 +              gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +              gmx_localtop_t *top,
 +              gmx_mtop_t *mtop,
 +              gmx_groups_t *groups,
 +              matrix box,rvec x[],history_t *hist,
 +              rvec f[],
 +              tensor vir_force,
 +              t_mdatoms *mdatoms,
 +              gmx_enerdata_t *enerd,t_fcdata *fcd,
 +              real *lambda,t_graph *graph,
 +              t_forcerec *fr,
 +              gmx_vsite_t *vsite,rvec mu_tot,
 +              double t,FILE *field,gmx_edsam_t ed,
 +              gmx_bool bBornRadii,
 +              int flags)
 +{
 +    /* modify force flag if not doing nonbonded */
 +    if (!fr->bNonbonded)
 +    {
 +        flags &= ~GMX_FORCE_NONBONDED;
 +    }
 +
 +    switch (inputrec->cutoff_scheme)
 +    {
 +        case ecutsVERLET:
 +            /* The Verlet path additionally needs the interaction
 +             * constants (fr->ic). */
 +            do_force_cutsVERLET(fplog, cr, inputrec,
 +                                step, nrnb, wcycle,
 +                                top, mtop,
 +                                groups,
 +                                box, x, hist,
 +                                f, vir_force,
 +                                mdatoms,
 +                                enerd, fcd,
 +                                lambda, graph,
 +                                fr, fr->ic, 
 +                                vsite, mu_tot,
 +                                t, field, ed,
 +                                bBornRadii,
 +                                flags);
 +            break;
 +        case ecutsGROUP:
 +             do_force_cutsGROUP(fplog, cr, inputrec,
 +                                step, nrnb, wcycle,
 +                                top, mtop,
 +                                groups,
 +                                box, x, hist,
 +                                f, vir_force,
 +                                mdatoms,
 +                                enerd, fcd,
 +                                lambda, graph,
 +                                fr, vsite, mu_tot,
 +                                t, field, ed,
 +                                bBornRadii,
 +                                flags);
 +            break;
 +        default:
 +            gmx_incons("Invalid cut-off scheme passed!");
 +    }
 +}
 +
 +
 +/* Apply constraints to the starting configuration.
 + *
 + * First constrains the initial coordinates (state->x).  For velocity
 + * Verlet integrators the initial velocities are constrained as well.
 + * For leap-frog-style integrators (EI_STATE_VELOCITY but not eiVV)
 + * the velocities at t-dt/2 are made consistent with the constraints
 + * by constraining back-extrapolated positions at t=-dt against the
 + * positions at t=0 (the velocity sign flips implement the backward
 + * step; they are undone afterwards).
 + * Allocates and frees a temporary coordinate buffer of state->natoms.
 + */
 +void do_constrain_first(FILE *fplog,gmx_constr_t constr,
 +                        t_inputrec *ir,t_mdatoms *md,
 +                        t_state *state,rvec *f,
 +                        t_graph *graph,t_commrec *cr,t_nrnb *nrnb,
 +                        t_forcerec *fr, gmx_localtop_t *top, tensor shake_vir)
 +{
 +    int    i,m,start,end;
 +    gmx_large_int_t step;
 +    real   dt=ir->delta_t;
 +    real   dvdl_dum;       /* dV/dlambda sink; the value is discarded */
 +    rvec   *savex;         /* scratch positions at t=-dt */
 +
 +    snew(savex,state->natoms);
 +
 +    start = md->start;
 +    end   = md->homenr + start;
 +
 +    if (debug)
 +        fprintf(debug,"vcm: start=%d, homenr=%d, end=%d\n",
 +                start,md->homenr,end);
 +    /* Do a first constrain to reset particles... */
 +    step = ir->init_step;
 +    if (fplog)
 +    {
 +        char buf[STEPSTRSIZE];
 +        fprintf(fplog,"\nConstraining the starting coordinates (step %s)\n",
 +                gmx_step_str(step,buf));
 +    }
 +    dvdl_dum = 0;
 +
 +    /* constrain the current position */
 +    constrain(NULL,TRUE,FALSE,constr,&(top->idef),
 +              ir,NULL,cr,step,0,md,
 +              state->x,state->x,NULL,
 +              fr->bMolPBC,state->box,
 +              state->lambda[efptBONDED],&dvdl_dum,
 +              NULL,NULL,nrnb,econqCoord,
 +              ir->epc==epcMTTK,state->veta,state->veta);
 +    if (EI_VV(ir->eI))
 +    {
 +        /* constrain the initial velocity, and save it */
 +        /* also may be useful if we need the ekin from the halfstep for velocity verlet */
 +        /* might not yet treat veta correctly */
 +        constrain(NULL,TRUE,FALSE,constr,&(top->idef),
 +                  ir,NULL,cr,step,0,md,
 +                  state->x,state->v,state->v,
 +                  fr->bMolPBC,state->box,
 +                  state->lambda[efptBONDED],&dvdl_dum,
 +                  NULL,NULL,nrnb,econqVeloc,
 +                  ir->epc==epcMTTK,state->veta,state->veta);
 +    }
 +    /* constrain the initial velocities at t-dt/2 */
 +    if (EI_STATE_VELOCITY(ir->eI) && ir->eI!=eiVV)
 +    {
 +        for(i=start; (i<end); i++)
 +        {
 +            for(m=0; (m<DIM); m++)
 +            {
 +                /* Reverse the velocity */
 +                state->v[i][m] = -state->v[i][m];
 +                /* Store the position at t-dt in buf */
 +                savex[i][m] = state->x[i][m] + dt*state->v[i][m];
 +            }
 +        }
 +    /* Shake the positions at t=-dt with the positions at t=0
 +     * as reference coordinates.
 +     */
 +        if (fplog)
 +        {
 +            char buf[STEPSTRSIZE];
 +            fprintf(fplog,"\nConstraining the coordinates at t0-dt (step %s)\n",
 +                    gmx_step_str(step,buf));
 +        }
 +        dvdl_dum = 0;
 +        constrain(NULL,TRUE,FALSE,constr,&(top->idef),
 +                  ir,NULL,cr,step,-1,md,
 +                  state->x,savex,NULL,
 +                  fr->bMolPBC,state->box,
 +                  state->lambda[efptBONDED],&dvdl_dum,
 +                  state->v,NULL,nrnb,econqCoord,
 +                  ir->epc==epcMTTK,state->veta,state->veta);
 +        
 +        for(i=start; i<end; i++) {
 +            for(m=0; m<DIM; m++) {
 +                /* Re-reverse the velocities */
 +                state->v[i][m] = -state->v[i][m];
 +            }
 +        }
 +    }
 +    sfree(savex);
 +}
 +
 +/* Precompute the long-range dispersion-correction integrals.
 + *
 + * Fills in fr->enershift{six,twelve} (constant energy shifts for the
 + * shifted potential), fr->enerdiff{six,twelve} and
 + * fr->virdiff{six,twelve}: the per-interaction energy and virial
 + * corrections for the r^-6 and r^-12 terms beyond the cut-off.
 + * All six fields are zeroed first, so with eDispCorr == edispcNO the
 + * net effect is to disable the correction.  For switched/shifted vdw
 + * the integral over the switching region is evaluated from the cubic
 + * spline table in fr->nblists[0]; for plain/user cut-off an analytic
 + * tail correction is used.  Fatal error for unsupported vdw types.
 + */
 +void calc_enervirdiff(FILE *fplog,int eDispCorr,t_forcerec *fr)
 +{
 +  double eners[2],virs[2],enersum,virsum,y0,f,g,h;
 +  double r0,r1,r,rc3,rc9,ea,eb,ec,pa,pb,pc,pd;
 +  double invscale,invscale2,invscale3;
 +  int    ri0,ri1,ri,i,offstart,offset;
 +  real   scale,*vdwtab;
 +
 +  fr->enershiftsix = 0;
 +  fr->enershifttwelve = 0;
 +  fr->enerdiffsix = 0;
 +  fr->enerdifftwelve = 0;
 +  fr->virdiffsix = 0;
 +  fr->virdifftwelve = 0;
 +
 +  if (eDispCorr != edispcNO) {
 +    /* eners[0]/virs[0] accumulate the r^-6 term, [1] the r^-12 term */
 +    for(i=0; i<2; i++) {
 +      eners[i] = 0;
 +      virs[i]  = 0;
 +    }
 +    if ((fr->vdwtype == evdwSWITCH) || (fr->vdwtype == evdwSHIFT)) {
 +      if (fr->rvdw_switch == 0)
 +      gmx_fatal(FARGS,
 +                "With dispersion correction rvdw-switch can not be zero "
 +                "for vdw-type = %s",evdw_names[fr->vdwtype]);
 +
 +      scale  = fr->nblists[0].tab.scale;
 +      vdwtab = fr->nblists[0].vdwtab;
 +
 +      /* Round the cut-offs to exact table values for precision */
 +      ri0 = floor(fr->rvdw_switch*scale);
 +      ri1 = ceil(fr->rvdw*scale);
 +      r0  = ri0/scale;
 +      r1  = ri1/scale;
 +      rc3 = r0*r0*r0;
 +      rc9  = rc3*rc3*rc3;
 +
 +      if (fr->vdwtype == evdwSHIFT) {
 +      /* Determine the constant energy shift below rvdw_switch */
 +      fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - vdwtab[8*ri0];
 +      fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - vdwtab[8*ri0 + 4];
 +      }
 +      /* Add the constant part from 0 to rvdw_switch.
 +       * This integration from 0 to rvdw_switch overcounts the number
 +       * of interactions by 1, as it also counts the self interaction.
 +       * We will correct for this later.
 +       */
 +      eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
 +      eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
 +
 +      invscale = 1.0/(scale);
 +      invscale2 = invscale*invscale;
 +      invscale3 = invscale*invscale2;
 +
 +      /* following summation derived from cubic spline definition,
 +      Numerical Recipes in C, second edition, p. 113-116.  Exact
 +      for the cubic spline.  We first calculate the negative of
 +      the energy from rvdw to rvdw_switch, assuming that g(r)=1,
 +      and then add the more standard, abrupt cutoff correction to
 +      that result, yielding the long-range correction for a
 +      switched function.  We perform both the pressure and energy
 +      loops at the same time for simplicity, as the computational
 +      cost is low. */
 +
 +      /* NOTE(review): the indentation of the if/else and the inner
 +       * loop below is misleading, but the statements are single
 +       * statements so the control flow is correct: offstart selects
 +       * the dispersion (0) or repulsion (4) columns of vdwtab. */
 +      for (i=0;i<2;i++) {
 +        enersum = 0.0; virsum = 0.0;
 +        if (i==0)
 +        offstart = 0;
 +      else
 +        offstart = 4;
 +      for (ri=ri0; ri<ri1; ri++) {
 +          r = ri*invscale;
 +          ea = invscale3;
 +          eb = 2.0*invscale2*r;
 +          ec = invscale*r*r;
 +
 +          pa = invscale3;
 +          pb = 3.0*invscale2*r;
 +          pc = 3.0*invscale*r*r;
 +          pd = r*r*r;
 +
 +          /* this "8" is from the packing in the vdwtab array - perhaps
 +          should be #define'ed? */
 +          offset = 8*ri + offstart;
 +          y0 = vdwtab[offset];
 +          f = vdwtab[offset+1];
 +          g = vdwtab[offset+2];
 +          h = vdwtab[offset+3];
 +
 +          enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2)+
 +            g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
 +          virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) +
 +            2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
 +
 +        }
 +        enersum *= 4.0*M_PI;
 +        virsum  *= 4.0*M_PI;
 +        eners[i] -= enersum;
 +        virs[i]  -= virsum;
 +      }
 +
 +      /* now add the correction for rvdw_switch to infinity */
 +      eners[0] += -4.0*M_PI/(3.0*rc3);
 +      eners[1] +=  4.0*M_PI/(9.0*rc9);
 +      virs[0]  +=  8.0*M_PI/rc3;
 +      virs[1]  += -16.0*M_PI/(3.0*rc9);
 +    }
 +    else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER)) {
 +      if (fr->vdwtype == evdwUSER && fplog)
 +      fprintf(fplog,
 +              "WARNING: using dispersion correction with user tables\n");
 +      rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
 +      rc9  = rc3*rc3*rc3;
 +      /* Contribution beyond the cut-off */
 +      eners[0] += -4.0*M_PI/(3.0*rc3);
 +      eners[1] +=  4.0*M_PI/(9.0*rc9);
 +      if (fr->vdw_pot_shift) {
 +          /* Contribution within the cut-off */
 +          eners[0] += -4.0*M_PI/(3.0*rc3);
 +          eners[1] +=  4.0*M_PI/(3.0*rc9);
 +      }
 +      /* Contribution beyond the cut-off */
 +      virs[0]  +=  8.0*M_PI/rc3;
 +      virs[1]  += -16.0*M_PI/(3.0*rc9);
 +    } else {
 +      gmx_fatal(FARGS,
 +              "Dispersion correction is not implemented for vdw-type = %s",
 +              evdw_names[fr->vdwtype]);
 +    }
 +    fr->enerdiffsix    = eners[0];
 +    fr->enerdifftwelve = eners[1];
 +    /* The 0.5 is due to the Gromacs virial definition */
 +    fr->virdiffsix     = 0.5*virs[0];
 +    fr->virdifftwelve  = 0.5*virs[1];
 +  }
 +}
 +
 +/* Apply the precomputed long-range dispersion correction.
 + *
 + * Using the integrals stored in fr by calc_enervirdiff(), computes the
 + * energy correction (*enercorr), pressure correction (*prescorr) and,
 + * for free-energy runs, the dV/dlambda contribution (*dvdlcorr); the
 + * diagonal corrections are also added to the pres and virial tensors
 + * (both are cleared first).  For test-particle insertion (fr->n_tpi)
 + * only interactions with the inserted molecule are corrected.
 + */
 +void calc_dispcorr(FILE *fplog,t_inputrec *ir,t_forcerec *fr,
 +                   gmx_large_int_t step,int natoms,
 +                   matrix box,real lambda,tensor pres,tensor virial,
 +                   real *prescorr, real *enercorr, real *dvdlcorr)
 +{
 +    gmx_bool bCorrAll,bCorrPres;
 +    real dvdlambda,invvol,dens,ninter,avcsix,avctwelve,enerdiff,svir=0,spres=0;
 +    int  m;
 +
 +    *prescorr = 0;
 +    *enercorr = 0;
 +    *dvdlcorr = 0;
 +
 +    clear_mat(virial);
 +    clear_mat(pres);
 +
 +    if (ir->eDispCorr != edispcNO) {
 +        bCorrAll  = (ir->eDispCorr == edispcAllEner ||
 +                     ir->eDispCorr == edispcAllEnerPres);
 +        bCorrPres = (ir->eDispCorr == edispcEnerPres ||
 +                     ir->eDispCorr == edispcAllEnerPres);
 +
 +        invvol = 1/det(box);
 +        if (fr->n_tpi)
 +        {
 +            /* Only correct for the interactions with the inserted molecule */
 +            dens = (natoms - fr->n_tpi)*invvol;
 +            ninter = fr->n_tpi;
 +        }
 +        else
 +        {
 +            dens = natoms*invvol;
 +            ninter = 0.5*natoms;
 +        }
 +
 +        if (ir->efep == efepNO)
 +        {
 +            avcsix    = fr->avcsix[0];
 +            avctwelve = fr->avctwelve[0];
 +        }
 +        else
 +        {
 +            /* Linear interpolation of the average C6/C12 in lambda */
 +            avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
 +            avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
 +        }
 +
 +        enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
 +        *enercorr += avcsix*enerdiff;
 +        dvdlambda = 0.0;
 +        if (ir->efep != efepNO)
 +        {
 +            dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
 +        }
 +        if (bCorrAll)
 +        {
 +            enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
 +            *enercorr += avctwelve*enerdiff;
 +            /* NOTE(review): this branch tests fr->efep while the ones
 +             * above use ir->efep -- presumably these are kept in sync
 +             * when the force record is set up; confirm. */
 +            if (fr->efep != efepNO)
 +            {
 +                dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
 +            }
 +        }
 +
 +        if (bCorrPres)
 +        {
 +            svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
 +            if (ir->eDispCorr == edispcAllEnerPres)
 +            {
 +                svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
 +            }
 +            /* The factor 2 is because of the Gromacs virial definition */
 +            spres = -2.0*invvol*svir*PRESFAC;
 +
 +            for(m=0; m<DIM; m++) {
 +                virial[m][m] += svir;
 +                pres[m][m] += spres;
 +            }
 +            *prescorr += spres;
 +        }
 +
 +        /* Can't currently control when it prints, for now, just print when debugging */
 +        if (debug)
 +        {
 +            if (bCorrAll) {
 +                fprintf(debug,"Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
 +                        avcsix,avctwelve);
 +            }
 +            if (bCorrPres)
 +            {
 +                fprintf(debug,
 +                        "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
 +                        *enercorr,spres,svir);
 +            }
 +            else
 +            {
 +                fprintf(debug,"Long Range LJ corr.: Epot %10g\n",*enercorr);
 +            }
 +        }
 +
 +        if (fr->bSepDVDL && do_per_step(step,ir->nstlog))
 +        {
 +            fprintf(fplog,sepdvdlformat,"Dispersion correction",
 +                    *enercorr,dvdlambda);
 +        }
 +        if (fr->efep != efepNO)
 +        {
 +            *dvdlcorr += dvdlambda;
 +        }
 +    }
 +}
 +
 +/* Make molecules whole across periodic boundaries at startup.
 + *
 + * Recomputes the shift vectors for the current box and, when a
 + * connectivity graph is supplied, applies mk_mshift/shift_self twice
 + * so that molecules broken by PBC (e.g. from imported coordinates)
 + * become whole.  x is modified in place.
 + */
 +void do_pbc_first(FILE *fplog,matrix box,t_forcerec *fr,
 +                t_graph *graph,rvec x[])
 +{
 +  if (fplog)
 +    fprintf(fplog,"Removing pbc first time\n");
 +  calc_shifts(box,fr->shift_vec);
 +  if (graph) {
 +    mk_mshift(fplog,graph,fr->ePBC,box,x);
 +    if (gmx_debug_at)
 +      p_graph(debug,"do_pbc_first 1",graph);
 +    shift_self(graph,box,x);
 +    /* By doing an extra mk_mshift the molecules that are broken
 +     * because they were e.g. imported from another software
 +     * will be made whole again. Such are the healing powers
 +     * of GROMACS.
 +     */
 +    mk_mshift(fplog,graph,fr->ePBC,box,x);
 +    if (gmx_debug_at)
 +      p_graph(debug,"do_pbc_first 2",graph);
 +  }
 +  if (fplog)
 +    fprintf(fplog,"Done rmpbc\n");
 +}
 +
 +/* Worker for do_pbc_first_mtop/do_pbc_mtop: make every molecule in
 + * the topology whole across PBC, molecule block by molecule block.
 + * Single-atom molecules (and, when bFirst is FALSE, single-charge-
 + * group molecules) need no treatment and are skipped.  x is modified
 + * in place; a temporary connectivity graph is built per molecule type.
 + */
 +static void low_do_pbc_mtop(FILE *fplog,int ePBC,matrix box,
 +                          gmx_mtop_t *mtop,rvec x[],
 +                          gmx_bool bFirst)
 +{
 +  t_graph *graph;
 +  int mb,as,mol;   /* as = running first-atom index of the current molecule */
 +  gmx_molblock_t *molb;
 +
 +  if (bFirst && fplog)
 +    fprintf(fplog,"Removing pbc first time\n");
 +
 +  snew(graph,1);
 +  as = 0;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    molb = &mtop->molblock[mb];
 +    if (molb->natoms_mol == 1 ||
 +      (!bFirst && mtop->moltype[molb->type].cgs.nr == 1)) {
 +      /* Just one atom or charge group in the molecule, no PBC required */
 +      as += molb->nmol*molb->natoms_mol;
 +    } else {
 +      /* Pass NULL iso fplog to avoid graph prints for each molecule type */
 +      mk_graph_ilist(NULL,mtop->moltype[molb->type].ilist,
 +                   0,molb->natoms_mol,FALSE,FALSE,graph);
 +
 +      for(mol=0; mol<molb->nmol; mol++) {
 +      mk_mshift(fplog,graph,ePBC,box,x+as);
 +
 +      shift_self(graph,box,x+as);
 +      /* The molecule is whole now.
 +       * We don't need the second mk_mshift call as in do_pbc_first,
 +       * since we no longer need this graph.
 +       */
 +
 +      as += molb->natoms_mol;
 +      }
 +      done_graph(graph);
 +    }
 +  }
 +  sfree(graph);
 +}
 +
 +/* Make all molecules in the topology whole at startup
 + * (bFirst=TRUE: also treats single-charge-group molecules). */
 +void do_pbc_first_mtop(FILE *fplog,int ePBC,matrix box,
 +                     gmx_mtop_t *mtop,rvec x[])
 +{
 +  low_do_pbc_mtop(fplog,ePBC,box,mtop,x,TRUE);
 +}
 +
 +/* Make molecules whole during a run (bFirst=FALSE: molecules with a
 + * single charge group are skipped). */
 +void do_pbc_mtop(FILE *fplog,int ePBC,matrix box,
 +               gmx_mtop_t *mtop,rvec x[])
 +{
 +  low_do_pbc_mtop(fplog,ePBC,box,mtop,x,FALSE);
 +}
 +
 +/* Finalize a run: reduce the flop and cycle counters over all ranks
 + * and print the performance summary.
 + *
 + * Sums the cycle counters, reduces the nrnb flop counts (MPI_Allreduce
 + * over the simulation communicator when running in parallel), prints
 + * domain-decomposition statistics, per-node load for particle
 + * decomposition, and the wallcycle/performance report on the master.
 + * bWriteStat additionally echoes the performance numbers to stderr.
 + */
 +void finish_run(FILE *fplog,t_commrec *cr,const char *confout,
 +                t_inputrec *inputrec,
 +                t_nrnb nrnb[],gmx_wallcycle_t wcycle,
 +                gmx_runtime_t *runtime,
 +                wallclock_gpu_t *gputimes,
 +                int omp_nth_pp,
 +                gmx_bool bWriteStat)
 +{
 +    int    i,j;
 +    t_nrnb *nrnb_tot=NULL;
 +    real   delta_t;
 +    double nbfs,mflop;
 +
 +    wallcycle_sum(cr,wcycle);
 +
 +    if (cr->nnodes > 1)
 +    {
 +        /* Reduce the flop counts over all ranks into a fresh buffer */
 +        snew(nrnb_tot,1);
 +#ifdef GMX_MPI
 +        MPI_Allreduce(nrnb->n,nrnb_tot->n,eNRNB,MPI_DOUBLE,MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +#endif
 +    }
 +    else
 +    {
 +        /* Serial run: alias the local counters, do not free below */
 +        nrnb_tot = nrnb;
 +    }
 +
 +#if defined(GMX_MPI) && !defined(GMX_THREAD_MPI)
 +    if (cr->nnodes > 1)
 +    {
 +        /* reduce nodetime over all MPI processes in the current simulation */
 +        double sum;
 +        MPI_Allreduce(&runtime->proctime,&sum,1,MPI_DOUBLE,MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        runtime->proctime = sum;
 +    }
 +#endif
 +
 +    if (SIMMASTER(cr))
 +    {
 +        print_flop(fplog,nrnb_tot,&nbfs,&mflop);
 +    }
 +    if (cr->nnodes > 1)
 +    {
 +        sfree(nrnb_tot);
 +    }
 +
 +    if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
 +    {
 +        print_dd_statistics(cr,inputrec,fplog);
 +    }
 +
 +#ifdef GMX_MPI
 +    if (PARTDECOMP(cr))
 +    {
 +        /* Particle decomposition: gather per-node counters on the
 +         * master to print the load balance. */
 +        if (MASTER(cr))
 +        {
 +            t_nrnb     *nrnb_all;
 +            int        s;
 +            MPI_Status stat;
 +
 +            snew(nrnb_all,cr->nnodes);
 +            nrnb_all[0] = *nrnb;
 +            for(s=1; s<cr->nnodes; s++)
 +            {
 +                MPI_Recv(nrnb_all[s].n,eNRNB,MPI_DOUBLE,s,0,
 +                         cr->mpi_comm_mysim,&stat);
 +            }
 +            pr_load(fplog,cr,nrnb_all);
 +            sfree(nrnb_all);
 +        }
 +        else
 +        {
 +            MPI_Send(nrnb->n,eNRNB,MPI_DOUBLE,MASTERRANK(cr),0,
 +                     cr->mpi_comm_mysim);
 +        }
 +    }
 +#endif
 +
 +    if (SIMMASTER(cr))
 +    {
 +        wallcycle_print(fplog,cr->nnodes,cr->npmenodes,runtime->realtime,
 +                        wcycle,gputimes);
 +
 +        if (EI_DYNAMICS(inputrec->eI))
 +        {
 +            delta_t = inputrec->delta_t;
 +        }
 +        else
 +        {
 +            delta_t = 0;
 +        }
 +
 +        if (fplog)
 +        {
 +            print_perf(fplog,runtime->proctime,runtime->realtime,
 +                       cr->nnodes-cr->npmenodes,
 +                       runtime->nsteps_done,delta_t,nbfs,mflop,
 +                       omp_nth_pp);
 +        }
 +        if (bWriteStat)
 +        {
 +            print_perf(stderr,runtime->proctime,runtime->realtime,
 +                       cr->nnodes-cr->npmenodes,
 +                       runtime->nsteps_done,delta_t,nbfs,mflop,
 +                       omp_nth_pp);
 +        }
 +    }
 +}
 +
 +/* Initialize the free-energy lambda vector (and optional lam0 copy).
 + *
 + * With no free energy and no simulated tempering, all components are
 + * zeroed.  Otherwise each component is taken from init_lambda (legacy
 + * scalar path, used when >= 0) or from the all_lambda table at the
 + * initial fep state.  For simulated tempering the reference
 + * temperatures are rescaled to the current state.  The resulting
 + * vector is echoed to fplog when available.
 + */
 +extern void initialize_lambdas(FILE *fplog,t_inputrec *ir,int *fep_state,real *lambda,double *lam0)
 +{
 +    /* this function works, but could probably use a logic rewrite to keep all the different
 +       types of efep straight. */
 +
 +    int i;
 +    t_lambda *fep = ir->fepvals;
 +
 +    if ((ir->efep==efepNO) && (ir->bSimTemp == FALSE)) {
 +        for (i=0;i<efptNR;i++)  {
 +            lambda[i] = 0.0;
 +            if (lam0)
 +            {
 +                lam0[i] = 0.0;
 +            }
 +        }
 +        return;
 +    } else {
 +        *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
 +                                             if checkpoint is set -- a kludge is in for now
 +                                             to prevent this.*/
 +        for (i=0;i<efptNR;i++)
 +        {
 +            /* overwrite lambda state with init_lambda for now for backwards compatibility */
 +            if (fep->init_lambda>=0) /* if it's -1, it was never initialized */
 +            {
 +                lambda[i] = fep->init_lambda;
 +                if (lam0) {
 +                    lam0[i] = lambda[i];
 +                }
 +            }
 +            else
 +            {
 +                lambda[i] = fep->all_lambda[i][*fep_state];
 +                if (lam0) {
 +                    lam0[i] = lambda[i];
 +                }
 +            }
 +        }
 +        if (ir->bSimTemp) {
 +            /* need to rescale control temperatures to match current state */
 +            for (i=0;i<ir->opts.ngtc;i++) {
 +                if (ir->opts.ref_t[i] > 0) {
 +                    ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Send to the log the information on the current lambdas */
 +    if (fplog != NULL)
 +    {
 +        fprintf(fplog,"Initial vector of lambda components:[ ");
 +        for (i=0;i<efptNR;i++)
 +        {
 +            fprintf(fplog,"%10.4f ",lambda[i]);
 +        }
 +        fprintf(fplog,"]\n");
 +    }
 +    return;
 +}
 +
 +
 +/* One-stop initialization of per-run MD bookkeeping.
 + *
 + * Sets the start time, detects simulated annealing, initializes the
 + * lambda vector, the update and center-of-mass-motion removal
 + * machinery, flop counters, output files and the energy-bin writer
 + * (skipped when nfile == -1), emits the relevant citations, and
 + * clears the virial tensors and total dipole.
 + */
 +void init_md(FILE *fplog,
 +             t_commrec *cr,t_inputrec *ir,const output_env_t oenv,
 +             double *t,double *t0,
 +             real *lambda, int *fep_state, double *lam0,
 +             t_nrnb *nrnb,gmx_mtop_t *mtop,
 +             gmx_update_t *upd,
 +             int nfile,const t_filenm fnm[],
 +             gmx_mdoutf_t **outf,t_mdebin **mdebin,
 +             tensor force_vir,tensor shake_vir,rvec mu_tot,
 +             gmx_bool *bSimAnn,t_vcm **vcm, t_state *state, unsigned long Flags)
 +{
 +    int  i,j,n;       /* NOTE(review): j and n are currently unused */
 +    real tmpt,mod;    /* NOTE(review): tmpt and mod are currently unused */
 +
 +    /* Initial values */
 +    *t = *t0       = ir->init_t;
 +
 +    *bSimAnn=FALSE;
 +    for(i=0;i<ir->opts.ngtc;i++)
 +    {
 +        /* set bSimAnn if any group is being annealed */
 +        if(ir->opts.annealing[i]!=eannNO)
 +        {
 +            *bSimAnn = TRUE;
 +        }
 +    }
 +    if (*bSimAnn)
 +    {
 +        update_annealing_target_temp(&(ir->opts),ir->init_t);
 +    }
 +
 +    /* Initialize lambda variables */
 +    initialize_lambdas(fplog,ir,fep_state,lambda,lam0);
 +
 +    if (upd)
 +    {
 +        *upd = init_update(fplog,ir);
 +    }
 +
 +
 +    if (vcm != NULL)
 +    {
 +        *vcm = init_vcm(fplog,&mtop->groups,ir);
 +    }
 +
 +    if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
 +    {
 +        if (ir->etc == etcBERENDSEN)
 +        {
 +            please_cite(fplog,"Berendsen84a");
 +        }
 +        if (ir->etc == etcVRESCALE)
 +        {
 +            please_cite(fplog,"Bussi2007a");
 +        }
 +    }
 +
 +    init_nrnb(nrnb);
 +
 +    if (nfile != -1)
 +    {
 +        *outf = init_mdoutf(nfile,fnm,Flags,cr,ir,oenv);
 +
 +        *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
 +                              mtop,ir, (*outf)->fp_dhdl);
 +    }
 +
 +    if (ir->bAdress)
 +    {
 +      please_cite(fplog,"Fritsch12");
 +      please_cite(fplog,"Junghans10");
 +    }
 +    /* Initiate variables */
 +    clear_mat(force_vir);
 +    clear_mat(shake_vir);
 +    clear_rvec(mu_tot);
 +
 +    debug_gmx();
 +}
 +
Simple merge
Simple merge
index 1b9803446c7e044b3397a070f422f2fb19dcb4fa,0000000000000000000000000000000000000000..3ca05779ef78f8ed15c6a4e36936d7b3ea5eef6d
mode 100644,000000..100644
--- /dev/null
@@@ -1,1776 -1,0 +1,1776 @@@
- #include "nbnxn_search.h"
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +#ifdef __linux
 +#define _GNU_SOURCE
 +#include <sched.h>
 +#include <sys/syscall.h>
 +#endif
 +#include <signal.h>
 +#include <stdlib.h>
 +#ifdef HAVE_UNISTD_H
 +#include <unistd.h>
 +#endif
 +#include <string.h>
 +#include <assert.h>
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "statutil.h"
 +#include "mdrun.h"
 +#include "md_logging.h"
 +#include "md_support.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "names.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "tpxio.h"
 +#include "txtdump.h"
 +#include "gmx_detect_hardware.h"
 +#include "gmx_omp_nthreads.h"
 +#include "pull_rotation.h"
 +#include "calc_verletbuf.h"
-         vsite = init_vsite(mtop,cr);
++#include "../mdlib/nbnxn_search.h"
 +#include "../mdlib/nbnxn_consts.h"
 +#include "gmx_fatal_collective.h"
 +#include "membed.h"
 +#include "macros.h"
 +#include "gmx_omp.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +#ifdef GMX_OPENMM
 +#include "md_openmm.h"
 +#endif
 +
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +/* Wrapper so an array of integrator function pointers can be
 + * statically initialized below. */
 +typedef struct { 
 +    gmx_integrator_t *func;
 +} gmx_intp_t;
 +
 +/* The array should match the eI array in include/types/enums.h */
 +#ifdef GMX_OPENMM  /* FIXME do_md_openmm needs fixing */
 +const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
 +#else
 +const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
 +#endif
 +
 +/* Box-deformation state shared with the update code; the mutex
 + * protects it when thread-MPI ranks run in one process. */
 +gmx_large_int_t     deform_init_init_step_tpx;
 +matrix              deform_init_box_tpx;
 +#ifdef GMX_THREAD_MPI
 +tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +/* Argument bundle passed to each thread-MPI rank: a verbatim copy of
 + * every mdrunner() parameter, plus the rank's return value.  Filled in
 + * by mdrunner_start_threads() and unpacked by mdrunner_start_fn(). */
 +struct mdrunner_arglist
 +{
 +    gmx_hw_opt_t *hw_opt;
 +    FILE *fplog;
 +    t_commrec *cr;
 +    int nfile;
 +    const t_filenm *fnm;
 +    output_env_t oenv;
 +    gmx_bool bVerbose;
 +    gmx_bool bCompact;
 +    int nstglobalcomm;
 +    ivec ddxyz;
 +    int dd_node_order;
 +    real rdd;
 +    real rconstr;
 +    const char *dddlb_opt;
 +    real dlb_scale;
 +    const char *ddcsx;
 +    const char *ddcsy;
 +    const char *ddcsz;
 +    const char *nbpu_opt;
 +    int nsteps_cmdline;
 +    int nstepout;
 +    int resetstep;
 +    int nmultisim;
 +    int repl_ex_nst;
 +    int repl_ex_nex;
 +    int repl_ex_seed;
 +    real pforce;
 +    real cpt_period;
 +    real max_hours;
 +    const char *deviceOptions;
 +    unsigned long Flags;
 +    int ret; /* return value */
 +};
 +
 +
 +/* The function used for spawning threads. Extracts the mdrunner() 
 +   arguments from its one argument and calls mdrunner(), after making
 +   a commrec. The filename list is duplicated per thread so each rank
 +   owns its copy; only the master rank gets the log file pointer.
 +   The mdrunner() return value is stored in the shared arglist. */
 +static void mdrunner_start_fn(void *arg)
 +{
 +    struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
 +    struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure 
 +                                        that it's thread-local. This doesn't
 +                                        copy pointed-to items, of course,
 +                                        but those are all const. */
 +    t_commrec *cr;  /* we need a local version of this */
 +    FILE *fplog=NULL;
 +    t_filenm *fnm;
 +
 +    fnm = dup_tfn(mc.nfile, mc.fnm);
 +
 +    cr = init_par_threads(mc.cr);
 +
 +    if (MASTER(cr))
 +    {
 +        fplog=mc.fplog;
 +    }
 +
 +    /* NOTE(review): fnm is not freed here; it lives for the duration
 +     * of the rank, which runs until program exit. */
 +    mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv, 
 +                      mc.bVerbose, mc.bCompact, mc.nstglobalcomm, 
 +                      mc.ddxyz, mc.dd_node_order, mc.rdd,
 +                      mc.rconstr, mc.dddlb_opt, mc.dlb_scale, 
 +                      mc.ddcsx, mc.ddcsy, mc.ddcsz,
 +                      mc.nbpu_opt,
 +                      mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
 +                      mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce, 
 +                      mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
 +}
 +
 +/* called by mdrunner() to start a specific number of threads (including 
 +   the main thread) for thread-parallel runs. This in turn calls mdrunner()
 +   for each thread. 
 +   All options besides nthreads are the same as for mdrunner().
 +   Returns cr unchanged when fewer than 2 tMPI threads are requested,
 +   a new commrec for the calling (master) thread on success, or NULL
 +   when tMPI initialization fails. */
 +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt, 
 +              FILE *fplog,t_commrec *cr,int nfile, 
 +              const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +              gmx_bool bCompact, int nstglobalcomm,
 +              ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +              const char *dddlb_opt,real dlb_scale,
 +              const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +              const char *nbpu_opt,
 +              int nsteps_cmdline, int nstepout,int resetstep,
 +              int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
 +              real pforce,real cpt_period, real max_hours, 
 +              const char *deviceOptions, unsigned long Flags)
 +{
 +    int ret;
 +    struct mdrunner_arglist *mda;
 +    t_commrec *crn; /* the new commrec */
 +    t_filenm *fnmn;
 +
 +    /* first check whether we even need to start tMPI */
 +    if (hw_opt->nthreads_tmpi < 2)
 +    {
 +        return cr;
 +    }
 +
 +    /* a few small, one-time, almost unavoidable memory leaks: */
 +    snew(mda,1);
 +    fnmn=dup_tfn(nfile, fnm);
 +
 +    /* fill the data structure to pass as void pointer to thread start fn */
 +    mda->hw_opt=hw_opt;
 +    mda->fplog=fplog;
 +    mda->cr=cr;
 +    mda->nfile=nfile;
 +    mda->fnm=fnmn;
 +    mda->oenv=oenv;
 +    mda->bVerbose=bVerbose;
 +    mda->bCompact=bCompact;
 +    mda->nstglobalcomm=nstglobalcomm;
 +    mda->ddxyz[XX]=ddxyz[XX];
 +    mda->ddxyz[YY]=ddxyz[YY];
 +    mda->ddxyz[ZZ]=ddxyz[ZZ];
 +    mda->dd_node_order=dd_node_order;
 +    mda->rdd=rdd;
 +    mda->rconstr=rconstr;
 +    mda->dddlb_opt=dddlb_opt;
 +    mda->dlb_scale=dlb_scale;
 +    mda->ddcsx=ddcsx;
 +    mda->ddcsy=ddcsy;
 +    mda->ddcsz=ddcsz;
 +    mda->nbpu_opt=nbpu_opt;
 +    mda->nsteps_cmdline=nsteps_cmdline;
 +    mda->nstepout=nstepout;
 +    mda->resetstep=resetstep;
 +    mda->nmultisim=nmultisim;
 +    mda->repl_ex_nst=repl_ex_nst;
 +    mda->repl_ex_nex=repl_ex_nex;
 +    mda->repl_ex_seed=repl_ex_seed;
 +    mda->pforce=pforce;
 +    mda->cpt_period=cpt_period;
 +    mda->max_hours=max_hours;
 +    mda->deviceOptions=deviceOptions;
 +    mda->Flags=Flags;
 +
 +    fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
 +    fflush(stderr);
 +    /* now spawn new threads that start mdrunner_start_fn(), while 
 +       the main thread returns */
 +    ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_ALL_CORES,
 +                     mdrunner_start_fn, (void*)(mda) );
 +    if (ret!=TMPI_SUCCESS)
 +        return NULL;
 +
 +    /* make a new comm_rec to reflect the new situation */
 +    crn=init_par_threads(cr);
 +    return crn;
 +}
 +
 +
 +static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
 +                                            int nthreads_tot,
 +                                            int ngpu)
 +{
 +    int nthreads_tmpi;
 +
 +    /* There are no separate PME nodes here, as we ensured in
 +     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
 +     * and a conditional ensures we would not have ended up here.
 +     * Note that separate PME nodes might be switched on later.
 +     */
 +    if (ngpu > 0)
 +    {
 +        nthreads_tmpi = ngpu;
 +        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
 +        {
 +            nthreads_tmpi = nthreads_tot;
 +        }
 +    }
 +    else if (hw_opt->nthreads_omp > 0)
 +    {
 +        if (hw_opt->nthreads_omp > nthreads_tot)
 +        {
 +            gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
 +        }
 +        nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
 +    }
 +    else
 +    {
 +        /* TODO choose nthreads_omp based on hardware topology
 +           when we have a hardware topology detection library */
 +        /* Don't use OpenMP parallelization */
 +        nthreads_tmpi = nthreads_tot;
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +
 +
 +/* Get the number of threads to use for thread-MPI based on how many
 + * were requested, which algorithms we're using,
 + * and how many particles there are.
 + * At the point we have already called check_and_update_hw_opt.
 + * Thus all options should be internally consistent and consistent
 + * with the hardware, except that ntmpi could be larger than #GPU.
 + */
 +static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
 +                            gmx_hw_opt_t *hw_opt,
 +                            t_inputrec *inputrec, gmx_mtop_t *mtop,
 +                            const t_commrec *cr,
 +                            FILE *fplog)
 +{
 +    int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
 +    int min_atoms_per_mpi_thread;
 +    char *env;
 +    char sbuf[STRLEN];
 +    gmx_bool bCanUseGPU;
 +
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        /* Trivial, return right away */
 +        return hw_opt->nthreads_tmpi;
 +    }
 +
 +    /* How many total (#tMPI*#OpenMP) threads can we start? */ 
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        nthreads_tot_max = hw_opt->nthreads_tot;
 +    }
 +    else
 +    {
 +        nthreads_tot_max = tMPI_Thread_get_hw_number();
 +    }
 +
 +    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
 +    if (bCanUseGPU)
 +    {
 +        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +    }
 +    else
 +    {
 +        ngpu = 0;
 +    }
 +
 +    nthreads_tmpi =
 +        get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
 +
 +    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
 +    {
 +        /* Steps are divided over the nodes iso splitting the atoms */
 +        min_atoms_per_mpi_thread = 0;
 +    }
 +    else
 +    {
 +        if (bCanUseGPU)
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
 +        }
 +        else
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
 +        }
 +    }
 +
 +    /* Check if an algorithm does not support parallel simulation.  */
 +    if (nthreads_tmpi != 1 &&
 +        ( inputrec->eI == eiLBFGS ||
 +          inputrec->coulombtype == eelEWALD ) )
 +    {
 +        nthreads_tmpi = 1;
 +
 +        md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
 +        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
 +        {
 +            gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
 +        }
 +    }
 +    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
 +    {
 +        /* the thread number was chosen automatically, but there are too many
 +           threads (too few atoms per thread) */
 +        nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
 +
 +        if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
 +        {
 +            /* TODO replace this once we have proper HT detection
 +             * Use only multiples of 4 above 8 threads
 +             * or with an 8-core processor
 +             * (to avoid 6 threads on 8 core processors with 4 real cores).
 +             */
 +            nthreads_new = (nthreads_new/4)*4;
 +        }
 +        else if (nthreads_new > 4)
 +        {
 +            /* Avoid 5 or 7 threads */
 +            nthreads_new = (nthreads_new/2)*2;
 +        }
 +
 +        nthreads_tmpi = nthreads_new;
 +
 +        fprintf(stderr,"\n");
 +        fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
 +        fprintf(stderr,"      only starting %d thread-MPI threads.\n",nthreads_tmpi);
 +        fprintf(stderr,"      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +#endif /* GMX_THREAD_MPI */
 +
 +
/* Environment variable for setting nstlist */
static const char*  NSTLIST_ENVVAR          =  "GMX_NSTLIST";
/* Try to increase nstlist when using a GPU with nstlist less than this */
static const int    NSTLIST_GPU_ENOUGH      = 20;
/* Increase nstlist until the non-bonded cost increases more than this factor */
static const float  NBNXN_GPU_LIST_OK_FAC   = 1.25;
/* Don't increase nstlist beyond a non-bonded cost increase of this factor */
static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.40;
 +
 +/* Try to increase nstlist when running on a GPU */
 +static void increase_nstlist(FILE *fp,t_commrec *cr,
 +                             t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
 +{
 +    char *env;
 +    int  nstlist_orig,nstlist_prev;
 +    verletbuf_list_setup_t ls;
 +    real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
 +    int  i;
 +    t_state state_tmp;
 +    gmx_bool bBox,bDD,bCont;
 +    const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
 +    const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
 +    const char *box_err="Can not increase nstlist for GPU run because the box is too small";
 +    const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
 +    char buf[STRLEN];
 +
 +    /* Number of + nstlist alternative values to try when switching  */
 +    const int nstl[]={ 20, 25, 40, 50 };
 +#define NNSTL  sizeof(nstl)/sizeof(nstl[0])
 +
 +    env = getenv(NSTLIST_ENVVAR);
 +    if (env == NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,nstl_fmt,ir->nstlist);
 +        }
 +    }
 +
 +    if (ir->verletbuf_drift == 0)
 +    {
 +        gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
 +    }
 +
 +    if (ir->verletbuf_drift < 0)
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n",vbd_err);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n",vbd_err);
 +        }
 +
 +        return;
 +    }
 +
 +    nstlist_orig = ir->nstlist;
 +    if (env != NULL)
 +    {
 +        sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n",buf);
 +        }
 +        sscanf(env,"%d",&ir->nstlist);
 +    }
 +
 +    verletbuf_get_list_setup(TRUE,&ls);
 +
 +    /* Allow rlist to make the list double the size of the cut-off sphere */
 +    rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
 +    rlist_ok  = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
 +    rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
 +    if (debug)
 +    {
 +        fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
 +                rlist_inc,rlist_max);
 +    }
 +
 +    i = 0;
 +    nstlist_prev = nstlist_orig;
 +    rlist_prev   = ir->rlist;
 +    do
 +    {
 +        if (env == NULL)
 +        {
 +            ir->nstlist = nstl[i];
 +        }
 +
 +        /* Set the pair-list buffer size in ir */
 +        calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
 +                                NULL,&rlist_new);
 +
 +        /* Does rlist fit in the box? */
 +        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
 +        bDD  = TRUE;
 +        if (bBox && DOMAINDECOMP(cr))
 +        {
 +            /* Check if rlist fits in the domain decomposition */
 +            if (inputrec2nboundeddim(ir) < DIM)
 +            {
 +                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
 +            }
 +            copy_mat(box,state_tmp.box);
 +            bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
 +        }
 +
 +        bCont = FALSE;
 +
 +        if (env == NULL)
 +        {
 +            if (bBox && bDD && rlist_new <= rlist_max)
 +            {
 +                /* Increase nstlist */
 +                nstlist_prev = ir->nstlist;
 +                rlist_prev   = rlist_new;
 +                bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
 +            }
 +            else
 +            {
 +                /* Stick with the previous nstlist */
 +                ir->nstlist = nstlist_prev;
 +                rlist_new   = rlist_prev;
 +                bBox = TRUE;
 +                bDD  = TRUE;
 +            }
 +        }
 +
 +        i++;
 +    }
 +    while (bCont);
 +
 +    if (!bBox || !bDD)
 +    {
 +        gmx_warning(!bBox ? box_err : dd_err);
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
 +        }
 +        ir->nstlist = nstlist_orig;
 +    }
 +    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
 +    {
 +        sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
 +                nstlist_orig,ir->nstlist,
 +                ir->rlist,rlist_new);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n\n",buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n\n",buf);
 +        }
 +        ir->rlist     = rlist_new;
 +        ir->rlistlong = rlist_new;
 +    }
 +}
 +
 +static void prepare_verlet_scheme(FILE *fplog,
 +                                  gmx_hw_info_t *hwinfo,
 +                                  t_commrec *cr,
 +                                  gmx_hw_opt_t *hw_opt,
 +                                  const char *nbpu_opt,
 +                                  t_inputrec *ir,
 +                                  const gmx_mtop_t *mtop,
 +                                  matrix box,
 +                                  gmx_bool *bUseGPU)
 +{
 +    /* Here we only check for GPU usage on the MPI master process,
 +     * as here we don't know how many GPUs we will use yet.
 +     * We check for a GPU on all processes later.
 +     */
 +    *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
 +
 +    if (ir->verletbuf_drift > 0)
 +    {
 +        /* Update the Verlet buffer size for the current run setup */
 +        verletbuf_list_setup_t ls;
 +        real rlist_new;
 +
 +        /* Here we assume CPU acceleration is on. But as currently
 +         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
 +         * and 4x2 gives a larger buffer than 4x4, this is ok.
 +         */
 +        verletbuf_get_list_setup(*bUseGPU,&ls);
 +
 +        calc_verlet_buffer_size(mtop,det(box),ir,
 +                                ir->verletbuf_drift,&ls,
 +                                NULL,&rlist_new);
 +        if (rlist_new != ir->rlist)
 +        {
 +            if (fplog != NULL)
 +            {
 +                fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 +                        ir->rlist,rlist_new,
 +                        ls.cluster_size_i,ls.cluster_size_j);
 +            }
 +            ir->rlist     = rlist_new;
 +            ir->rlistlong = rlist_new;
 +        }
 +    }
 +
 +    /* With GPU or emulation we should check nstlist for performance */
 +    if ((EI_DYNAMICS(ir->eI) &&
 +         *bUseGPU &&
 +         ir->nstlist < NSTLIST_GPU_ENOUGH) ||
 +        getenv(NSTLIST_ENVVAR) != NULL)
 +    {
 +        /* Choose a better nstlist */
 +        increase_nstlist(fplog,cr,ir,mtop,box);
 +    }
 +}
 +
 +static void convert_to_verlet_scheme(FILE *fplog,
 +                                     t_inputrec *ir,
 +                                     gmx_mtop_t *mtop,real box_vol)
 +{
 +    char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
 +
 +    md_print_warn(NULL,fplog,"%s\n",conv_mesg);
 +
 +    ir->cutoff_scheme   = ecutsVERLET;
 +    ir->verletbuf_drift = 0.005;
 +
 +    if (ir->rcoulomb != ir->rvdw)
 +    {
 +        gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
 +    }
 +
 +    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
 +    {
 +        gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
 +    }
 +    else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
 +    {
 +        md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
 +
 +        if (EVDW_SWITCHED(ir->vdwtype))
 +        {
 +            ir->vdwtype = evdwCUT;
 +        }
 +        if (EEL_SWITCHED(ir->coulombtype))
 +        {
 +            if (EEL_FULL(ir->coulombtype))
 +            {
 +                /* With full electrostatic only PME can be switched */
 +                ir->coulombtype = eelPME;
 +            }
 +            else
 +            {
 +                md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
 +                ir->coulombtype = eelRF;
 +                ir->epsilon_rf  = 0.0;
 +            }
 +        }
 +
 +        /* We set the target energy drift to a small number.
 +         * Note that this is only for testing. For production the user
 +         * should think about this and set the mdp options.
 +         */
 +        ir->verletbuf_drift = 1e-4;
 +    }
 +
 +    if (inputrec2nboundeddim(ir) != 3)
 +    {
 +        gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
 +    }
 +
 +    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
 +    {
 +        gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
 +    }
 +
 +    if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
 +    {
 +        verletbuf_list_setup_t ls;
 +
 +        verletbuf_get_list_setup(FALSE,&ls);
 +        calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
 +                                NULL,&ir->rlist);
 +    }
 +    else
 +    {
 +        ir->verletbuf_drift = -1;
 +        ir->rlist           = 1.05*max(ir->rvdw,ir->rcoulomb);
 +    }
 +
 +    gmx_mtop_remove_chargegroups(mtop);
 +}
 +
 +
 +/* Set CPU affinity. Can be important for performance.
 +   On some systems (e.g. Cray) CPU Affinity is set by default.
 +   But default assigning doesn't work (well) with only some ranks
 +   having threads. This causes very low performance.
 +   External tools have cumbersome syntax for setting affinity
 +   in the case that only some ranks have threads.
 +   Thus it is important that GROMACS sets the affinity internally
 +   if only PME is using threads.
 +*/
 +static void set_cpu_affinity(FILE *fplog,
 +                             const t_commrec *cr,
 +                             const gmx_hw_opt_t *hw_opt,
 +                             int nthreads_pme,
 +                             const gmx_hw_info_t *hwinfo,
 +                             const t_inputrec *inputrec)
 +{
 +#ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
 +#ifdef __linux /* TODO: only linux? why not everywhere if sched_setaffinity is available */
 +    if (hw_opt->bThreadPinning)
 +    {
 +        int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
 +        int offset;
 +        char *env;
 +
 +        /* threads on this MPI process or TMPI thread */
 +        if (cr->duty & DUTY_PP)
 +        {
 +            nthread_local = gmx_omp_nthreads_get(emntNonbonded);
 +        }
 +        else
 +        {
 +            nthread_local = gmx_omp_nthreads_get(emntPME);
 +        }
 +
 +        /* map the current process to cores */
 +        thread = 0;
 +        nthread_node = nthread_local;
 +#ifdef GMX_MPI
 +        if (PAR(cr) || MULTISIM(cr))
 +        {
 +            /* We need to determine a scan of the thread counts in this
 +             * compute node.
 +             */
 +            MPI_Comm comm_intra;
 +
 +            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->nodeid_intra,
 +                           &comm_intra);
 +            MPI_Scan(&nthread_local,&thread,1,MPI_INT,MPI_SUM,comm_intra);
 +            /* MPI_Scan is inclusive, but here we need exclusive */
 +            thread -= nthread_local;
 +            /* Get the total number of threads on this physical node */
 +            MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
 +            MPI_Comm_free(&comm_intra);
 +        }
 +#endif
 +
 +        offset = 0;
 +        if (hw_opt->core_pinning_offset > 0)
 +        {
 +            offset = hw_opt->core_pinning_offset;
 +            if (SIMMASTER(cr))
 +            {
 +                fprintf(stderr, "Applying core pinning offset %d\n", offset);
 +            }
 +            if (fplog)
 +            {
 +                fprintf(fplog, "Applying core pinning offset %d\n", offset);
 +            }
 +        }
 +
 +        /* With Intel Hyper-Threading enabled, we want to pin consecutive
 +         * threads to physical cores when using more threads than physical
 +         * cores or when the user requests so.
 +         */
 +        nthread_hw_max = hwinfo->nthreads_hw_avail;
 +        nphyscore = -1;
 +        if (hw_opt->bPinHyperthreading ||
 +            (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
 +             nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
 +        {
 +            if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
 +            {
 +                /* We print to stderr on all processes, as we might have
 +                 * different settings on different physical nodes.
 +                 */
 +                if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
 +                {
 +                    md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
 +                                  "but non-Intel CPU detected (vendor: %s)\n",
 +                                  gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
 +                }
 +                else
 +                {
 +                    md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
 +                                  "but the CPU detected does not have Intel Hyper-Threading support "
 +                                  "(or it is turned off)\n");
 +                }
 +            }
 +            nphyscore = nthread_hw_max/2;
 +
 +            if (SIMMASTER(cr))
 +            {
 +                fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
 +                        nphyscore);
 +            }
 +            if (fplog)
 +            {
 +                fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
 +                        nphyscore);
 +            }
 +        }
 +
 +        /* set the per-thread affinity */
 +#pragma omp parallel firstprivate(thread) num_threads(nthread_local)
 +        {
 +            cpu_set_t mask;
 +            int core;
 +
 +            CPU_ZERO(&mask);
 +            thread += gmx_omp_get_thread_num();
 +            if (nphyscore <= 0)
 +            {
 +                core = offset + thread;
 +            }
 +            else
 +            {
 +                /* Lock pairs of threads to the same hyperthreaded core */
 +                core = offset + thread/2 + (thread % 2)*nphyscore;
 +            }
 +            CPU_SET(core, &mask);
 +            sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
 +        }
 +    }
 +#endif /* __linux    */
 +#endif /* GMX_OPENMP */
 +}
 +
 +
 +static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
 +                                    int cutoff_scheme)
 +{
 +    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
 +
 +#ifndef GMX_THREAD_MPI
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 +    }
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 +    }
 +#endif
 +
 +    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
 +    {
 +        /* We have the same number of OpenMP threads for PP and PME processes,
 +         * thus we can perform several consistency checks.
 +         */
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
 +        }
 +
 +        if (hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +    }
 +
 +#ifndef GMX_OPENMP
 +    if (hw_opt->nthreads_omp > 1)
 +    {
 +        gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
 +    }
 +#endif
 +
 +    if (cutoff_scheme == ecutsGROUP)
 +    {
 +        /* We only have OpenMP support for PME only nodes */
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
 +                      ecutscheme_names[cutoff_scheme],
 +                      ecutscheme_names[ecutsVERLET]);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
 +    {
 +        gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
 +    }
 +
 +    if (hw_opt->nthreads_tot == 1)
 +    {
 +        hw_opt->nthreads_tmpi = 1;
 +
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
 +                      hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
 +    {
 +        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
 +                hw_opt->nthreads_tot,
 +                hw_opt->nthreads_tmpi,
 +                hw_opt->nthreads_omp,
 +                hw_opt->nthreads_omp_pme,
 +                hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
 +                
 +    }
 +}
 +
 +
 +/* Override the value in inputrec with value passed on the command line (if any) */
 +static void override_nsteps_cmdline(FILE *fplog,
 +                                    int nsteps_cmdline,
 +                                    t_inputrec *ir,
 +                                    const t_commrec *cr)
 +{
 +    assert(ir);
 +    assert(cr);
 +
 +    /* override with anything else than the default -2 */
 +    if (nsteps_cmdline > -2)
 +    {
 +        char stmp[STRLEN];
 +
 +        ir->nsteps = nsteps_cmdline;
 +        if (EI_DYNAMICS(ir->eI))
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
 +                    nsteps_cmdline, nsteps_cmdline*ir->delta_t);
 +        }
 +        else
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
 +                    nsteps_cmdline);
 +        }
 +
 +        md_print_warn(cr, fplog, "%s\n", stmp);
 +    }
 +}
 +
/* Data structure set by SIMMASTER which needs to be passed to all nodes
 * before the other nodes have read the tpx file and called gmx_detect_hardware.
 * (Broadcast with gmx_bcast_sim when running with real MPI.)
 */
typedef struct {
    int      cutoff_scheme; /* The cutoff-scheme from inputrec_t */
    gmx_bool bUseGPU;       /* Use GPU or GPU emulation          */
} master_inf_t;
 +
 +int mdrunner(gmx_hw_opt_t *hw_opt,
 +             FILE *fplog,t_commrec *cr,int nfile,
 +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +             gmx_bool bCompact, int nstglobalcomm,
 +             ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +             const char *dddlb_opt,real dlb_scale,
 +             const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +             const char *nbpu_opt,
 +             int nsteps_cmdline, int nstepout,int resetstep,
 +             int nmultisim,int repl_ex_nst,int repl_ex_nex,
 +             int repl_ex_seed, real pforce,real cpt_period,real max_hours,
 +             const char *deviceOptions, unsigned long Flags)
 +{
 +    gmx_bool   bForceUseGPU,bTryUseGPU;
 +    double     nodetime=0,realtime;
 +    t_inputrec *inputrec;
 +    t_state    *state=NULL;
 +    matrix     box;
 +    gmx_ddbox_t ddbox={0};
 +    int        npme_major,npme_minor;
 +    real       tmpr1,tmpr2;
 +    t_nrnb     *nrnb;
 +    gmx_mtop_t *mtop=NULL;
 +    t_mdatoms  *mdatoms=NULL;
 +    t_forcerec *fr=NULL;
 +    t_fcdata   *fcd=NULL;
 +    real       ewaldcoeff=0;
 +    gmx_pme_t  *pmedata=NULL;
 +    gmx_vsite_t *vsite=NULL;
 +    gmx_constr_t constr;
 +    int        i,m,nChargePerturbed=-1,status,nalloc;
 +    char       *gro;
 +    gmx_wallcycle_t wcycle;
 +    gmx_bool       bReadRNG,bReadEkin;
 +    int        list;
 +    gmx_runtime_t runtime;
 +    int        rc;
 +    gmx_large_int_t reset_counters;
 +    gmx_edsam_t ed=NULL;
 +    t_commrec   *cr_old=cr; 
 +    int         nthreads_pme=1;
 +    int         nthreads_pp=1;
 +    gmx_membed_t membed=NULL;
 +    gmx_hw_info_t *hwinfo=NULL;
 +    master_inf_t minf={-1,FALSE};
 +
 +    /* CAUTION: threads may be started later on in this function, so
 +       cr doesn't reflect the final parallel state right now */
 +    snew(inputrec,1);
 +    snew(mtop,1);
 +    
 +    if (Flags & MD_APPENDFILES) 
 +    {
 +        fplog = NULL;
 +    }
 +
 +    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
 +    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
 +
 +    snew(state,1);
 +    if (SIMMASTER(cr)) 
 +    {
 +        /* Read (nearly) all data required for the simulation */
 +        read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
 +
 +        if (inputrec->cutoff_scheme != ecutsVERLET &&
 +            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
 +        {
 +            convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
 +        }
 +
 +        /* Detect hardware, gather information. With tMPI only thread 0 does it
 +         * and after threads are started broadcasts hwinfo around. */
 +        snew(hwinfo, 1);
 +        gmx_detect_hardware(fplog, hwinfo, cr,
 +                            bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
 +
 +        minf.cutoff_scheme = inputrec->cutoff_scheme;
 +        minf.bUseGPU       = FALSE;
 +
 +        if (inputrec->cutoff_scheme == ecutsVERLET)
 +        {
 +            prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
 +                                  inputrec,mtop,state->box,
 +                                  &minf.bUseGPU);
 +        }
 +        else if (hwinfo->bCanUseGPU)
 +        {
 +            md_print_warn(cr,fplog,
 +                          "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
 +                          "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
 +                          "      (for quick performance testing you can use the -testverlet option)\n");
 +
 +            if (bForceUseGPU)
 +            {
 +                gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
 +            }
 +        }
 +    }
 +#ifndef GMX_THREAD_MPI
 +    if (PAR(cr))
 +    {
 +        gmx_bcast_sim(sizeof(minf),&minf,cr);
 +    }
 +#endif
 +    if (minf.bUseGPU && cr->npmenodes == -1)
 +    {
 +        /* Don't automatically use PME-only nodes with GPUs */
 +        cr->npmenodes = 0;
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    /* With thread-MPI inputrec is only set here on the master thread */
 +    if (SIMMASTER(cr))
 +#endif
 +    {
 +        check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
 +
 +#ifdef GMX_THREAD_MPI
 +        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
 +        {
 +            gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
 +        }
 +#endif
 +
 +        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
 +            cr->npmenodes <= 0)
 +        {
 +            gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
 +        }
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    if (SIMMASTER(cr))
 +    {
 +        /* NOW the threads will be started: */
 +        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
 +                                                 hw_opt,
 +                                                 inputrec, mtop,
 +                                                 cr, fplog);
 +        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 1)
 +        {
 +            /* now start the threads. */
 +            cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm, 
 +                                      oenv, bVerbose, bCompact, nstglobalcomm, 
 +                                      ddxyz, dd_node_order, rdd, rconstr, 
 +                                      dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
 +                                      nbpu_opt,
 +                                      nsteps_cmdline, nstepout, resetstep, nmultisim, 
 +                                      repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
 +                                      cpt_period, max_hours, deviceOptions, 
 +                                      Flags);
 +            /* the main thread continues here with a new cr. We don't deallocate
 +               the old cr because other threads may still be reading it. */
 +            if (cr == NULL)
 +            {
 +                gmx_comm("Failed to spawn threads");
 +            }
 +        }
 +    }
 +#endif
 +    /* END OF CAUTION: cr is now reliable */
 +
 +    /* g_membed initialisation *
 +     * Because we change the mtop, init_membed is called before the init_parallel *
 +     * (in case we ever want to make it run in parallel) */
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"Initializing membed");
 +        }
 +        membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* now broadcast everything to the non-master nodes/threads: */
 +        init_parallel(fplog, cr, inputrec, mtop);
 +
 +        /* This check needs to happen after get_nthreads_mpi() */
 +        if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
 +                                 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
 +        }
 +    }
 +    if (fplog != NULL)
 +    {
 +        pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
 +    }
 +
 +#if defined GMX_THREAD_MPI
 +    /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
 +     * to the other threads  -- slightly uncool, but works fine, just need to
 +     * make sure that the data doesn't get freed twice. */
 +    if (cr->nnodes > 1)
 +    {
 +        if (!SIMMASTER(cr))
 +        {
 +            snew(hwinfo, 1);
 +        }
 +        gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
 +    }
 +#else
 +    if (PAR(cr) && !SIMMASTER(cr))
 +    {
 +        /* now we have inputrec on all nodes, can run the detection */
 +        /* TODO: perhaps it's better to propagate within a node instead? */
 +        snew(hwinfo, 1);
 +        gmx_detect_hardware(fplog, hwinfo, cr,
 +                                 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
 +    }
 +#endif
 +
 +    /* now make sure the state is initialized and propagated */
 +    set_state_entries(state,inputrec,cr->nnodes);
 +
 +    /* remove when vv and rerun works correctly! */
 +    if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
 +    {
 +        gmx_fatal(FARGS,
 +                  "Currently can't do velocity verlet with rerun in parallel.");
 +    }
 +
 +    /* A parallel command line option consistency check that we can
 +       only do after any threads have started. */
 +    if (!PAR(cr) &&
 +        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
 +    {
 +        gmx_fatal(FARGS,
 +                  "The -dd or -npme option request a parallel simulation, "
 +#ifndef GMX_MPI
 +                  "but %s was compiled without threads or MPI enabled"
 +#else
 +#ifdef GMX_THREAD_MPI
 +                  "but the number of threads (option -nt) is 1"
 +#else
 +                  "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
 +#endif
 +#endif
 +                  , ShortProgram()
 +            );
 +    }
 +
 +    if ((Flags & MD_RERUN) &&
 +        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 +    {
 +        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
 +    }
 +
 +    if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
 +    {
 +        /* All-vs-all loops do not work with domain decomposition */
 +        Flags |= MD_PARTDEC;
 +    }
 +
 +    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
 +    {
 +        if (cr->npmenodes > 0)
 +        {
 +            if (!EEL_PME(inputrec->coulombtype))
 +            {
 +                gmx_fatal_collective(FARGS,cr,NULL,
 +                                     "PME nodes are requested, but the system does not use PME electrostatics");
 +            }
 +            if (Flags & MD_PARTDEC)
 +            {
 +                gmx_fatal_collective(FARGS,cr,NULL,
 +                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
 +            }
 +        }
 +
 +        cr->npmenodes = 0;
 +    }
 +
 +#ifdef GMX_FAHCORE
 +    fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
 +#endif
 +
 +    /* NMR restraints must be initialized before load_checkpoint,
 +     * since with time averaging the history is added to t_state.
 +     * For proper consistency check we therefore need to extend
 +     * t_state here.
 +     * So the PME-only nodes (if present) will also initialize
 +     * the distance restraints.
 +     */
 +    snew(fcd,1);
 +
 +    /* This needs to be called before read_checkpoint to extend the state */
 +    init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
 +
 +    if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
 +    {
 +        if (PAR(cr) && !(Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
 +        }
 +        /* Orientation restraints */
 +        if (MASTER(cr))
 +        {
 +            init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
 +                        state);
 +        }
 +    }
 +
 +    if (DEFORM(*inputrec))
 +    {
 +        /* Store the deform reference box before reading the checkpoint */
 +        if (SIMMASTER(cr))
 +        {
 +            copy_mat(state->box,box);
 +        }
 +        if (PAR(cr))
 +        {
 +            gmx_bcast(sizeof(box),box,cr);
 +        }
 +        /* Because we do not have the update struct available yet
 +         * in which the reference values should be stored,
 +         * we store them temporarily in static variables.
 +         * This should be thread safe, since they are only written once
 +         * and with identical values.
 +         */
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        deform_init_init_step_tpx = inputrec->init_step;
 +        copy_mat(box,deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    if (opt2bSet("-cpi",nfile,fnm)) 
 +    {
 +        /* Check if checkpoint file exists before doing continuation.
 +         * This way we can use identical input options for the first and subsequent runs...
 +         */
 +        if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
 +        {
 +            load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
 +                            cr,Flags & MD_PARTDEC,ddxyz,
 +                            inputrec,state,&bReadRNG,&bReadEkin,
 +                            (Flags & MD_APPENDFILES),
 +                            (Flags & MD_APPENDFILESSET));
 +            
 +            if (bReadRNG)
 +            {
 +                Flags |= MD_READ_RNG;
 +            }
 +            if (bReadEkin)
 +            {
 +                Flags |= MD_READ_EKIN;
 +            }
 +        }
 +    }
 +
 +    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
 +#ifdef GMX_THREAD_MPI
 +        /* With thread MPI only the master node/thread exists in mdrun.c,
 +         * therefore non-master nodes need to open the "seppot" log file here.
 +         */
 +        || (!MASTER(cr) && (Flags & MD_SEPPOT))
 +#endif
 +        )
 +    {
 +        gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
 +                             Flags,&fplog);
 +    }
 +
 +    /* override nsteps with value from cmdline */
 +    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
 +
 +    if (SIMMASTER(cr)) 
 +    {
 +        copy_mat(state->box,box);
 +    }
 +
 +    if (PAR(cr)) 
 +    {
 +        gmx_bcast(sizeof(box),box,cr);
 +    }
 +
 +    /* Essential dynamics */
 +    if (opt2bSet("-ei",nfile,fnm))
 +    {
 +        /* Open input and output files, allocate space for ED data structure */
 +        ed = ed_open(nfile,fnm,Flags,cr);
 +    }
 +
 +    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
 +                     EI_TPI(inputrec->eI) ||
 +                     inputrec->eI == eiNM))
 +    {
 +        cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
 +                                           dddlb_opt,dlb_scale,
 +                                           ddcsx,ddcsy,ddcsz,
 +                                           mtop,inputrec,
 +                                           box,state->x,
 +                                           &ddbox,&npme_major,&npme_minor);
 +
 +        make_dd_communicators(fplog,cr,dd_node_order);
 +
 +        /* Set overallocation to avoid frequent reallocation of arrays */
 +        set_over_alloc_dd(TRUE);
 +    }
 +    else
 +    {
 +        /* PME, if used, is done on all nodes with 1D decomposition */
 +        cr->npmenodes = 0;
 +        cr->duty = (DUTY_PP | DUTY_PME);
 +        npme_major = 1;
 +        npme_minor = 1;
 +        if (!EI_TPI(inputrec->eI))
 +        {
 +            npme_major = cr->nnodes;
 +        }
 +        
 +        if (inputrec->ePBC == epbcSCREW)
 +        {
 +            gmx_fatal(FARGS,
 +                      "pbc=%s is only implemented with domain decomposition",
 +                      epbc_names[inputrec->ePBC]);
 +        }
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* After possible communicator splitting in make_dd_communicators.
 +         * we can set up the intra/inter node communication.
 +         */
 +        gmx_setup_nodecomm(fplog,cr);
 +    }
 +
 +    /* Initialize per-node process ID and counters. */
 +    gmx_init_intra_counters(cr);
 +
 +#ifdef GMX_MPI
 +    md_print_info(cr,fplog,"Using %d MPI %s\n",
 +                  cr->nnodes,
 +#ifdef GMX_THREAD_MPI
 +                  cr->nnodes==1 ? "thread" : "threads"
 +#else
 +                  cr->nnodes==1 ? "process" : "processes"
 +#endif
 +                  );
 +#endif
 +
 +    gmx_omp_nthreads_init(fplog, cr,
 +                          hwinfo->nthreads_hw_avail,
 +                          hw_opt->nthreads_omp,
 +                          hw_opt->nthreads_omp_pme,
 +                          (cr->duty & DUTY_PP) == 0,
 +                          inputrec->cutoff_scheme == ecutsVERLET);
 +
 +    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
 +
 +    /* getting number of PP/PME threads
 +       PME: env variable should be read only on one node to make sure it is 
 +       identical everywhere;
 +     */
 +    /* TODO nthreads_pp is only used for pinning threads.
 +     * This is a temporary solution until we have a hw topology library.
 +     */
 +    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
 +    nthreads_pme = gmx_omp_nthreads_get(emntPME);
 +
 +    wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
 +
 +    if (PAR(cr))
 +    {
 +        /* Master synchronizes its value of reset_counters with all nodes 
 +         * including PME only nodes */
 +        reset_counters = wcycle_get_reset_counters(wcycle);
 +        gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
 +        wcycle_set_reset_counters(wcycle, reset_counters);
 +    }
 +
 +    snew(nrnb,1);
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* For domain decomposition we allocate dynamically
 +         * in dd_partition_system.
 +         */
 +        if (DOMAINDECOMP(cr))
 +        {
 +            bcast_state_setup(cr,state);
 +        }
 +        else
 +        {
 +            if (PAR(cr))
 +            {
 +                bcast_state(cr,state,TRUE);
 +            }
 +        }
 +
 +        /* Initiate forcerecord */
 +        fr = mk_forcerec();
 +        fr->hwinfo = hwinfo;
 +        init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +                      opt2fn("-table",nfile,fnm),
 +                      opt2fn("-tabletf",nfile,fnm),
 +                      opt2fn("-tablep",nfile,fnm),
 +                      opt2fn("-tableb",nfile,fnm),
 +                      nbpu_opt,
 +                      FALSE,pforce);
 +
 +        /* version for PCA_NOT_READ_NODE (see md.c) */
 +        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +          "nofile","nofile","nofile","nofile",FALSE,pforce);
 +          */        
 +        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
 +
 +        /* Initialize QM-MM */
 +        if(fr->bQMMM)
 +        {
 +            init_QMMMrec(cr,box,mtop,inputrec,fr);
 +        }
 +
 +        /* Initialize the mdatoms structure.
 +         * mdatoms is not filled with atom data,
 +         * as this can not be done now with domain decomposition.
 +         */
 +        mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
 +
 +        /* Initialize the virtual site communication */
++        vsite = init_vsite(mtop,cr,FALSE);
 +
 +        calc_shifts(box,fr->shift_vec);
 +
 +        /* With periodic molecules the charge groups should be whole at start up
 +         * and the virtual sites should not be far from their proper positions.
 +         */
 +        if (!inputrec->bContinuation && MASTER(cr) &&
 +            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
 +        {
 +            /* Make molecules whole at start of run */
 +            if (fr->ePBC != epbcNONE)
 +            {
 +                do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
 +            }
 +            if (vsite)
 +            {
 +                /* Correct initial vsite positions are required
 +                 * for the initial distribution in the domain decomposition
 +                 * and for the initial shell prediction.
 +                 */
 +                construct_vsites_mtop(fplog,vsite,mtop,state->x);
 +            }
 +        }
 +
 +        if (EEL_PME(fr->eeltype))
 +        {
 +            ewaldcoeff = fr->ewaldcoeff;
 +            pmedata = &fr->pmedata;
 +        }
 +        else
 +        {
 +            pmedata = NULL;
 +        }
 +    }
 +    else
 +    {
 +        /* This is a PME only node */
 +
 +        /* We don't need the state */
 +        done_state(state);
 +
 +        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
 +        snew(pmedata,1);
 +    }
 +
 +#if defined GMX_THREAD_MPI
 +    /* With the number of TMPI threads equal to the number of cores
 +     * we already pinned in thread-MPI, so don't pin again here.
 +     */
 +    if (hw_opt->nthreads_tmpi != tMPI_Thread_get_hw_number())
 +#endif
 +    {
 +        /* Set the CPU affinity */
 +        set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
 +    }
 +
 +    /* Initiate PME if necessary,
 +     * either on all nodes or on dedicated PME nodes only. */
 +    if (EEL_PME(inputrec->coulombtype))
 +    {
 +        if (mdatoms)
 +        {
 +            nChargePerturbed = mdatoms->nChargePerturbed;
 +        }
 +        if (cr->npmenodes > 0)
 +        {
 +            /* The PME only nodes need to know nChargePerturbed */
 +            gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
 +        }
 +
 +        if (cr->duty & DUTY_PME)
 +        {
 +            status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
 +                                  mtop ? mtop->natoms : 0,nChargePerturbed,
 +                                  (Flags & MD_REPRODUCIBLE),nthreads_pme);
 +            if (status != 0) 
 +            {
 +                gmx_fatal(FARGS,"Error %d initializing PME",status);
 +            }
 +        }
 +    }
 +
 +
 +    if (integrator[inputrec->eI].func == do_md
 +#ifdef GMX_OPENMM
 +        ||
 +        integrator[inputrec->eI].func == do_md_openmm
 +#endif
 +        )
 +    {
 +        /* Turn on signal handling on all nodes */
 +        /*
 +         * (A user signal from the PME nodes (if any)
 +         * is communicated to the PP nodes.
 +         */
 +        signal_handler_install();
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        if (inputrec->ePull != epullNO)
 +        {
 +            /* Initialize pull code */
 +            init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
 +                      EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +           /* Initialize enforced rotation code */
 +           init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
 +                    bVerbose,Flags);
 +        }
 +
 +        constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
 +                            Flags & MD_DDBONDCHECK,fr->cginfo_mb);
 +
 +            set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
 +
 +            setup_dd_grid(fplog,cr->dd);
 +        }
 +
 +        /* Now do whatever the user wants us to do (how flexible...) */
 +        integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
 +                                      oenv,bVerbose,bCompact,
 +                                      nstglobalcomm,
 +                                      vsite,constr,
 +                                      nstepout,inputrec,mtop,
 +                                      fcd,state,
 +                                      mdatoms,nrnb,wcycle,ed,fr,
 +                                      repl_ex_nst,repl_ex_nex,repl_ex_seed,
 +                                      membed,
 +                                      cpt_period,max_hours,
 +                                      deviceOptions,
 +                                      Flags,
 +                                      &runtime);
 +
 +        if (inputrec->ePull != epullNO)
 +        {
 +            finish_pull(fplog,inputrec->pull);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +            finish_rot(fplog,inputrec->rot);
 +        }
 +
 +    } 
 +    else 
 +    {
 +        /* do PME only */
 +        gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
 +    }
 +
 +    if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
 +    {
 +        /* Some timing stats */  
 +        if (SIMMASTER(cr))
 +        {
 +            if (runtime.proc == 0)
 +            {
 +                runtime.proc = runtime.real;
 +            }
 +        }
 +        else
 +        {
 +            runtime.real = 0;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle,ewcRUN);
 +
 +    /* Finish up, write some stuff
 +     * if rerunMD, don't write last frame again 
 +     */
 +    finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
 +               inputrec,nrnb,wcycle,&runtime,
 +               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
 +                 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
 +               nthreads_pp, 
 +               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
 +
 +    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
 +    {
 +        char gpu_err_str[STRLEN];
 +
 +        /* free GPU memory and uninitialize GPU (by destroying the context) */
 +        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
 +
 +        if (!free_gpu(gpu_err_str))
 +        {
 +            gmx_warning("On node %d failed to free GPU #%d: %s",
 +                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
 +        }
 +    }
 +
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +        sfree(membed);
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    if (PAR(cr) && SIMMASTER(cr))
 +#endif
 +    {
 +        gmx_hardware_info_free(hwinfo);
 +    }
 +
 +    /* Does what it says */  
 +    print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
 +
 +    /* Close logfile already here if we were appending to it */
 +    if (MASTER(cr) && (Flags & MD_APPENDFILES))
 +    {
 +        gmx_log_close(fplog);
 +    } 
 +
 +    rc=(int)gmx_get_stop_condition();
 +
 +#ifdef GMX_THREAD_MPI
 +    /* we need to join all threads. The sub-threads join when they
 +       exit this function, but the master thread needs to be told to 
 +       wait for that. */
 +    if (PAR(cr) && MASTER(cr))
 +    {
 +        tMPI_Finalize();
 +    }
 +#endif
 +
 +    return rc;
 +}