From: Roland Schulz <roland@utk.edu>
Date: Mon, 21 Jan 2013 01:12:10 +0000 (-0500)
Subject: Merge release-4-6 into master
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=e8fdd15ab94a0315827c35d16c5946e1a7b86a02;p=alexxy%2Fgromacs.git

Merge release-4-6 into master

Conflicts:
	CMakeLists.txt (just version)
	src/gmxlib/confio.c (just whitspace)

Moved gmx_thread_affinity.[ch]. Removed from it the visibility.

Doesn't include copyright and unrustify commits

Made by:
git merge -s ours 95761d63fb705b4334acbe2ce6ae2f384b4bcadb
git merge 66ff3d6e8bf65cce7b3008e65719e333da5e83e3
git merge -s ours 3339ffd6be2736541b928446de39ce51293449c6
git merge origin/release-4-6
tree=$(git log -1 HEAD --pretty=%T)
git reset --hard $(git cat-file commit HEAD | sed '1,/^$/d' | \
    git commit-tree $tree -p 03d5ecd -p bb6ea98)

Change-Id: Ibce56da97e270cf3f399cfdc89684917afc96a33
---

e8fdd15ab94a0315827c35d16c5946e1a7b86a02
diff --cc src/gromacs/gmxlib/confio.c
index 0af17e2bb4,0000000000..4167a51b50
mode 100644,000000..100644
--- a/src/gromacs/gmxlib/confio.c
+++ b/src/gromacs/gmxlib/confio.c
@@@ -1,1449 -1,0 +1,1450 @@@
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include <errno.h>
 +#include "macros.h"
 +#include "string2.h"
 +#include "confio.h"
 +#include "vec.h"
 +#include "symtab.h"
 +#include "futil.h"
 +#include "xdrf.h"
 +#include "filenm.h"
 +#include "pdbio.h"
 +#include "tpxio.h"
 +#include "gmx_fatal.h"
 +#include "copyrite.h"
 +#include "filenm.h"
 +#include "statutil.h"
 +#include "pbc.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +
 +#define CHAR_SHIFT 24
 +
 +static int read_g96_pos(char line[],t_symtab *symtab,
 +			FILE *fp,const char *infile,
 +			t_trxframe *fr)
 +{
 +  t_atoms *atoms;
 +  gmx_bool   bEnd;
 +  int    nwanted,natoms,atnr,resnr=0,oldres,newres,shift;
 +  char   anm[STRLEN],resnm[STRLEN];
 +  char   c1,c2;
 +  double db1,db2,db3;
 +  
 +  nwanted = fr->natoms;
 +
 +  atoms = fr->atoms;
 +
 +  natoms = 0;
 +
 +  if (fr->bX) {
 +    if (fr->bAtoms)
 +      shift = CHAR_SHIFT;
 +    else
 +      shift = 0;
 +    newres  = -1;
 +    oldres  = -666; /* Unlikely number for the first residue! */
 +    bEnd    = FALSE;
 +    while (!bEnd && fgets2(line,STRLEN,fp)) {
 +      bEnd = (strncmp(line,"END",3) == 0);
 +      if (!bEnd  && (line[0] != '#')) {
 +	if (sscanf(line+shift,"%15lf%15lf%15lf",&db1,&db2,&db3) != 3)
 +	  gmx_fatal(FARGS,"Did not find 3 coordinates for atom %d in %s\n",
 +		      natoms+1,infile);
 +	if ((nwanted != -1) && (natoms >= nwanted))
 +	  gmx_fatal(FARGS,
 +		      "Found more coordinates (%d) in %s than expected %d\n",
 +		      natoms,infile,nwanted);
 +	if (atoms) {
 +	  if (fr->bAtoms &&
 +	      (sscanf(line,"%5d%c%5s%c%5s%7d",&resnr,&c1,resnm,&c2,anm,&atnr) 
 +	       != 6)) {
 +	    if (oldres>=0)
 +	      resnr = oldres;
 +	    else {
 +	      resnr    = 1;
 +	      strncpy(resnm,"???",sizeof(resnm)-1); 
 +	    }
 +	    strncpy(anm,"???",sizeof(anm)-1); 
 +	  }
 +	  atoms->atomname[natoms]=put_symtab(symtab,anm);
 +	  if (resnr != oldres) {
 +	    oldres = resnr;
 +	    newres++;
 +	    if (newres >= atoms->nr)
 +	      gmx_fatal(FARGS,"More residues than atoms in %s (natoms = %d)",
 +			  infile,atoms->nr);
 +	    atoms->atom[natoms].resind = newres;
 +	    if (newres+1 > atoms->nres) {
 +	      atoms->nres = newres+1;
 +	    }
 +	    t_atoms_set_resinfo(atoms,natoms,symtab,resnm,resnr,' ',0,' ');
 +	  } else {
 +	    atoms->atom[natoms].resind = newres;
 +	  }
 +	}
 +	if (fr->x) {
 +	  fr->x[natoms][0] = db1;
 +	  fr->x[natoms][1] = db2;
 +	  fr->x[natoms][2] = db3;
 +	}
 +	natoms++;
 +      }
 +    }
 +    if ((nwanted != -1) && natoms != nwanted)
 +      fprintf(stderr,
 +	      "Warning: found less coordinates (%d) in %s than expected %d\n",
 +	      natoms,infile,nwanted);
 +  }
 +
 +  fr->natoms = natoms;
 +
 +  return natoms;
 +}
 +
 +static int read_g96_vel(char line[],FILE *fp,const char *infile,
 +			t_trxframe *fr)
 +{
 +  gmx_bool   bEnd;
 +  int    nwanted,natoms=-1,shift;
 +  double db1,db2,db3;
 +
 +  nwanted = fr->natoms;
 +
 +  if (fr->v && fr->bV) {
 +    if (strcmp(line,"VELOCITYRED") == 0)
 +      shift = 0;
 +    else
 +      shift = CHAR_SHIFT;
 +    natoms = 0;
 +    bEnd = FALSE;
 +    while (!bEnd && fgets2(line,STRLEN,fp)) {
 +      bEnd = (strncmp(line,"END",3) == 0);
 +      if (!bEnd && (line[0] != '#')) {
 +	if (sscanf(line+shift,"%15lf%15lf%15lf",&db1,&db2,&db3) != 3)
 +	  gmx_fatal(FARGS,"Did not find 3 velocities for atom %d in %s",
 +		      natoms+1,infile);
 +	if ((nwanted != -1) && (natoms >= nwanted))
 +	  gmx_fatal(FARGS,"Found more velocities (%d) in %s than expected %d\n",
 +		      natoms,infile,nwanted);
 +	if (fr->v) {
 +	  fr->v[natoms][0] = db1;
 +	  fr->v[natoms][1] = db2;
 +	  fr->v[natoms][2] = db3;
 +	}
 +	natoms++;
 +      }
 +    }
 +    if ((nwanted != -1) && (natoms != nwanted))
 +      fprintf(stderr,
 +	      "Warning: found less velocities (%d) in %s than expected %d\n",
 +	      natoms,infile,nwanted);
 +  }
 +  
 +  return natoms;
 +}
 +
 +int read_g96_conf(FILE *fp,const char *infile,t_trxframe *fr, char *line)
 +{
 +  t_symtab *symtab=NULL;
 +  gmx_bool   bAtStart,bTime,bAtoms,bPos,bVel,bBox,bEnd,bFinished;
 +  int    natoms,nbp;
 +  double db1,db2,db3,db4,db5,db6,db7,db8,db9;
 +
 +  bAtStart = (ftell(fp) == 0);
 +
 +  clear_trxframe(fr,FALSE);
 +  
 +  if (!symtab) {
 +    snew(symtab,1);
 +    open_symtab(symtab);
 +  }
 +  
 +  natoms=0;
 +
 +  if (bAtStart) {
 +    while ( !fr->bTitle && fgets2(line,STRLEN,fp))
 +      fr->bTitle = (strcmp(line,"TITLE") == 0);
 +    if (fr->title == NULL) {
 +      fgets2(line,STRLEN,fp);
 +      fr->title = strdup(line);
 +    }
 +    bEnd = FALSE;
 +    while (!bEnd && fgets2(line,STRLEN,fp))
 +      bEnd = (strcmp(line,"END") == 0);
 +    fgets2(line,STRLEN,fp);
 +  }
 +  
 +  /* Do not get a line if we are not at the start of the file, *
 +   * because without a parameter file we don't know what is in *
 +   * the trajectory and we have already read the line in the   *
 +   * previous call (VERY DIRTY).                               */
 +  bFinished = FALSE;
 +  do {
 +    bTime  = (strcmp(line,"TIMESTEP") == 0);
 +    bAtoms = (strcmp(line,"POSITION") == 0);
 +    bPos   = (bAtoms || (strcmp(line,"POSITIONRED") == 0));
 +    bVel   = (strncmp(line,"VELOCITY",8) == 0);
 +    bBox   = (strcmp(line,"BOX") == 0);
 +    if (bTime) {
 +      if (!fr->bTime && !fr->bX) {
 +	fr->bStep = bTime;
 +	fr->bTime = bTime;
 +	do 
 +	  bFinished = (fgets2(line,STRLEN,fp) == NULL);
 +	while (!bFinished && (line[0] == '#'));
 +	sscanf(line,"%15d%15lf",&(fr->step),&db1);
 +	fr->time = db1;
 +      } else
 +	bFinished = TRUE;
 +    }
 +    if (bPos) {
 +      if (!fr->bX) {
 +	fr->bAtoms = bAtoms;
 +	fr->bX     = bPos;
 +	natoms = read_g96_pos(line,symtab,fp,infile,fr);
 +      } else
 +	bFinished = TRUE;
 +    }
 +    if (fr->v && bVel) {
 +      fr->bV = bVel;
 +      natoms = read_g96_vel(line,fp,infile,fr);
 +    }
 +    if (bBox) {
 +      fr->bBox = bBox;
 +      clear_mat(fr->box);
 +      bEnd = FALSE;
 +      while (!bEnd && fgets2(line,STRLEN,fp)) {
 +	bEnd = (strncmp(line,"END",3) == 0);
 +	if (!bEnd && (line[0] != '#')) {
 +	  nbp = sscanf(line,"%15lf%15lf%15lf%15lf%15lf%15lf%15lf%15lf%15lf",
 +		       &db1,&db2,&db3,&db4,&db5,&db6,&db7,&db8,&db9);
 +	  if (nbp < 3)
 +	    gmx_fatal(FARGS,"Found a BOX line, but no box in %s",infile);
 +	  fr->box[XX][XX] = db1;
 +	  fr->box[YY][YY] = db2;
 +	  fr->box[ZZ][ZZ] = db3;
 +	  if (nbp == 9) {
 +	    fr->box[XX][YY] = db4;
 +	    fr->box[XX][ZZ] = db5;
 +	    fr->box[YY][XX] = db6;
 +	    fr->box[YY][ZZ] = db7;
 +	    fr->box[ZZ][XX] = db8;
 +	    fr->box[ZZ][YY] = db9; 
 +	  }
 +	}
 +      }
 +      bFinished = TRUE;
 +    }
 +  } while (!bFinished && fgets2(line,STRLEN,fp));
 +  
 +  free_symtab(symtab);
 +
 +  fr->natoms = natoms;
 +  
 +  return natoms;
 +}
 +
 +void write_g96_conf(FILE *out,t_trxframe *fr,
 +                    int nindex,const atom_id *index)
 +{
 +  t_atoms *atoms;
 +  int nout,i,a;
 +  
 +  atoms = fr->atoms;
 +
 +  if (index)
 +    nout = nindex;
 +  else
 +    nout = fr->natoms; 
 +
 +  if (fr->bTitle)
 +    fprintf(out,"TITLE\n%s\nEND\n",fr->title);
 +  if (fr->bStep || fr->bTime)
 +    /* Officially the time format is %15.9, which is not enough for 10 ns */
 +    fprintf(out,"TIMESTEP\n%15d%15.6f\nEND\n",fr->step,fr->time);
 +  if (fr->bX) {
 +    if (fr->bAtoms) {
 +      fprintf(out,"POSITION\n");
 +      for(i=0; i<nout; i++) {
 +	if (index) a = index[i]; else a = i;
 +	fprintf(out,"%5d %-5s %-5s%7d%15.9f%15.9f%15.9f\n",
 +		(atoms->resinfo[atoms->atom[a].resind].nr) % 100000,
 +		*atoms->resinfo[atoms->atom[a].resind].name,
 +		*atoms->atomname[a],(i+1) % 10000000,
 +		fr->x[a][XX],fr->x[a][YY],fr->x[a][ZZ]);
 +      }
 +    } else {
 +      fprintf(out,"POSITIONRED\n");
 +      for(i=0; i<nout; i++) {
 +	if (index) a = index[i]; else a = i;
 +	fprintf(out,"%15.9f%15.9f%15.9f\n",
 +		fr->x[a][XX],fr->x[a][YY],fr->x[a][ZZ]);
 +      }
 +    }
 +    fprintf(out,"END\n");
 +  }
 +  if (fr->bV) {
 +    if (fr->bAtoms) {
 +      fprintf(out,"VELOCITY\n");
 +      for(i=0; i<nout; i++) {
 +	if (index) a = index[i]; else a = i;
 +	fprintf(out,"%5d %-5s %-5s%7d%15.9f%15.9f%15.9f\n",
 +		(atoms->resinfo[atoms->atom[a].resind].nr) % 100000,
 +		*atoms->resinfo[atoms->atom[a].resind].name,
 +		*atoms->atomname[a],(i+1) % 10000000,
 +		fr->v[a][XX],fr->v[a][YY],fr->v[a][ZZ]);
 +      }
 +    } else {
 +      fprintf(out,"VELOCITYRED\n");
 +      for(i=0; i<nout; i++) {
 +	if (index) a = index[i]; else a = i;
 +	fprintf(out,"%15.9f%15.9f%15.9f\n",
 +		fr->v[a][XX],fr->v[a][YY],fr->v[a][ZZ]);
 +      }
 +    }
 +    fprintf(out,"END\n");
 +  }
 +  if (fr->bBox) {
 +    fprintf(out,"BOX\n");
 +    fprintf(out,"%15.9f%15.9f%15.9f",
 +	    fr->box[XX][XX],fr->box[YY][YY],fr->box[ZZ][ZZ]);
 +    if (fr->box[XX][YY] || fr->box[XX][ZZ] || fr->box[YY][XX] ||
 +	fr->box[YY][ZZ] || fr->box[ZZ][XX] ||fr->box[ZZ][YY])
 +      fprintf(out,"%15.9f%15.9f%15.9f%15.9f%15.9f%15.9f",
 +	      fr->box[XX][YY],fr->box[XX][ZZ],fr->box[YY][XX],
 +	      fr->box[YY][ZZ],fr->box[ZZ][XX],fr->box[ZZ][YY]);
 +    fprintf(out,"\n");
 +    fprintf(out,"END\n");
 +  }
 +}
 +
 +static int get_espresso_word(FILE *fp,char word[])
 +{
 +  int  ret,nc,i;
 +  
 +  ret = 0;
 +  nc = 0;
 +  
 +  do {
 +    i = fgetc(fp);
 +    if (i != EOF) {
 +      if (i == ' ' || i == '\n' || i == '\t') {
 +	if (nc > 0)
 +	  ret = 1;
 +      } else if (i == '{') {
 +	if (nc == 0)
 +	  word[nc++] = '{';
 +	ret = 2;
 +      } else if (i == '}') {
 +	if (nc == 0)
 +	  word[nc++] = '}';
 +	ret = 3;
 +      } else {
 +	word[nc++] = (char)i;
 +      }
 +    }
 +  } while (i != EOF && ret == 0);
 +  
 +  word[nc] = '\0';
 +
 +  /*  printf("'%s'\n",word); */
 +  
 +  return ret;
 +}
 +
 +static int check_open_parenthesis(FILE *fp,int r,
 +				  const char *infile,const char *keyword)
 +{
 +  int level_inc;
 +  char word[STRLEN];
 +
 +  level_inc = 0;
 +  if (r == 2) {
 +    level_inc++;
 +  } else {
 +    r = get_espresso_word(fp,word);
 +    if (r == 2)
 +      level_inc++;
 +    else
 +      gmx_fatal(FARGS,"Expected '{' after '%s' in file '%s'",
 +		keyword,infile);
 +  }
 +
 +  return level_inc;
 +}
 +
 +static int check_close_parenthesis(FILE *fp,int r,
 +				  const char *infile,const char *keyword)
 +{
 +  int level_inc;
 +  char word[STRLEN];
 +  
 +  level_inc = 0;
 +  if (r == 3) {
 +    level_inc--;
 +  } else {
 +    r = get_espresso_word(fp,word);
 +    if (r == 3)
 +      level_inc--;
 +    else
 +      gmx_fatal(FARGS,"Expected '}' after section '%s' in file '%s'",
 +		keyword,infile);
 +  }
 +
 +  return level_inc;
 +}
 +
 +enum { espID, espPOS, espTYPE, espQ, espV, espF, espMOLECULE, espNR };
 +const char *esp_prop[espNR] = { "id", "pos", "type", "q", "v", "f",
 +				"molecule" };
 +
 +static void read_espresso_conf(const char *infile,
 +			       t_atoms *atoms,rvec x[],rvec *v,matrix box)
 +{
 +  t_symtab *symtab=NULL;
 +  FILE *fp;
 +  char word[STRLEN],buf[STRLEN];
 +  int  natoms,level,npar,r,nprop,p,i,m,molnr;
 +  int  prop[32];
 +  double d;
 +  gmx_bool bFoundParticles,bFoundProp,bFoundVariable,bMol;
 +
 +  if (!symtab) {
 +    snew(symtab,1);
 +    open_symtab(symtab);
 +  }
 +
 +  clear_mat(box);
 +  
 +  fp = gmx_fio_fopen(infile,"r");
 +  
 +  bFoundParticles = FALSE;
 +  bFoundVariable = FALSE;
 +  bMol = FALSE;
 +  level = 0;
 +  while ((r=get_espresso_word(fp,word))) {
 +    if (level==1 && strcmp(word,"particles")==0 && !bFoundParticles) {
 +      bFoundParticles = TRUE;
 +      level += check_open_parenthesis(fp,r,infile,"particles");
 +      nprop = 0;
 +      while (level == 2 && (r=get_espresso_word(fp,word))) {
 +	bFoundProp = FALSE;
 +	for(p=0; p<espNR; p++) {
 +	  if (strcmp(word,esp_prop[p]) == 0) {
 +	    bFoundProp = TRUE;
 +	    prop[nprop++] = p;
 +	    /* printf("  prop[%d] = %s\n",nprop-1,esp_prop[prop[nprop-1]]); */
 +	  }
 +	}
 +	if (!bFoundProp && word[0] != '}') {
 +	  gmx_fatal(FARGS,"Can not read Espresso files with particle property '%s'",word);
 +	}
 +	if (bFoundProp && p == espMOLECULE)
 +	  bMol = TRUE;
 +	if (r == 3)
 +	  level--;
 +      }
 +      
 +      i = 0;
 +      while (level > 0 && (r=get_espresso_word(fp,word))) {
 +	if (r == 2) {
 +	  level++;
 +	} else if (r == 3) {
 +	  level--;
 +	}
 +	if (level == 2) {
 +	  for(p=0; p<nprop; p++) {
 +	    switch (prop[p]) {
 +	    case espID:
 +	      r = get_espresso_word(fp,word);
 +	      /* Not used */
 +	      break;
 +	    case espPOS:
 +	      for(m=0; m<3; m++) {
 +		r = get_espresso_word(fp,word);
 +		sscanf(word,"%lf",&d);
 +		x[i][m] = d;
 +	      }
 +	      break;
 +	    case espTYPE:
 +	      r = get_espresso_word(fp,word);
 +	      atoms->atom[i].type = strtol(word, NULL, 10);  
 +	      break;
 +	    case espQ:
 +	      r = get_espresso_word(fp,word);
 +	      sscanf(word,"%lf",&d);
 +	      atoms->atom[i].q = d;
 +	      break;
 +	    case espV:
 +	      for(m=0; m<3; m++) {
 +		r = get_espresso_word(fp,word);
 +		sscanf(word,"%lf",&d);
 +		v[i][m] = d;
 +	      }
 +	      break;
 +	    case espF:
 +	      for(m=0; m<3; m++) {
 +		r = get_espresso_word(fp,word);
 +		/* not used */
 +	      }
 +	      break;
 +	    case espMOLECULE:
 +	      r = get_espresso_word(fp,word);
 +	      molnr = strtol(word, NULL, 10);
 +	      if (i == 0 ||
 +		  atoms->resinfo[atoms->atom[i-1].resind].nr != molnr) {
 +		atoms->atom[i].resind =
 +		  (i == 0 ? 0 : atoms->atom[i-1].resind+1); 
 +		atoms->resinfo[atoms->atom[i].resind].nr      = molnr;
 +		atoms->resinfo[atoms->atom[i].resind].ic      = ' ';
 +		atoms->resinfo[atoms->atom[i].resind].chainid = ' ';
 +        atoms->resinfo[atoms->atom[i].resind].chainnum = molnr; /* Not sure if this is right? */
 +	      } else {
 +		atoms->atom[i].resind = atoms->atom[i-1].resind;
 +	      }
 +	      break;
 +	    }
 +	  }
 +	  /* Generate an atom name from the particle type */
 +	  sprintf(buf,"T%d",atoms->atom[i].type);
 +	  atoms->atomname[i] = put_symtab(symtab,buf);
 +	  if (bMol) {
 +	    if (i == 0 || atoms->atom[i].resind != atoms->atom[i-1].resind) {
 +	      atoms->resinfo[atoms->atom[i].resind].name =
 +		put_symtab(symtab,"MOL");
 +	    }
 +	  } else {
 +	    /* Residue number is the atom number */
 +	    atoms->atom[i].resind = i;
 +	    /* Generate an residue name from the particle type */
 +	    if (atoms->atom[i].type < 26) {
 +	      sprintf(buf,"T%c",'A'+atoms->atom[i].type);
 +	    } else {
 +	      sprintf(buf,"T%c%c",
 +		      'A'+atoms->atom[i].type/26,'A'+atoms->atom[i].type%26);
 +	    }
 +	    t_atoms_set_resinfo(atoms,i,symtab,buf,i,' ',0,' ');
 +	  }	  
 +
 +	  if (r == 3)
 +	    level--;
 +	  i++;
 +	}
 +      }
 +      atoms->nres = atoms->nr;
 +
 +      if (i != atoms->nr) {
 +	gmx_fatal(FARGS,"Internal inconsistency in Espresso routines, read %d atoms, expected %d atoms",i,atoms->nr);
 +      }
 +    } else if (level==1 && strcmp(word,"variable")==0 && !bFoundVariable) {
 +      bFoundVariable = TRUE;
 +      level += check_open_parenthesis(fp,r,infile,"variable");
 +      while (level==2 && (r=get_espresso_word(fp,word))) {
 +	if (level==2 && strcmp(word,"box_l") == 0) {
 +	  for(m=0; m<3; m++) {
 +	    r = get_espresso_word(fp,word);
 +	    sscanf(word,"%lf",&d);
 +	    box[m][m] = d;
 +	  }
 +	  level += check_close_parenthesis(fp,r,infile,"box_l");
 +	}
 +      }
 +    } else if (r == 2) {
 +      level++;
 +    } else if (r == 3) {
 +      level--;
 +    }
 +  }
 +  
 +  if (!bFoundParticles) {
 +    fprintf(stderr,"Did not find a particles section in Espresso file '%s'\n",
 +	    infile);
 +  }
 +  
 +  gmx_fio_fclose(fp);
 +}
 +
 +static int get_espresso_coordnum(const char *infile)
 +{
 +  FILE *fp;
 +  char word[STRLEN];
 +  int  natoms,level,r;
 +  gmx_bool bFoundParticles;
 +
 +  natoms = 0;
 +  
 +  fp = gmx_fio_fopen(infile,"r");
 +  
 +  bFoundParticles = FALSE;
 +  level = 0;
 +  while ((r=get_espresso_word(fp,word)) && !bFoundParticles) {
 +    if (level==1 && strcmp(word,"particles")==0 && !bFoundParticles) {
 +      bFoundParticles = TRUE;
 +      level += check_open_parenthesis(fp,r,infile,"particles");
 +      while (level > 0 && (r=get_espresso_word(fp,word))) {
 +	if (r == 2) {
 +	  level++;
 +	  if (level == 2)
 +	    natoms++;
 +	} else if (r == 3) {
 +	  level--;
 +	}
 +      }
 +    } else if (r == 2) {
 +      level++;
 +    } else if (r == 3) {
 +      level--;
 +    }
 +  }
 +  if (!bFoundParticles) {
 +    fprintf(stderr,"Did not find a particles section in Espresso file '%s'\n",
 +	    infile);
 +  }
 +  
 +  gmx_fio_fclose(fp);
 +  
 +  return natoms;
 +}
 +
 +static void write_espresso_conf_indexed(FILE *out,const char *title,
 +					t_atoms *atoms,int nx,atom_id *index,
 +					rvec *x,rvec *v,matrix box)
 +{
 +  int i,j;
 +
 +  fprintf(out,"# %s\n",title);
 +  if (TRICLINIC(box)) {
 +    gmx_warning("The Espresso format does not support triclinic unit-cells");
 +  }
 +  fprintf(out,"{variable {box_l %f %f %f}}\n",box[0][0],box[1][1],box[2][2]);
 +  
 +  fprintf(out,"{particles {id pos type q%s}\n",v ? " v" : "");
 +  for(i=0; i<nx; i++) {
 +    if (index)
 +      j = index[i];
 +    else
 +      j = i;
 +    fprintf(out,"\t{%d %f %f %f %d %g",
 +	    j,x[j][XX],x[j][YY],x[j][ZZ],
 +	    atoms->atom[j].type,atoms->atom[j].q);
 +    if (v)
 +      fprintf(out," %f %f %f",v[j][XX],v[j][YY],v[j][ZZ]);
 +    fprintf(out,"}\n");
 +  }
 +  fprintf(out,"}\n");
 +}
 +
 +static void get_coordnum_fp (FILE *in, char *title, int *natoms)
 +{
 +  char line[STRLEN+1];
 +
 +  fgets2 (title,STRLEN,in);
 +  fgets2 (line,STRLEN,in);
 +  if (sscanf (line,"%d",natoms) != 1) {
 +    gmx_fatal(FARGS,"gro file does not have the number of atoms on the second line");
 +  }
 +}
 +
 +static void get_coordnum (const char *infile,int *natoms)
 +{
 +  FILE *in;
 +  char title[STRLEN];
 +  
 +  in=gmx_fio_fopen(infile,"r");
 +  get_coordnum_fp(in,title,natoms);
 +  gmx_fio_fclose (in);
 +}
 +
 +static gmx_bool get_w_conf(FILE *in, const char *infile, char *title,
 +                           t_symtab *symtab, t_atoms *atoms, int *ndec,
 +                           rvec x[], rvec *v, matrix box)
 +{
 +  char   name[6];
 +  char   line[STRLEN+1],*ptr;
 +  char   buf[256];
 +  double x1,y1,z1,x2,y2,z2;
 +  rvec   xmin,xmax;
 +  int    natoms,i,m,resnr,newres,oldres,ddist,c;
 +  gmx_bool   bFirst,bVel;
 +  char   *p1,*p2,*p3;
 +  
 +  newres  = -1;
 +  oldres  = NOTSET; /* Unlikely number for the first residue! */
 +  ddist   = 0;
 +  
 +  /* Read the title and number of atoms */
 +  get_coordnum_fp(in,title,&natoms);
 +
 +  if (natoms > atoms->nr)
 +    gmx_fatal(FARGS,"gro file contains more atoms (%d) than expected (%d)",
 +		natoms,atoms->nr);
 +  else if (natoms <  atoms->nr)
 +    fprintf(stderr,"Warning: gro file contains less atoms (%d) than expected"
 +	    " (%d)\n",natoms,atoms->nr);
 +  
 +  bFirst=TRUE;
 +  
 +  bVel = FALSE;
 +
 +  /* just pray the arrays are big enough */
 +  for (i=0; (i < natoms) ; i++) {
 +    if ((fgets2 (line,STRLEN,in)) == NULL) {
 +      unexpected_eof(infile,i+2);
 +    }
 +    if (strlen(line) < 39)
 +      gmx_fatal(FARGS,"Invalid line in %s for atom %d:\n%s",infile,i+1,line);
 +
 +    /* determine read precision from distance between periods 
 +       (decimal points) */
 +    if (bFirst) {
 +      bFirst=FALSE;
 +      p1=strchr(line,'.');
 +      if (p1 == NULL)
 +	gmx_fatal(FARGS,"A coordinate in file %s does not contain a '.'",infile);
 +      p2=strchr(&p1[1],'.');
 +      if (p2 == NULL)
 +	gmx_fatal(FARGS,"A coordinate in file %s does not contain a '.'",infile);
 +      ddist = p2 - p1;
 +      *ndec = ddist - 5;
 +
 +      p3=strchr(&p2[1],'.');
 +      if (p3 == NULL)
 +	gmx_fatal(FARGS,"A coordinate in file %s does not contain a '.'",infile);
 +
 +      if (p3 - p2 != ddist)
 +	gmx_fatal(FARGS,"The spacing of the decimal points in file %s is not consistent for x, y and z",infile);
 +    }
 +    
 +    /* residue number*/
 +    memcpy(name,line,5);
 +    name[5]='\0';
 +    sscanf(name,"%d",&resnr);
 +    memcpy(name,line+5,5);
 +    name[5]='\0';
 +    if (resnr != oldres) {
 +      oldres = resnr;
 +      newres++;
 +      if (newres >= natoms)
 +	gmx_fatal(FARGS,"More residues than atoms in %s (natoms = %d)",
 +		    infile,natoms);
 +      atoms->atom[i].resind = newres;
 +      t_atoms_set_resinfo(atoms,i,symtab,name,resnr,' ',0,' ');
 +    } else {
 +      atoms->atom[i].resind = newres;
 +    }
 +
 +    /* atomname */
 +    memcpy(name,line+10,5);
 +    atoms->atomname[i]=put_symtab(symtab,name);
 +   
 +    /* eventueel controle atomnumber met i+1 */
 +
 +    /* coordinates (start after residue shit) */
 +    ptr = line + 20;
 +    /* Read fixed format */
 +    for(m=0; m<DIM; m++) {
 +      for(c=0; (c<ddist && ptr[0]); c++) {
 +	buf[c] = ptr[0];
 +	ptr++;
 +      }
 +      buf[c] = '\0';
 +      if (sscanf (buf,"%lf %lf",&x1,&x2) != 1) {
 +	gmx_fatal(FARGS,"Something is wrong in the coordinate formatting of file %s. Note that gro is fixed format (see the manual)",infile);
 +      } else {
 +	x[i][m] = x1;
 +      }
 +    }
 +
 +    /* velocities (start after residues and coordinates) */
 +    if (v) {
 +      /* Read fixed format */
 +      for(m=0; m<DIM; m++) {
 +	for(c=0; (c<ddist && ptr[0]); c++) {
 +	  buf[c] = ptr[0];
 +	  ptr++;
 +	}
 +	buf[c] = '\0';
 +	if (sscanf (buf,"%lf",&x1) != 1) {
 +	  v[i][m] = 0;
 +	} else {
 +	  v[i][m] = x1;
 +	  bVel = TRUE;
 +	}
 +      }
 +    }
 +  }
 +  atoms->nres = newres + 1;
 +
 +  /* box */
 +  fgets2 (line,STRLEN,in);
 +  if (sscanf (line,"%lf%lf%lf",&x1,&y1,&z1) != 3) {
 +    gmx_warning("Bad box in file %s",infile);
 +    
 +    /* Generate a cubic box */
 +    for(m=0; (m<DIM); m++)
 +      xmin[m]=xmax[m]=x[0][m];
 +    for(i=1; (i<atoms->nr); i++)
 +      for(m=0; (m<DIM); m++) {
 +	xmin[m]=min(xmin[m],x[i][m]);
 +	xmax[m]=max(xmax[m],x[i][m]);
 +      }
 +    for (i=0; i<DIM; i++) for (m=0; m<DIM; m++) box[i][m]=0.0;
 +    for(m=0; (m<DIM); m++)
 +      box[m][m]=(xmax[m]-xmin[m]);
 +    fprintf(stderr,"Generated a cubic box %8.3f x %8.3f x %8.3f\n",
 +	    box[XX][XX],box[YY][YY],box[ZZ][ZZ]);
 +  }
 +  else {
 +    /* We found the first three values, the diagonal elements */
 +    box[XX][XX]=x1;
 +    box[YY][YY]=y1;
 +    box[ZZ][ZZ]=z1;
 +    if (sscanf (line,"%*f%*f%*f%lf%lf%lf%lf%lf%lf",
 +		&x1,&y1,&z1,&x2,&y2,&z2) != 6) 
 +      x1=y1=z1=x2=y2=z2=0.0;
 +    box[XX][YY] = x1;
 +    box[XX][ZZ] = y1;
 +    box[YY][XX] = z1;
 +    box[YY][ZZ] = x2;
 +    box[ZZ][XX] = y2;
 +    box[ZZ][YY] = z2;
 +  }
 +
 +  return bVel;
 +}
 +
 +static void read_whole_conf(const char *infile,char *title,
 +			    t_atoms *atoms, rvec x[],rvec *v, matrix box)
 +{
 +  FILE   *in;
 +  int    ndec;
 +  t_symtab symtab;
 +
 +  /* open file */
 +  in=gmx_fio_fopen(infile,"r");
 +
 +  open_symtab(&symtab);
 +  get_w_conf(in, infile, title, &symtab, atoms, &ndec, x, v, box);
 +  /* We can't free the symbols, as they are still used in atoms, so
 +   * the only choice is to leak them. */
 +  free_symtab(&symtab);
 +
 +  gmx_fio_fclose(in);
 +}
 +
 +gmx_bool gro_next_x_or_v(FILE *status,t_trxframe *fr)
 +{
 +  t_atoms atoms;
 +  t_symtab symtab;
 +  char    title[STRLEN],*p;
 +  double  tt;
 +  int     ndec=0,i;
 +
 +  if (gmx_eof(status))
 +    return FALSE;
 +
 +  open_symtab(&symtab);
 +  atoms.nr=fr->natoms;
 +  snew(atoms.atom,fr->natoms);
 +  atoms.nres=fr->natoms;
 +  snew(atoms.resinfo,fr->natoms);
 +  snew(atoms.atomname,fr->natoms);
 +  
 +  fr->bV = get_w_conf(status,title,title,&symtab,&atoms,&ndec,fr->x,fr->v,fr->box);
 +  fr->bPrec = TRUE;
 +  fr->prec = 1;
 +  /* prec = 10^ndec: */
 +  for(i=0; i<ndec; i++)
 +    fr->prec *= 10;
 +  fr->title = title;
 +  fr->bTitle = TRUE;
 +  fr->bX = TRUE;
 +  fr->bBox = TRUE;
 +
 +  sfree(atoms.atom);
 +  sfree(atoms.resinfo);
 +  sfree(atoms.atomname);
 +  done_symtab(&symtab);
 +
 +  if ((p=strstr(title,"t=")) != NULL) {
 +    p+=2;
 +    if (sscanf(p,"%lf",&tt)==1) {
 +      fr->time = tt;
 +      fr->bTime = TRUE;
 +    } else {
 +      fr->time = 0;
 +      fr->bTime = FALSE;
 +    }
 +  }
 +  
 +  if (atoms.nr != fr->natoms)
 +    gmx_fatal(FARGS,"Number of atoms in gro frame (%d) doesn't match the number in the previous frame (%d)",atoms.nr,fr->natoms);
 +  
 +  return TRUE;
 +}
 +
 +int gro_first_x_or_v(FILE *status,t_trxframe *fr)
 +{
 +  char title[STRLEN];
 +  
 +  frewind(status);
 +  fprintf(stderr,"Reading frames from gro file");
 +  get_coordnum_fp(status, title, &fr->natoms);
 +  frewind(status);
 +  fprintf(stderr," '%s', %d atoms.\n",title, fr->natoms);
 +  fr->bTitle = TRUE;
 +  fr->title = title;
 +  if (fr->natoms==0)
 +    gmx_file("No coordinates in gro file");
 +  
 +  snew(fr->x,fr->natoms);
 +  snew(fr->v,fr->natoms);
 +  gro_next_x_or_v(status, fr);
 +  
 +  return fr->natoms;
 +}
 +
 +static void make_hconf_format(int pr,gmx_bool bVel,char format[])
 +{
 +  int l,vpr;
 +
 +  /* build format string for printing, 
 +     something like "%8.3f" for x and "%8.4f" for v */
 +  if (pr<0)
 +    pr=0;
 +  if (pr>30)
 +    pr=30;
 +  l=pr+5;
 +  vpr=pr+1;
 +  if (bVel)
 +    sprintf(format,"%%%d.%df%%%d.%df%%%d.%df%%%d.%df%%%d.%df%%%d.%df\n",
 +	    l,pr,l,pr,l,pr,l,vpr,l,vpr,l,vpr);
 +  else
 +    sprintf(format,"%%%d.%df%%%d.%df%%%d.%df\n",l,pr,l,pr,l,pr);
 +
 +}
 +
 +static void write_hconf_box(FILE *out,int pr,matrix box)
 +{
 +  char format[100];
 +  int l;
 +
 +  if (pr<5) 
 +    pr=5;
 +  l=pr+5;
 +  
 +  if (box[XX][YY] || box[XX][ZZ] || box[YY][XX] || box[YY][ZZ] ||
 +      box[ZZ][XX] || box[ZZ][YY]) {
 +    sprintf(format,"%%%d.%df%%%d.%df%%%d.%df"
 +	    "%%%d.%df%%%d.%df%%%d.%df%%%d.%df%%%d.%df%%%d.%df\n",
 +	    l,pr,l,pr,l,pr,l,pr,l,pr,l,pr,l,pr,l,pr,l,pr);
 +    fprintf(out,format,
 +	    box[XX][XX],box[YY][YY],box[ZZ][ZZ],
 +	    box[XX][YY],box[XX][ZZ],box[YY][XX],
 +	    box[YY][ZZ],box[ZZ][XX],box[ZZ][YY]);
 +  } else {
 +    sprintf(format,"%%%d.%df%%%d.%df%%%d.%df\n",l,pr,l,pr,l,pr);
 +    fprintf(out,format,
 +	    box[XX][XX],box[YY][YY],box[ZZ][ZZ]);
 +  }
 +}
 +
 +void write_hconf_indexed_p(FILE *out,const char *title,t_atoms *atoms,
 +			   int nx,const atom_id index[], int pr,
 +			   rvec *x,rvec *v,matrix box)
 +{
 +  char resnm[6],nm[6],format[100];
 +  int  ai,i,resind,resnr;
 +
 +  bromacs(format,99);
 +  fprintf (out,"%s\n",(title && title[0])?title:format);
 +  fprintf (out,"%5d\n",nx);
 +
 +  make_hconf_format(pr,v!=NULL,format);
 +  
 +  for (i=0; (i<nx); i++) {
 +    ai=index[i];
 +    
 +    resind = atoms->atom[ai].resind;
 +    strncpy(resnm," ??? ",sizeof(resnm)-1);
 +    if (resind < atoms->nres) {
 +      strncpy(resnm,*atoms->resinfo[resind].name,sizeof(resnm)-1);
 +      resnr = atoms->resinfo[resind].nr;
 +    } else {
 +      strncpy(resnm," ??? ",sizeof(resnm)-1);
 +      resnr = resind + 1;
 +    }
 +    
 +    if (atoms->atom)
 +      strncpy(nm,*atoms->atomname[ai],sizeof(nm)-1);
 +    else
 +      strncpy(nm," ??? ",sizeof(nm)-1);
 +
 +    fprintf(out,"%5d%-5.5s%5.5s%5d",resnr%100000,resnm,nm,(ai+1)%100000);
 +    /* next fprintf uses built format string */
 +    if (v)
 +      fprintf(out,format,
 +	      x[ai][XX], x[ai][YY], x[ai][ZZ], v[ai][XX],v[ai][YY],v[ai][ZZ]);
 +    else
 +      fprintf(out,format,
 +	      x[ai][XX], x[ai][YY], x[ai][ZZ]);
 +  }
 +
 +  write_hconf_box(out,pr,box);
 +
 +  fflush(out);
 +}
 +
 +static void write_hconf_mtop(FILE *out,const char *title,gmx_mtop_t *mtop,
 +			     int pr,
 +			     rvec *x,rvec *v,matrix box)
 +{
 +  char format[100];
 +  int  i,resnr;
 +  gmx_mtop_atomloop_all_t aloop;
 +  t_atom *atom;
 +  char *atomname,*resname;
 +
 +  bromacs(format,99);
 +  fprintf (out,"%s\n",(title && title[0])?title:format);
 +  fprintf (out,"%5d\n",mtop->natoms);
 +
 +  make_hconf_format(pr,v!=NULL,format);
 +  
 +  aloop = gmx_mtop_atomloop_all_init(mtop);
 +  while (gmx_mtop_atomloop_all_next(aloop,&i,&atom)) {
 +    gmx_mtop_atomloop_all_names(aloop,&atomname,&resnr,&resname);
 +
 +    fprintf(out,"%5d%-5.5s%5.5s%5d",
 +	    resnr%100000,resname,atomname,(i+1)%100000);
 +    /* next fprintf uses built format string */
 +    if (v)
 +      fprintf(out,format,
 +	      x[i][XX], x[i][YY], x[i][ZZ], v[i][XX],v[i][YY],v[i][ZZ]);
 +    else
 +      fprintf(out,format,
 +	      x[i][XX], x[i][YY], x[i][ZZ]);
 +  }
 +
 +  write_hconf_box(out,pr,box);
 +
 +  fflush(out);
 +}
 +
 +void write_hconf_p(FILE *out,const char *title,t_atoms *atoms, int pr,
 +		   rvec *x,rvec *v,matrix box)
 +{
 +  atom_id *aa;
 +  int     i;
 +  
 +  snew(aa,atoms->nr);
 +  for(i=0; (i<atoms->nr); i++)
 +    aa[i]=i;
 +  write_hconf_indexed_p(out,title,atoms,atoms->nr,aa,pr,x,v,box);
 +  sfree(aa);
 +}
 +
 +void write_conf_p(const char *outfile, const char *title,
 +		  t_atoms *atoms, int pr,
 +		  rvec *x, rvec *v,matrix box)
 +{
 +  FILE *out;
 +
 +  out=gmx_fio_fopen(outfile,"w");
 +  write_hconf_p(out,title,atoms,pr,x,v,box);
 +
 +  gmx_fio_fclose (out);
 +}
 +
 +static void write_conf(const char *outfile, const char *title, t_atoms *atoms,
 +		       rvec *x, rvec *v,matrix box)
 +{
 +  write_conf_p(outfile, title, atoms, 3, x, v, box);
 +}
 +
 +void write_sto_conf_indexed(const char *outfile,const char *title,
 +			    t_atoms *atoms, 
 +			    rvec x[],rvec *v,int ePBC,matrix box,
 +			    atom_id nindex,atom_id index[])
 +{
 +  FILE       *out;
 +  int        ftp;
 +  t_trxframe fr;
 +
 +  ftp=fn2ftp(outfile);
 +  switch (ftp) {
 +  case efGRO:
 +    out=gmx_fio_fopen(outfile,"w");
 +    write_hconf_indexed_p(out, title, atoms, nindex, index, 3, x, v, box);
 +    gmx_fio_fclose(out);
 +    break;
 +  case efG96:
 +    clear_trxframe(&fr,TRUE);
 +    fr.bTitle = TRUE;
 +    fr.title = title;
 +    fr.natoms = atoms->nr;
 +    fr.bAtoms = TRUE;
 +    fr.atoms = atoms;
 +    fr.bX = TRUE;
 +    fr.x = x;
 +    if (v) {
 +      fr.bV = TRUE;
 +      fr.v = v;
 +    }
 +    fr.bBox = TRUE;
 +    copy_mat(box,fr.box);
 +    out=gmx_fio_fopen(outfile,"w");
 +    write_g96_conf(out, &fr, nindex, index);
 +    gmx_fio_fclose(out);
 +    break;
 +  case efPDB:
 +  case efBRK:
 +  case efENT:
 +  case efPQR:
 +    out=gmx_fio_fopen(outfile,"w");
 +    write_pdbfile_indexed(out,title,atoms,x,ePBC,box,' ',-1,nindex,index,NULL,TRUE);
 +    gmx_fio_fclose(out);
 +    break;
 +  case efESP:
 +    out=gmx_fio_fopen(outfile,"w"); 
 +    write_espresso_conf_indexed(out, title, atoms, nindex, index, x, v, box);
 +    gmx_fio_fclose(out);
 +    break;
 +  case efTPR:
 +  case efTPB:
 +  case efTPA:
 +    gmx_fatal(FARGS,"Sorry, can not write a topology to %s",outfile);
 +    break;
 +  default:
 +    gmx_incons("Not supported in write_sto_conf_indexed");
 +  }
 +}
 +
 +static void write_xyz_conf(const char *outfile,const char *title,
 +			   t_atoms *atoms,rvec *x)
 +{
 +  FILE *fp;
 +  int i,anr;
 +  real value;
 +  char *ptr,*name;
 +  gmx_atomprop_t aps=gmx_atomprop_init();
 +  
 +  fp = gmx_fio_fopen(outfile,"w");
 +  fprintf(fp,"%3d\n",atoms->nr);
 +  fprintf(fp,"%s\n",title);
 +  for(i=0; (i<atoms->nr); i++) {
 +    anr  = atoms->atom[i].atomnumber;
 +    name = *atoms->atomname[i];
 +    if (anr == NOTSET) {
 +      if (gmx_atomprop_query(aps,epropElement,"???",name,&value))
 +	anr = gmx_nint(value);
 +    }
 +    if ((ptr = gmx_atomprop_element(aps,anr)) == NULL)
 +      ptr = name;
 +    fprintf(fp,"%3s%10.5f%10.5f%10.5f\n",ptr,
 +	    10*x[i][XX],10*x[i][YY],10*x[i][ZZ]);
 +  }
 +  gmx_fio_fclose(fp);
 +  gmx_atomprop_destroy(aps);
 +}
 +
 +void write_sto_conf(const char *outfile,const char *title,t_atoms *atoms, 
 +		    rvec x[],rvec *v,int ePBC,matrix box)
 +{
 +  FILE       *out;
 +  int        ftp;
 +  t_trxframe fr;
 +  
 +  ftp=fn2ftp(outfile);
 +  switch (ftp) {
 +  case efGRO:
 +    write_conf(outfile, title, atoms, x, v, box);
 +    break;
 +  case efG96:
 +    clear_trxframe(&fr,TRUE);
 +    fr.bTitle = TRUE;
 +    fr.title = title;
 +    fr.natoms = atoms->nr;
 +    fr.bAtoms = TRUE;
 +    fr.atoms = atoms;
 +    fr.bX = TRUE;
 +    fr.x = x;
 +    if (v) {
 +      fr.bV = TRUE;
 +      fr.v = v;
 +    }
 +    fr.bBox = TRUE;
 +    copy_mat(box,fr.box);
 +    out=gmx_fio_fopen(outfile,"w");
 +    write_g96_conf(out, &fr, -1, NULL);
 +    gmx_fio_fclose(out);
 +    break;
 +  case efXYZ:
 +    write_xyz_conf(outfile,(strlen(title) > 0) ? title : outfile,atoms,x);
 +    break;
 +  case efPDB:
 +  case efBRK:
 +  case efENT:
 +    out=gmx_fio_fopen(outfile,"w");
 +    write_pdbfile(out, title, atoms, x, ePBC, box, ' ', -1,NULL,TRUE);
 +    gmx_fio_fclose(out);
 +    break;
 +  case efESP:
 +    out=gmx_fio_fopen(outfile,"w");
 +    write_espresso_conf_indexed(out, title, atoms, atoms->nr, NULL, x, v, box);
 +    gmx_fio_fclose(out);
 +    break;
 +  case efTPR:
 +  case efTPB:
 +  case efTPA:
 +    gmx_fatal(FARGS,"Sorry, can not write a topology to %s",outfile);
 +    break;
 +  default:
 +    gmx_incons("Not supported in write_sto_conf");
 +  }
 +}
 +
 +void write_sto_conf_mtop(const char *outfile,const char *title,
 +			 gmx_mtop_t *mtop,
 +			 rvec x[],rvec *v,int ePBC,matrix box)
 +{
 +  int  ftp;
 +  FILE *out;
 +  t_atoms atoms;
 +
 +  ftp=fn2ftp(outfile);
 +  switch (ftp) {
 +  case efGRO:
 +    out = gmx_fio_fopen(outfile,"w");
 +    write_hconf_mtop(out,title,mtop,3,x,v,box);
 +    gmx_fio_fclose(out);
 +    break;
 +  default:
 +    /* This is a brute force approach which requires a lot of memory.
 +     * We should implement mtop versions of all writing routines.
 +     */
 +    atoms = gmx_mtop_global_atoms(mtop);
 +    
 +    write_sto_conf(outfile,title,&atoms,x,v,ePBC,box);
 +    
 +    done_atom(&atoms);
 +    break;
 +  }
 +}
 +
 +static int get_xyz_coordnum(const char *infile)
 +{
 +  FILE *fp;
 +  int n;
 +  
 +  fp = gmx_fio_fopen(infile,"r");
 +  if (fscanf(fp,"%d",&n) != 1)
 +    gmx_fatal(FARGS,"Can not read number of atoms from %s",infile);
 +  gmx_fio_fclose(fp);
 +  
 +  return n;
 +}
 +
 +static void read_xyz_conf(const char *infile,char *title,
 +			  t_atoms *atoms,rvec *x)
 +{
 +  FILE   *fp;
 +  int    i,n;
 +  double xx,yy,zz;
 +  t_symtab *tab;
 +  char atomnm[32],buf[STRLEN];
 +  
 +  snew(tab,1);
 +  fp = gmx_fio_fopen(infile,"r");
 +  fgets2(buf,STRLEN-1,fp);
 +  if (sscanf(buf,"%d",&n) != 1)
 +    gmx_fatal(FARGS,"Can not read number of atoms from %s",infile);
 +  fgets2(buf,STRLEN-1,fp);
 +  strcpy(title,buf);
 +  for(i=0; (i<n); i++) {
 +    fgets2(buf,STRLEN-1,fp);
 +    if (sscanf(buf,"%s%lf%lf%lf",atomnm,&xx,&yy,&zz) != 4)
 +      gmx_fatal(FARGS,"Can not read coordinates from %s",infile);
 +    atoms->atomname[i] = put_symtab(tab,atomnm);
 +    x[i][XX] = xx*0.1;
 +    x[i][YY] = yy*0.1;
 +    x[i][ZZ] = zz*0.1;
 +  }
 +  gmx_fio_fclose(fp);
 +}
 +
 +void get_stx_coordnum(const char *infile,int *natoms)
 +{
 +  FILE *in;
 +  int ftp,tpxver,tpxgen;
 +  t_trxframe fr;
 +  char g96_line[STRLEN+1];
 +
 +  ftp=fn2ftp(infile);
 +  range_check(ftp,0,efNR);
 +  switch (ftp) {
 +  case efGRO:
 +    get_coordnum(infile, natoms);
 +    break;
 +  case efG96:
 +    in=gmx_fio_fopen(infile,"r");
 +    fr.title = NULL;
 +    fr.natoms = -1;
 +    fr.atoms = NULL;
 +    fr.x = NULL;
 +    fr.v = NULL;
 +    fr.f = NULL;
 +    *natoms=read_g96_conf(in,infile,&fr,g96_line);
 +    gmx_fio_fclose(in);
 +    break;
 +  case efXYZ:
 +    *natoms = get_xyz_coordnum(infile);
 +    break;
 +  case efPDB:
 +  case efBRK:
 +  case efENT:
 +    in=gmx_fio_fopen(infile,"r");
 +    get_pdb_coordnum(in, natoms);
 +    gmx_fio_fclose(in);
 +    break;
 +  case efESP:
 +    *natoms = get_espresso_coordnum(infile);
 +    break;
 +  case efTPA:
 +  case efTPB:
 +  case efTPR: {
 +    t_tpxheader tpx;
 +    
 +    read_tpxheader(infile,&tpx,TRUE,&tpxver,&tpxgen);
 +    *natoms = tpx.natoms;
 +    break;
 +  }
 +  default:
 +    gmx_fatal(FARGS,"File type %s not supported in get_stx_coordnum",
 +	      ftp2ext(ftp));
 +  }
 +}
 +
 +void read_stx_conf(const char *infile,char *title,t_atoms *atoms, 
 +		   rvec x[],rvec *v,int *ePBC,matrix box)
 +{
 +  FILE       *in;
 +  char       buf[256];
 +  gmx_mtop_t *mtop;
 +  t_topology top;
 +  t_trxframe fr;
 +  int        i,ftp,natoms;
 +  real       d;
 +  char       g96_line[STRLEN+1];
 +
 +  if (atoms->nr == 0)
 +    fprintf(stderr,"Warning: Number of atoms in %s is 0\n",infile);
 +  else if (atoms->atom == NULL)
 +    gmx_mem("Uninitialized array atom");
 +  
 +  if (ePBC)
 +    *ePBC = -1;
 +
 +  ftp=fn2ftp(infile);
 +  switch (ftp) {
 +  case efGRO:
 +    read_whole_conf(infile, title, atoms, x, v, box);
 +    break;
 +  case efXYZ:
 +    read_xyz_conf(infile,title,atoms,x);
 +    break;
 +  case efG96:
-     fr.title = title;
++    fr.title  = NULL;
 +    fr.natoms = atoms->nr;
 +    fr.atoms = atoms;
 +    fr.x = x;
 +    fr.v = v;
 +    fr.f = NULL;
 +    in = gmx_fio_fopen(infile,"r");
 +    read_g96_conf(in, infile, &fr, g96_line);
 +    gmx_fio_fclose(in);
 +    copy_mat(fr.box,box);
++    strncpy(title, fr.title, STRLEN);
 +    break;
 +  case efPDB:
 +  case efBRK:
 +  case efENT:
 +    read_pdb_conf(infile, title, atoms, x, ePBC, box, TRUE, NULL);
 +    break;
 +  case efESP:
 +    read_espresso_conf(infile,atoms,x,v,box);
 +    break;
 +  case efTPR:
 +  case efTPB:
 +  case efTPA: 
 +    snew(mtop,1);
 +    i = read_tpx(infile,NULL,box,&natoms,x,v,NULL,mtop);
 +    if (ePBC)
 +      *ePBC = i;
 +    
 +    strcpy(title,*(mtop->name));
 +    
 +    /* Free possibly allocated memory */
 +    done_atom(atoms);
 +    
 +    *atoms = gmx_mtop_global_atoms(mtop);
 +    top = gmx_mtop_t_to_t_topology(mtop);
 +    tpx_make_chain_identifiers(atoms,&top.mols);
 +		
 +    sfree(mtop);
 +    /* The strings in the symtab are still in use in the returned t_atoms
 +     * structure, so we should not free them. But there is no place to put the
 +     * symbols; the only choice is to leak the memory...
 +     * So we clear the symbol table before freeing the topology structure. */
 +    free_symtab(&top.symtab);
 +    done_top(&top);
 +		  
 +    break;
 +  default:
 +    gmx_incons("Not supported in read_stx_conf");
 +  }
 +}
 +
diff --cc src/gromacs/gmxlib/gmx_omp.c
index 29622e9a7e,0000000000..a06c43d902
mode 100644,000000..100644
--- a/src/gromacs/gmxlib/gmx_omp.c
+++ b/src/gromacs/gmxlib/gmx_omp.c
@@@ -1,179 -1,0 +1,182 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by the Gromacs development team under coordination of
 + * David van der Spoel, Berk Hess, and Erik Lindahl.
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + *
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include "config.h"
 +#endif
 +
 +#ifdef GMX_OPENMP
 +#include <omp.h>
 +#endif
 +
 +#include <stdio.h>
 +
 +#include "md_logging.h"
 +#include "gmx_fatal.h"
 +#include "statutil.h"
 +#include "string2.h"
 +#include "gmx_omp.h"
 +
 +int gmx_omp_get_max_threads(void)
 +{
 +#ifdef GMX_OPENMP
 +    return omp_get_max_threads();
 +#else
 +    return 1;
 +#endif
 +}
 +
 +int gmx_omp_get_num_procs(void)
 +{
 +#ifdef GMX_OPENMP
 +    return omp_get_num_procs();
 +#else
 +    return 1;
 +#endif
 +}
 +
 +int gmx_omp_get_thread_num(void)
 +{
 +#ifdef GMX_OPENMP
 +    return omp_get_thread_num();
 +#else
 +    return 0;
 +#endif
 +}
 +
 +void gmx_omp_set_num_threads(int num_threads)
 +{
 +#ifdef GMX_OPENMP
 +    omp_set_num_threads(num_threads);
 +#else
 +    return;
 +#endif
 +}
 +
 +/*!
 + * Thread affinity set by the OpenMP library can conflict with the GROMACS
 + * internal affinity setting.
 + *
 + * While GNU OpenMP does not set affinity by default, the Intel OpenMP library
 + * does. This conflicts with the internal affinity (especially thread-MPI)
 + * setting, results in incorrectly locked threads, and causes dreadful performance.
 + *
 + * The KMP_AFFINITY environment variable is used by Intel, GOMP_CPU_AFFINITY
 + * by the GNU compilers (Intel also honors it well). If any of the variables
 + * is set, we honor it, disable the internal pinning, and warn the user.
 + * When using Intel OpenMP, we will disable affinity if the user did not set it
 + * anually through one of the aforementioned environment variables.
 + *
 + * Note that the Intel OpenMP affinity disabling iwll only take effect if this
 + * function is called before the OpenMP library gets initialized which happens
 + * when the first call is made into a compilation unit that contains OpenMP
 + * pragmas.
 + */
 +void gmx_omp_check_thread_affinity(FILE *fplog, const t_commrec *cr,
 +                                   gmx_hw_opt_t *hw_opt)
 +{
 +    gmx_bool bKmpAffinitySet, bGompCpuAffinitySet;
 +    char *kmp_env, *gomp_env;
 +
 +    /* no need to worry if internal thread pinning is turned off */
-     if (!hw_opt->bThreadPinning)
++    if (hw_opt->thread_affinity == threadaffOFF)
 +    {
 +        return;
 +    }
 +
 +#if defined(GMX_OPENMP)
 +
 +    /* We assume that the affinity setting is available on all platforms
 +     * gcc supports. Even if this is not the case (e.g. Mac OS) the user
 +     * will only get a warning.*/
 +    bGompCpuAffinitySet = FALSE;
 +    gomp_env = NULL;
 +#if defined(__GNUC__)
 +    gomp_env = getenv("GOMP_CPU_AFFINITY");
 +    bGompCpuAffinitySet = (gomp_env != NULL);
 +#endif /* __GNUC__ */
 +
 +    bKmpAffinitySet = FALSE;
 +#if defined(__INTEL_COMPILER)
 +    kmp_env = getenv("KMP_AFFINITY");
 +    bKmpAffinitySet = (kmp_env != NULL);
 +
 +    /* disable Intel OpenMP affinity if neither KMP_AFFINITY nor
 +     * GOMP_CPU_AFFINITY is set (Intel uses the GNU env. var as well) */
 +    if (!bKmpAffinitySet && !bGompCpuAffinitySet)
 +    {
 +        int retval;
 +
 +#ifdef _MSC_VER
 +        /* Windows not POSIX */
 +        retval = _putenv_s("KMP_AFFINITY", "disabled");
 +#else
 +        /* POSIX */
 +        retval = setenv("KMP_AFFINITY", "disabled", 0);
 +#endif /* _MSC_VER */
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Disabling Intel OpenMP affinity by setting the KMP_AFFINITY=disabled env. var.\n");
 +        }
 +
 +        if (retval != 0)
 +        {
 +            gmx_warning("Disabling Intel OpenMp affinity setting failed!");
 +        }
 +    }
 +
 +    /* turn off internal pinning KMP_AFFINITY != "disabled" */
 +    if (bKmpAffinitySet && (gmx_strncasecmp(kmp_env, "disabled", 8) != 0))
 +    {
-         md_print_warn(cr, fplog, "WARNING: KMP_AFFINITY set, will turn off %s internal affinity\n"
-                       "         setting as the two can conflict and cause performance degradation.\n"
-                       "         To keep using the %s internal affinity setting, set the\n"
-                       "         KMP_AFFINITY=disabled environment variable.",
++        /* TODO: with -pin auto we should only warn when using all cores */
++        md_print_warn(cr, fplog,
++                      "NOTE: KMP_AFFINITY set, will turn off %s internal affinity\n"
++                      "      setting as the two can conflict and cause performance degradation.\n"
++                      "      To keep using the %s internal affinity setting, set the\n"
++                      "      KMP_AFFINITY=disabled environment variable.",
 +                      ShortProgram(), ShortProgram());
 +
-         hw_opt->bThreadPinning = FALSE;
++        hw_opt->thread_affinity = threadaffOFF;
 +    }
 +#endif /* __INTEL_COMPILER */
 +
 +#if defined(__INTEL_COMPILER) || defined(__GNUC__)
 +    /* turn off internal pinning f GOMP_CPU_AFFINITY is set & non-empty */
 +    if (bGompCpuAffinitySet && gomp_env != NULL && gomp_env != '\0')
 +    {
++        /* TODO: with -pin auto we should only warn when using all cores */
 +        md_print_warn(cr, fplog,
-                       "WARNING: GOMP_CPU_AFFINITY set, will turn off %s internal affinity\n"
-                       "         setting as the two can conflict and cause performance degradation.\n"
-                       "         To keep using the %s internal affinity setting, unset the\n"
-                       "         GOMP_CPU_AFFINITY environment variable.",
++                      "NOTE: GOMP_CPU_AFFINITY set, will turn off %s internal affinity\n"
++                      "      setting as the two can conflict and cause performance degradation.\n"
++                      "      To keep using the %s internal affinity setting, unset the\n"
++                      "      GOMP_CPU_AFFINITY environment variable.",
 +                      ShortProgram(), ShortProgram());
 +
-         hw_opt->bThreadPinning = FALSE;
++        hw_opt->thread_affinity = threadaffOFF;
 +    }
 +#endif /* __INTEL_COMPILER || __GNUC__ */
 +
 +#endif /* GMX_OPENMP */
 +}
diff --cc src/gromacs/gmxlib/gmx_thread_affinity.c
index 0000000000,0000000000..c67b961a2a
new file mode 100644
--- /dev/null
+++ b/src/gromacs/gmxlib/gmx_thread_affinity.c
@@@ -1,0 -1,0 +1,430 @@@
++/*
++ * This file is part of the GROMACS molecular simulation package.
++ *
++ * Copyright (c) 2012, by the GROMACS development team, led by
++ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
++ * others, as listed in the AUTHORS file in the top-level source
++ * directory and at http://www.gromacs.org.
++ *
++ * GROMACS is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public License
++ * as published by the Free Software Foundation; either version 2.1
++ * of the License, or (at your option) any later version.
++ *
++ * GROMACS is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with GROMACS; if not, see
++ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
++ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
++ *
++ * If you want to redistribute modifications to GROMACS, please
++ * consider that scientific software is very special. Version
++ * control is crucial - bugs must be traceable. We will be happy to
++ * consider code for inclusion in the official distribution, but
++ * derived work must not be called official GROMACS. Details are found
++ * in the README & COPYING files - if they are missing, get the
++ * official version at http://www.gromacs.org.
++ *
++ * To help us fund GROMACS development, we humbly ask that you cite
++ * the research papers on the package. Check out http://www.gromacs.org.
++ */
++#ifdef HAVE_CONFIG_H
++#include <config.h>
++#endif
++#if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
++#define _GNU_SOURCE
++#include <sched.h>
++#include <sys/syscall.h>
++#endif
++#include <assert.h>
++#include <stdio.h>
++#include "typedefs.h"
++#include "types/commrec.h"
++#include "types/hw_info.h"
++#include "gmx_cpuid.h"
++#include "gmx_omp.h"
++#include "gmx_omp_nthreads.h"
++#include "mdrun.h"
++#include "md_logging.h"
++#include "statutil.h"
++#include "gmx_thread_affinity.h"
++
++#include "thread_mpi/threads.h"
++
++
++static int
++get_thread_affinity_layout(FILE *fplog,
++                           const t_commrec *cr,
++                           const gmx_hw_info_t * hwinfo,
++                           int nthreads,
++                           int pin_offset, int * pin_stride,
++                           const int **locality_order)
++{
++    int         nhwthreads,npkg,ncores,nhwthreads_per_core,rc;
++    const int * pkg_id;
++    const int * core_id;
++    const int * hwthread_id;
++
++    if (pin_offset < 0)
++    {
++        gmx_fatal(FARGS,"Negative thread pinning offset requested");
++    }
++    if (*pin_stride < 0)
++    {
++        gmx_fatal(FARGS,"Negative thread pinning stride requested");
++    }
++
++    rc = gmx_cpuid_topology(hwinfo->cpuid_info, &nhwthreads, &npkg, &ncores,
++                            &nhwthreads_per_core,
++                            &pkg_id, &core_id, &hwthread_id, locality_order);
++
++    if (rc != 0)
++    {
++        nhwthreads      = hwinfo->nthreads_hw_avail;
++        *locality_order = NULL;
++
++        if (nhwthreads <= 0)
++        {
++            /* We don't know anything about the hardware, don't pin */
++            md_print_warn(cr, fplog,
++                          "We don't know how many logical cores we have, will not pin threads");
++
++            return -1;
++        }
++    }
++
++    if (pin_offset + nthreads > nhwthreads)
++    {
++        /* We are oversubscribing, don't pin */
++        md_print_warn(NULL, fplog,
++                      "More threads requested than available logical cores, will not pin threads");
++
++        return -1;
++    }
++
++    /* Check if we need to choose the pinning stride */
++    if (*pin_stride == 0)
++    {
++        if (rc == 0 && pin_offset + nthreads*nhwthreads_per_core <= nhwthreads)
++        {
++            /* Put one thread on each physical core */
++            *pin_stride = nhwthreads_per_core;
++        }
++        else
++        {
++            /* We don't know if we have SMT, and if we do, we don't know
++             * if hw threads in the same physical core are consecutive.
++             * Without SMT the pinning layout should not matter too much.
++             * so we assume a consecutive layout and maximally spread out"
++             * the threads at equal threads per core.
++             * Note that IBM is the major non-x86 case with cpuid support
++             * and probably threads are already pinned by the queuing system,
++             * so we wouldn't end up here in the first place.
++             */
++            *pin_stride = (nhwthreads - pin_offset)/nthreads;
++        }
++
++        if (fplog != NULL)
++        {
++            fprintf(fplog,"Pinning threads with a logical core stride of %d\n",
++                    *pin_stride);
++        }
++    }
++    else
++    {
++        if (pin_offset + nthreads*(*pin_stride) > nhwthreads)
++        {
++            /* We are oversubscribing, don't pin */
++            md_print_warn(NULL, fplog,
++                          "The requested pinning stride is too large for the available logical cores, will not pin threads");
++
++            return -1;
++        }
++    }
++
++    return 0;
++}
++
++/* Set CPU affinity. Can be important for performance.
++   On some systems (e.g. Cray) CPU Affinity is set by default.
++   But default assigning doesn't work (well) with only some ranks
++   having threads. This causes very low performance.
++   External tools have cumbersome syntax for setting affinity
++   in the case that only some ranks have threads.
++   Thus it is important that GROMACS sets the affinity internally
++   if only PME is using threads.
++*/
++void
++gmx_set_thread_affinity(FILE *fplog,
++                        const t_commrec *cr,
++                        gmx_hw_opt_t *hw_opt,
++                        int nthreads_pme,
++                        const gmx_hw_info_t *hwinfo,
++                        const t_inputrec *inputrec)
++{
++    int nth_affinity_set, thread_id_node, thread_id,
++        nthread_local, nthread_node, nthread_hw_max, nphyscore;
++    int offset;
++    const int *locality_order;
++    int rc;
++
++    if (hw_opt->thread_affinity == threadaffOFF)
++    {
++        /* Nothing to do */
++        return;
++    }
++
++#ifndef __APPLE__
++    /* If the tMPI thread affinity setting is not supported encourage the user
++     * to report it as it's either a bug or an exotic platform which we might
++     * want to support. */
++    if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
++    {
++        md_print_warn(NULL, fplog,
++                      "Can not set thread affinities on the current platform. On NUMA systems this\n"
++                      "can cause performance degradation. If you think your platform should support\n"
++                      "setting affinities, contact the GROMACS developers.");
++        return;
++    }
++#endif /* __APPLE__ */
++
++    /* threads on this MPI process or TMPI thread */
++    if (cr->duty & DUTY_PP)
++    {
++        nthread_local = gmx_omp_nthreads_get(emntNonbonded);
++    }
++    else
++    {
++        nthread_local = gmx_omp_nthreads_get(emntPME);
++    }
++
++    /* map the current process to cores */
++    thread_id_node = 0;
++    nthread_node = nthread_local;
++#ifdef GMX_MPI
++    if (PAR(cr) || MULTISIM(cr))
++    {
++        /* We need to determine a scan of the thread counts in this
++         * compute node.
++         */
++        MPI_Comm comm_intra;
++
++        MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
++                       &comm_intra);
++        MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
++        /* MPI_Scan is inclusive, but here we need exclusive */
++        thread_id_node -= nthread_local;
++        /* Get the total number of threads on this physical node */
++        MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
++        MPI_Comm_free(&comm_intra);
++    }
++#endif
++
++    if (hw_opt->thread_affinity == threadaffAUTO &&
++        nthread_node != hwinfo->nthreads_hw_avail)
++    {
++        if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail)
++        {
++            md_print_warn(cr, fplog,
++                          "NOTE: The number of threads is not equal to the number of (logical) cores\n"
++                          "      and the -pin option is set to auto: will not pin thread to cores.\n"
++                          "      This can lead to significant performance degradation.\n"
++                          "      Consider using -pin on (and -pinoffset in case you run multiple jobs).\n");
++        }
++
++        return;
++    }
++
++    offset = 0;
++    if (hw_opt->core_pinning_offset != 0)
++    {
++        offset = hw_opt->core_pinning_offset;
++        md_print_info(cr,fplog,"Applying core pinning offset %d\n", offset);
++    }
++
++    rc = get_thread_affinity_layout(fplog, cr, hwinfo,
++                                    nthread_node,
++                                    offset, &hw_opt->core_pinning_stride,
++                                    &locality_order);
++    if (rc != 0)
++    {
++        /* Incompatible layout, don't pin, warning was already issued */
++        return;
++    }
++
++    /* Set the per-thread affinity. In order to be able to check the success
++     * of affinity settings, we will set nth_affinity_set to 1 on threads
++     * where the affinity setting succeded and to 0 where it failed.
++     * Reducing these 0/1 values over the threads will give the total number
++     * of threads on which we succeeded.
++     */
++    nth_affinity_set = 0;
++#pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
++                     reduction(+:nth_affinity_set)
++    {
++        int      index,core;
++        gmx_bool setaffinity_ret;
++
++        thread_id       = gmx_omp_get_thread_num();
++        thread_id_node += thread_id;
++        index           = offset + thread_id_node*hw_opt->core_pinning_stride;
++        if (locality_order != NULL)
++        {
++            core = locality_order[index];
++        }
++        else
++        {
++            core = index;
++        }
++
++        setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
++
++        /* store the per-thread success-values of the setaffinity */
++        nth_affinity_set = (setaffinity_ret == 0);
++
++        if (debug)
++        {
++            fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
++                    cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
++        }
++    }
++
++    if (nth_affinity_set > nthread_local)
++    {
++        char msg[STRLEN];
++
++        sprintf(msg, "Looks like we have set affinity for more threads than "
++                "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
++        gmx_incons(msg);
++    }
++    else
++    {
++        /* check & warn if some threads failed to set their affinities */
++        if (nth_affinity_set != nthread_local)
++        {
++            char sbuf1[STRLEN], sbuf2[STRLEN];
++
++            /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
++            sbuf1[0] = sbuf2[0] = '\0';
++#ifdef GMX_MPI
++#ifdef GMX_THREAD_MPI
++            sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
++#else /* GMX_LIB_MPI */
++            sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
++#endif
++#endif /* GMX_MPI */
++
++            if (nthread_local > 1)
++            {
++                sprintf(sbuf2, "of %d/%d thread%s ",
++                        nthread_local - nth_affinity_set, nthread_local,
++                        (nthread_local - nth_affinity_set) > 1 ? "s" : "");
++            }
++
++            md_print_warn(NULL, fplog,
++                          "NOTE: %sAffinity setting %sfailed.\n"
++                          "      This can cause performance degradation!",
++                          sbuf1, sbuf2);
++        }
++    }
++}
++
++/* Check the process affinity mask and if it is found to be non-zero,
++ * will honor it and disable mdrun internal affinity setting.
++ * Note that this will only work on Linux as we use a GNU feature.
++ */
++void
++gmx_check_thread_affinity_set(FILE *fplog, const t_commrec *cr,
++                              gmx_hw_opt_t *hw_opt, int ncpus,
++                              gmx_bool bAfterOpenmpInit)
++{
++#ifdef HAVE_SCHED_GETAFFINITY
++    cpu_set_t mask_current;
++    int       i, ret, cpu_count, cpu_set;
++    gmx_bool  bAllSet;
++
++    assert(hw_opt);
++    if (hw_opt->thread_affinity == threadaffOFF)
++    {
++        /* internal affinity setting is off, don't bother checking process affinity */
++        return;
++    }
++
++    CPU_ZERO(&mask_current);
++    if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
++    {
++        /* failed to query affinity mask, will just return */
++        if (debug)
++        {
++            fprintf(debug, "Failed to query affinity mask (error %d)", ret);
++        }
++        return;
++    }
++
++    /* Before proceeding with the actual check, make sure that the number of
++     * detected CPUs is >= the CPUs in the current set.
++     * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
++#ifdef CPU_COUNT
++    if (ncpus < CPU_COUNT(&mask_current))
++    {
++        if (debug)
++        {
++            fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
++                    ncpus, CPU_COUNT(&mask_current));
++        }
++        return;
++    }
++#endif /* CPU_COUNT */
++
++    bAllSet = TRUE;
++    for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
++    {
++        bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
++    }
++
++    if (!bAllSet)
++    {
++        if (hw_opt->thread_affinity == threadaffAUTO)
++        {
++            if (!bAfterOpenmpInit)
++            {
++                md_print_warn(cr, fplog,
++                              "Non-default thread affinity set, disabling internal thread affinity");
++            }
++            else
++            {
++                md_print_warn(cr, fplog,
++                              "Non-default thread affinity set probably by the OpenMP library,\n"
++                              "disabling internal thread affinity");
++            }
++            hw_opt->thread_affinity = threadaffOFF;
++        }
++        else
++        {
++            /* Only warn once, at the last check (bAfterOpenmpInit==TRUE) */
++            if (bAfterOpenmpInit)
++            {
++                md_print_warn(cr, fplog,
++                              "Overriding thread affinity set outside %s\n",
++                              ShortProgram());
++            }
++        }
++
++        if (debug)
++        {
++            fprintf(debug, "Non-default affinity mask found\n");
++        }
++    }
++    else
++    {
++        if (debug)
++        {
++            fprintf(debug, "Default affinity mask found\n");
++        }
++    }
++#endif /* HAVE_SCHED_GETAFFINITY */
++}
diff --cc src/gromacs/legacyheaders/gmx_thread_affinity.h
index 0000000000,0000000000..c8de7bf3e1
new file mode 100644
--- /dev/null
+++ b/src/gromacs/legacyheaders/gmx_thread_affinity.h
@@@ -1,0 -1,0 +1,77 @@@
++/*
++ * This file is part of the GROMACS molecular simulation package.
++ *
++ * Copyright (c) 2012, by the GROMACS development team, led by
++ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
++ * others, as listed in the AUTHORS file in the top-level source
++ * directory and at http://www.gromacs.org.
++ *
++ * GROMACS is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public License
++ * as published by the Free Software Foundation; either version 2.1
++ * of the License, or (at your option) any later version.
++ *
++ * GROMACS is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with GROMACS; if not, see
++ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
++ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
++ *
++ * If you want to redistribute modifications to GROMACS, please
++ * consider that scientific software is very special. Version
++ * control is crucial - bugs must be traceable. We will be happy to
++ * consider code for inclusion in the official distribution, but
++ * derived work must not be called official GROMACS. Details are found
++ * in the README & COPYING files - if they are missing, get the
++ * official version at http://www.gromacs.org.
++ *
++ * To help us fund GROMACS development, we humbly ask that you cite
++ * the research papers on the package. Check out http://www.gromacs.org.
++ */
++#ifndef GMX_THREAD_AFFINITY_H_
++#define GMX_THREAD_AFFINITY_H_
++#include "types/commrec.h"
++#include "typedefs.h"
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++#if 0
++} /* fixes auto-indentation problems */
++#endif
++
++/* Sets the thread affinity using the requested setting stored in hw_opt.
++ * The hardware topologu is requested from hwinfo, when present.
++ */
++void
++gmx_set_thread_affinity(FILE *fplog,
++                        const t_commrec *cr,
++                        gmx_hw_opt_t *hw_opt,
++                        int nthreads_pme,
++                        const gmx_hw_info_t *hwinfo,
++                        const t_inputrec *inputrec);
++
++/* Check the process affinity mask and if it is found to be non-zero,
++ * will honor it and disable mdrun internal affinity setting.
++ * This function should be called first before the OpenMP library gets
++ * initialized with the last argument FALSE (which will detect affinity
++ * set by external tools like taskset), and later, after the OpenMP
++ * initialization, with the last argument TRUE to detect affinity changes
++ * made by the OpenMP library.
++ *
++ * Note that this will only work on Linux as we use a GNU feature.
++ */
++void
++gmx_check_thread_affinity_set(FILE *fplog, const t_commrec *cr,
++                              gmx_hw_opt_t *hw_opt, int ncpus,
++                              gmx_bool bAfterOpenmpInit);
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif /* GMX_THREAD_AFFINITY_H_ */
diff --cc src/gromacs/legacyheaders/mdrun.h
index 3c802d57f3,0000000000..3fb195f297
mode 100644,000000..100644
--- a/src/gromacs/legacyheaders/mdrun.h
+++ b/src/gromacs/legacyheaders/mdrun.h
@@@ -1,206 -1,0 +1,212 @@@
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gromacs Runs On Most of All Computer Systems
 + */
 +
 +#ifndef _mdrun_h
 +#define _mdrun_h
 +
 +#include <stdio.h>
 +#include <time.h>
 +#include "typedefs.h"
 +#include "network.h"
 +#include "sim_util.h"
 +#include "tgroup.h"
 +#include "filenm.h"
 +#include "mshift.h"
 +#include "force.h"
 +#include "edsam.h"
 +#include "mdebin.h"
 +#include "vcm.h"
 +#include "vsite.h"
 +#include "pull.h"
 +#include "update.h"
 +#include "types/membedt.h"
 +#include "types/globsig.h"
 +
 +
 +#ifdef GMX_THREAD_MPI
 +#include "thread_mpi/threads.h"
 +#endif
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +#define MD_POLARISE       (1<<2)
 +#define MD_IONIZE         (1<<3)
 +#define MD_RERUN          (1<<4)
 +#define MD_RERUN_VSITE    (1<<5)
 +#define MD_FFSCAN         (1<<6)
 +#define MD_SEPPOT         (1<<7)
 +#define MD_PARTDEC        (1<<9)
 +#define MD_DDBONDCHECK    (1<<10)
 +#define MD_DDBONDCOMM     (1<<11)
 +#define MD_CONFOUT        (1<<12)
 +#define MD_REPRODUCIBLE   (1<<13)
 +#define MD_READ_RNG       (1<<14)
 +#define MD_APPENDFILES    (1<<15)
 +#define MD_APPENDFILESSET (1<<21)
 +#define MD_KEEPANDNUMCPT  (1<<16)
 +#define MD_READ_EKIN      (1<<17)
 +#define MD_STARTFROMCPT   (1<<18)
 +#define MD_RESETCOUNTERSHALFWAY (1<<19)
 +#define MD_TUNEPME        (1<<20)
 +#define MD_TESTVERLET     (1<<22)
 +
++/* The options for the domain decomposition MPI task ordering */
 +enum {
 +  ddnoSEL, ddnoINTERLEAVE, ddnoPP_PME, ddnoCARTESIAN, ddnoNR
 +};
 +
++/* The options for the thread affinity setting, default: auto */
++enum {
++  threadaffSEL, threadaffAUTO, threadaffON, threadaffOFF, threadaffNR
++};
++
 +typedef struct {
 +    int      nthreads_tot;        /* Total number of threads requested (TMPI) */
 +    int      nthreads_tmpi;       /* Number of TMPI threads requested         */
 +    int      nthreads_omp;        /* Number of OpenMP threads requested       */
 +    int      nthreads_omp_pme;    /* As nthreads_omp, but for PME only nodes  */
-     gmx_bool bThreadPinning;      /* Pin OpenMP threads to cores?             */
-     gmx_bool bPinHyperthreading;  /* Pin pairs of threads to physical cores   */
-     int      core_pinning_offset; /* Physical core pinning offset             */
++    int      thread_affinity;     /* Thread affinity switch, see enum above   */
++    int      core_pinning_stride; /* Logical core pinning stride              */
++    int      core_pinning_offset; /* Logical core pinning offset              */
 +    char    *gpu_id;              /* GPU id's to use, each specified as chars */
 +} gmx_hw_opt_t;
 +
 +/* Variables for temporary use with the deform option,
 + * used in runner.c and md.c.
 + * (These variables should be stored in the tpx file.)
 + */
 +extern gmx_large_int_t     deform_init_init_step_tpx;
 +extern matrix              deform_init_box_tpx;
 +#ifdef GMX_THREAD_MPI
 +extern tMPI_Thread_mutex_t deform_init_box_mutex;
 +
 +/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
 + * the number of threads will get lowered.
 + */
 +#define MIN_ATOMS_PER_MPI_THREAD    90
 +#define MIN_ATOMS_PER_GPU           900
 +#endif
 +
 +
 +typedef double gmx_integrator_t(FILE *log,t_commrec *cr,
 +				int nfile,const t_filenm fnm[],
 +				const output_env_t oenv, gmx_bool bVerbose,
 +                                gmx_bool bCompact, int nstglobalcomm,
 +				gmx_vsite_t *vsite,gmx_constr_t constr,
 +				int stepout,
 +				t_inputrec *inputrec,
 +				gmx_mtop_t *mtop,t_fcdata *fcd,
 +				t_state *state,
 +				t_mdatoms *mdatoms,
 +				t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +				gmx_edsam_t ed, 
 +				t_forcerec *fr,
 +				int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +                                gmx_membed_t membed,
 +				real cpt_period,real max_hours,
 +				const char *deviceOptions,
 +				unsigned long Flags,
 +				gmx_runtime_t *runtime);
 +
 +/* ROUTINES from md.c */
 +
 +gmx_integrator_t do_md;
 +
 +
 +/* ROUTINES from minimize.c */
 +
 +gmx_integrator_t do_steep;
 +/* Do steepest descents EM */
 +
 +gmx_integrator_t do_cg;
 +/* Do conjugate gradient EM */
 +
 +gmx_integrator_t do_lbfgs;
 +/* Do conjugate gradient L-BFGS */
 +
 +gmx_integrator_t do_nm;
 +/* Do normal mode analysis */
 +
 +/* ROUTINES from tpi.c */
 +
 +gmx_integrator_t do_tpi;
 +/* Do test particle insertion */
 +
 +void init_npt_masses(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bInit);
 +
 +int ExpandedEnsembleDynamics(FILE *log,t_inputrec *ir, gmx_enerdata_t *enerd,
 +                             t_state *state, t_extmass *MassQ, df_history_t *dfhist,
 +                             gmx_large_int_t step, gmx_rng_t mcrng,
 +                             rvec *v, t_mdatoms *mdatoms);
 +
 +void PrintFreeEnergyInfoToFile(FILE *outfile, t_lambda *fep, t_expanded *expand, t_simtemp *simtemp, df_history_t *dfhist,
 +                               int nlam, int frequency, gmx_large_int_t step);
 +
 +void get_mc_state(gmx_rng_t rng,t_state *state);
 +
 +void set_mc_state(gmx_rng_t rng,t_state *state);
 +
 +/* check the version */
 +void check_ir_old_tpx_versions(t_commrec *cr,FILE *fplog,
 +                               t_inputrec *ir,gmx_mtop_t *mtop);
 +
 +/* Allocate and initialize node-local state entries. */
 +void set_state_entries(t_state *state,const t_inputrec *ir,int nnodes);
 +
 +/* Broadcast the data for a simulation, and allocate node-specific settings
 +   such as rng generators. */
 +void init_parallel(FILE *log, t_commrec *cr, t_inputrec *inputrec,
 +                          gmx_mtop_t *mtop);
 +
 +int mdrunner(gmx_hw_opt_t *hw_opt,
 +	     FILE *fplog,t_commrec *cr,int nfile,
 +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +             gmx_bool bCompact, int nstglobalcomm, ivec ddxyz,int dd_node_order,
 +             real rdd, real rconstr, const char *dddlb_opt,real dlb_scale,
 +	     const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +	     const char *nbpu_opt,
 +	     int nsteps_cmdline, int nstepout, int resetstep,
 +	     int nmultisim, int repl_ex_nst, int repl_ex_nex,
 +             int repl_ex_seed, real pforce,real cpt_period,real max_hours,
 +	     const char *deviceOptions, unsigned long Flags);
 +/* Driver routine, that calls the different methods */
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif	/* _mdrun_h */
diff --cc src/programs/mdrun/mdrun.c
index c801a2f684,0000000000..6286afeade
mode 100644,000000..100644
--- a/src/programs/mdrun/mdrun.c
+++ b/src/programs/mdrun/mdrun.c
@@@ -1,740 -1,0 +1,751 @@@
 +/*  -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "macros.h"
 +#include "copyrite.h"
 +#include "main.h"
 +#include "statutil.h"
 +#include "smalloc.h"
 +#include "futil.h"
 +#include "smalloc.h"
 +#include "edsam.h"
 +#include "mdrun.h"
 +#include "xmdrun.h"
 +#include "checkpoint.h"
 +#ifdef GMX_THREAD_MPI
 +#include "thread_mpi.h"
 +#endif
 +
 +/* afm stuf */
 +#include "pull.h"
 +
 +int cmain(int argc,char *argv[])
 +{
 +  const char *desc[] = {
 +    "The [TT]mdrun[tt] program is the main computational chemistry engine",
 +    "within GROMACS. Obviously, it performs Molecular Dynamics simulations,",
 +    "but it can also perform Stochastic Dynamics, Energy Minimization,",
 +    "test particle insertion or (re)calculation of energies.",
 +    "Normal mode analysis is another option. In this case [TT]mdrun[tt]",
 +    "builds a Hessian matrix from single conformation.",
 +    "For usual Normal Modes-like calculations, make sure that",
 +    "the structure provided is properly energy-minimized.",
 +    "The generated matrix can be diagonalized by [TT]g_nmeig[tt].[PAR]",
 +    "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])",
 +    "and distributes the topology over nodes if needed.",
 +    "[TT]mdrun[tt] produces at least four output files.",
 +    "A single log file ([TT]-g[tt]) is written, unless the option",
 +    "[TT]-seppot[tt] is used, in which case each node writes a log file.",
 +    "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and",
 +    "optionally forces.",
 +    "The structure file ([TT]-c[tt]) contains the coordinates and",
 +    "velocities of the last step.",
 +    "The energy file ([TT]-e[tt]) contains energies, the temperature,",
 +    "pressure, etc, a lot of these things are also printed in the log file.",
 +    "Optionally coordinates can be written to a compressed trajectory file",
 +    "([TT]-x[tt]).[PAR]",
 +    "The option [TT]-dhdl[tt] is only used when free energy calculation is",
 +    "turned on.[PAR]",
 +    "A simulation can be run in parallel using two different parallelization",
 +    "schemes: MPI parallelization and/or OpenMP thread parallelization.",
 +    "The MPI parallelization uses multiple processes when [TT]mdrun[tt] is",
 +    "compiled with a normal MPI library or threads when [TT]mdrun[tt] is",
 +    "compiled with the GROMACS built-in thread-MPI library. OpenMP threads",
 +    "are supported when mdrun is compiled with OpenMP. Full OpenMP support",
 +    "is only available with the Verlet cut-off scheme, with the (older)",
 +    "group scheme only PME-only processes can use OpenMP parallelization.",
 +    "In all cases [TT]mdrun[tt] will by default try to use all the available",
 +    "hardware resources. With a normal MPI library only the options",
 +    "[TT]-ntomp[tt] (with the Verlet cut-off scheme) and [TT]-ntomp_pme[tt],",
 +    "for PME-only processes, can be used to control the number of threads.",
 +    "With thread-MPI there are additional options [TT]-nt[tt], which sets",
 +    "the total number of threads, and [TT]-ntmpi[tt], which sets the number",
 +    "of thread-MPI threads.",
 +    "Note that using combined MPI+OpenMP parallelization is almost always",
 +    "slower than single parallelization, except at the scaling limit, where",
 +    "especially OpenMP parallelization of PME reduces the communication cost.",
 +    "OpenMP-only parallelization is much faster than MPI-only parallelization",
 +    "on a single CPU(-die). Since we currently don't have proper hardware",
 +    "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only",
 +    "automatically use OpenMP-only parallelization when you use up to 4",
 +    "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16",
 +    "threads with Intel Sandy Bridge or newer CPUs. Otherwise MPI-only",
 +    "parallelization is used (except with GPUs, see below).",
 +    "[PAR]",
 +    "To quickly test the performance of the new Verlet cut-off scheme",
 +    "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use",
 +    "the [TT]-testverlet[tt] option. This should not be used for production,",
 +    "since it can slightly modify potentials and it will remove charge groups",
 +    "making analysis difficult, as the [TT].tpr[tt] file will still contain",
 +    "charge groups. For production simulations it is highly recommended",
 +    "to specify [TT]cutoff-scheme = Verlet[tt] in the [TT].mdp[tt] file.",
 +    "[PAR]",
 +    "With GPUs (only supported with the Verlet cut-off scheme), the number",
 +    "of GPUs should match the number of MPI processes or MPI threads,",
 +    "excluding PME-only processes/threads. With thread-MPI the number",
 +    "of MPI threads will automatically be set to the number of GPUs detected.",
 +    "When you want to use a subset of the available GPUs, you can use",
 +    "the [TT]-gpu_id[tt] option, where GPU id's are passed as a string,",
 +    "e.g. 02 for using GPUs 0 and 2. When you want different GPU id's",
 +    "on different nodes of a compute cluster, use the GMX_GPU_ID environment",
 +    "variable instead. The format for GMX_GPU_ID is identical to ",
 +    "[TT]-gpu_id[tt], but an environment variable can have different values",
 +    "on different nodes of a cluster.",
 +    "[PAR]",
 +    "When using PME with separate PME nodes or with a GPU, the two major",
 +    "compute tasks, the non-bonded force calculation and the PME calculation",
 +    "run on different compute resources. If this load is not balanced,",
 +    "some of the resources will be idle part of time. With the Verlet",
 +    "cut-off scheme this load is automatically balanced when the PME load",
 +    "is too high (but not when it is too low). This is done by scaling",
 +    "the Coulomb cut-off and PME grid spacing by the same amount. In the first",
 +    "few hundred steps different settings are tried and the fastest is chosen",
 +    "for the rest of the simulation. This does not affect the accuracy of",
 +    "the results, but it does affect the decomposition of the Coulomb energy",
 +    "into particle and mesh contributions. The auto-tuning can be turned off",
 +    "with the option [TT]-notunepme[tt].",
 +    "[PAR]",
-     "When compiled with OpenMP on Linux, [TT]mdrun[tt] pins threads to cores,",
++    "[TT]mdrun[tt] pins (sets affinity of) threads to specific cores,",
++    "when all (logical) cores on a compute node are used by [TT]mdrun[tt],",
++    "even when no multi-threading is used,",
 +    "as this usually results in significantly better performance.",
-     "If you don't want this, use [TT]-nopin[tt].",
-     "With Intel CPUs with hyper-threading enabled, you should pin",
-     "consecutive threads to the same physical core for optimal",
-     "performance when you use virtual cores. This is done automatically",
-     "when you use more than half of the virtual cores. It can also be set",
-     "manually with [TT]-pinht[tt], e.g. for running multiple simulations",
-     "on one compute node.",
++    "If the queuing systems or the OpenMP library pinned threads, we honor",
++    "this and don't pin again, even though the layout may be sub-optimal.",
++    "If you want to have [TT]mdrun[tt] override an already set thread affinity",
++    "or pin threads when using less cores, use [TT]-pin on[tt].",
++    "With SMT (simultaneous multithreading), e.g. Intel Hyper-Threading,",
++    "there are multiple logical cores per physical core.",
++    "The option [TT]-pinstride[tt] sets the stride in logical cores for",
++    "pinning consecutive threads. Without SMT, 1 is usually the best choice.",
++    "With Intel Hyper-Threading 2 is best when using half or less of the",
++    "logical cores, 1 otherwise. The default value of 0 do exactly that:",
++    "it minimizes the threads per logical core, to optimize performance.",
++    "If you want to run multiple mdrun jobs on the same physical node,"
++    "you should set [TT]-pinstride[tt] to 1 when using all logical cores.",
 +    "When running multiple mdrun (or other) simulations on the same physical",
 +    "node, some simulations need to start pinning from a non-zero core",
 +    "to avoid overloading cores; with [TT]-pinoffset[tt] you can specify",
-     "the offset in (physical) cores for pinning.",
++    "the offset in logical cores for pinning.",
 +    "[PAR]",
 +    "When [TT]mdrun[tt] is started using MPI with more than 1 process",
 +    "or with thread-MPI with more than 1 thread, MPI parallelization is used.",
 +    "By default domain decomposition is used, unless the [TT]-pd[tt]",
 +    "option is set, which selects particle decomposition.",
 +    "[PAR]",
 +    "With domain decomposition, the spatial decomposition can be set",
 +    "with option [TT]-dd[tt]. By default [TT]mdrun[tt] selects a good decomposition.",
 +    "The user only needs to change this when the system is very inhomogeneous.",
 +    "Dynamic load balancing is set with the option [TT]-dlb[tt],",
 +    "which can give a significant performance improvement,",
 +    "especially for inhomogeneous systems. The only disadvantage of",
 +    "dynamic load balancing is that runs are no longer binary reproducible,",
 +    "but in most cases this is not important.",
 +    "By default the dynamic load balancing is automatically turned on",
 +    "when the measured performance loss due to load imbalance is 5% or more.",
 +    "At low parallelization these are the only important options",
 +    "for domain decomposition.",
 +    "At high parallelization the options in the next two sections",
 +    "could be important for increasing the performace.",
 +    "[PAR]",
 +    "When PME is used with domain decomposition, separate nodes can",
 +    "be assigned to do only the PME mesh calculation;",
 +    "this is computationally more efficient starting at about 12 nodes.",
 +    "The number of PME nodes is set with option [TT]-npme[tt],",
 +    "this can not be more than half of the nodes.",
 +    "By default [TT]mdrun[tt] makes a guess for the number of PME",
 +    "nodes when the number of nodes is larger than 11 or performance wise",
 +    "not compatible with the PME grid x dimension.",
 +    "But the user should optimize npme. Performance statistics on this issue",
 +    "are written at the end of the log file.",
 +    "For good load balancing at high parallelization, the PME grid x and y",
 +    "dimensions should be divisible by the number of PME nodes",
 +    "(the simulation will run correctly also when this is not the case).",
 +    "[PAR]",
 +    "This section lists all options that affect the domain decomposition.",
 +    "[PAR]",
 +    "Option [TT]-rdd[tt] can be used to set the required maximum distance",
 +    "for inter charge-group bonded interactions.",
 +    "Communication for two-body bonded interactions below the non-bonded",
 +    "cut-off distance always comes for free with the non-bonded communication.",
 +    "Atoms beyond the non-bonded cut-off are only communicated when they have",
 +    "missing bonded interactions; this means that the extra cost is minor",
 +    "and nearly indepedent of the value of [TT]-rdd[tt].",
 +    "With dynamic load balancing option [TT]-rdd[tt] also sets",
 +    "the lower limit for the domain decomposition cell sizes.",
 +    "By default [TT]-rdd[tt] is determined by [TT]mdrun[tt] based on",
 +    "the initial coordinates. The chosen value will be a balance",
 +    "between interaction range and communication cost.",
 +    "[PAR]",
 +    "When inter charge-group bonded interactions are beyond",
 +    "the bonded cut-off distance, [TT]mdrun[tt] terminates with an error message.",
 +    "For pair interactions and tabulated bonds",
 +    "that do not generate exclusions, this check can be turned off",
 +    "with the option [TT]-noddcheck[tt].",
 +    "[PAR]",
 +    "When constraints are present, option [TT]-rcon[tt] influences",
 +    "the cell size limit as well.",
 +    "Atoms connected by NC constraints, where NC is the LINCS order plus 1,",
 +    "should not be beyond the smallest cell size. A error message is",
 +    "generated when this happens and the user should change the decomposition",
 +    "or decrease the LINCS order and increase the number of LINCS iterations.",
 +    "By default [TT]mdrun[tt] estimates the minimum cell size required for P-LINCS",
 +    "in a conservative fashion. For high parallelization it can be useful",
 +    "to set the distance required for P-LINCS with the option [TT]-rcon[tt].",
 +    "[PAR]",
 +    "The [TT]-dds[tt] option sets the minimum allowed x, y and/or z scaling",
 +    "of the cells with dynamic load balancing. [TT]mdrun[tt] will ensure that",
 +    "the cells can scale down by at least this factor. This option is used",
 +    "for the automated spatial decomposition (when not using [TT]-dd[tt])",
 +    "as well as for determining the number of grid pulses, which in turn",
 +    "sets the minimum allowed cell size. Under certain circumstances",
 +    "the value of [TT]-dds[tt] might need to be adjusted to account for",
 +    "high or low spatial inhomogeneity of the system.",
 +    "[PAR]",
 +    "The option [TT]-gcom[tt] can be used to only do global communication",
 +    "every n steps.",
 +    "This can improve performance for highly parallel simulations",
 +    "where this global communication step becomes the bottleneck.",
 +    "For a global thermostat and/or barostat the temperature",
 +    "and/or pressure will also only be updated every [TT]-gcom[tt] steps.",
 +    "By default it is set to the minimum of nstcalcenergy and nstlist.[PAR]",
 +    "With [TT]-rerun[tt] an input trajectory can be given for which ",
 +    "forces and energies will be (re)calculated. Neighbor searching will be",
 +    "performed for every frame, unless [TT]nstlist[tt] is zero",
 +    "(see the [TT].mdp[tt] file).[PAR]",
 +    "ED (essential dynamics) sampling and/or additional flooding potentials",
 +    "are switched on by using the [TT]-ei[tt] flag followed by an [TT].edi[tt]",
 +    "file. The [TT].edi[tt] file can be produced with the [TT]make_edi[tt] tool",
 +    "or by using options in the essdyn menu of the WHAT IF program.",
 +    "[TT]mdrun[tt] produces a [TT].xvg[tt] output file that",
 +    "contains projections of positions, velocities and forces onto selected",
 +    "eigenvectors.[PAR]",
 +    "When user-defined potential functions have been selected in the",
 +    "[TT].mdp[tt] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]",
 +    "a formatted table with potential functions. The file is read from",
 +    "either the current directory or from the [TT]GMXLIB[tt] directory.",
 +    "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,",
 +    "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with",
 +    "normal Coulomb.",
 +    "When pair interactions are present, a separate table for pair interaction",
 +    "functions is read using the [TT]-tablep[tt] option.[PAR]",
 +    "When tabulated bonded functions are present in the topology,",
 +    "interaction functions are read using the [TT]-tableb[tt] option.",
 +    "For each different tabulated interaction type the table file name is",
 +    "modified in a different way: before the file extension an underscore is",
 +    "appended, then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals",
 +    "and finally the table number of the interaction type.[PAR]",
 +    "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM",
 +    "coordinates and forces when pulling is selected",
 +    "in the [TT].mdp[tt] file.[PAR]",
 +    "With [TT]-multi[tt] or [TT]-multidir[tt], multiple systems can be ",
 +    "simulated in parallel.",
 +    "As many input files/directories are required as the number of systems. ",
 +    "The [TT]-multidir[tt] option takes a list of directories (one for each ",
 +    "system) and runs in each of them, using the input/output file names, ",
 +    "such as specified by e.g. the [TT]-s[tt] option, relative to these ",
 +    "directories.",
 +    "With [TT]-multi[tt], the system number is appended to the run input ",
 +    "and each output filename, for instance [TT]topol.tpr[tt] becomes",
 +    "[TT]topol0.tpr[tt], [TT]topol1.tpr[tt] etc.",
 +    "The number of nodes per system is the total number of nodes",
 +    "divided by the number of systems.",
 +    "One use of this option is for NMR refinement: when distance",
 +    "or orientation restraints are present these can be ensemble averaged",
 +    "over all the systems.[PAR]",
 +    "With [TT]-replex[tt] replica exchange is attempted every given number",
 +    "of steps. The number of replicas is set with the [TT]-multi[tt] or ",
 +    "[TT]-multidir[tt] option, described above.",
 +    "All run input files should use a different coupling temperature,",
 +    "the order of the files is not important. The random seed is set with",
 +    "[TT]-reseed[tt]. The velocities are scaled and neighbor searching",
 +    "is performed after every exchange.[PAR]",
 +    "Finally some experimental algorithms can be tested when the",
 +    "appropriate options have been given. Currently under",
 +    "investigation are: polarizability and X-ray bombardments.",
 +    "[PAR]",
 +    "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed",
 +    "a protein into a membrane. The data file should contain the options",
 +    "that where passed to g_membed before. The [TT]-mn[tt] and [TT]-mp[tt]",
 +    "both apply to this as well.",
 +    "[PAR]",
 +    "The option [TT]-pforce[tt] is useful when you suspect a simulation",
 +    "crashes due to too large forces. With this option coordinates and",
 +    "forces of atoms with a force larger than a certain value will",
 +    "be printed to stderr.",
 +    "[PAR]",
 +    "Checkpoints containing the complete state of the system are written",
 +    "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],",
 +    "unless option [TT]-cpt[tt] is set to -1.",
 +    "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to",
 +    "make sure that a recent state of the system is always available,",
 +    "even when the simulation is terminated while writing a checkpoint.",
 +    "With [TT]-cpnum[tt] all checkpoint files are kept and appended",
 +    "with the step number.",
 +    "A simulation can be continued by reading the full state from file",
 +    "with option [TT]-cpi[tt]. This option is intelligent in the way that",
 +    "if no checkpoint file is found, Gromacs just assumes a normal run and",
 +    "starts from the first step of the [TT].tpr[tt] file. By default the output",
 +    "will be appending to the existing output files. The checkpoint file",
 +    "contains checksums of all output files, such that you will never",
 +    "loose data when some output files are modified, corrupt or removed.",
 +    "There are three scenarios with [TT]-cpi[tt]:[PAR]",
 +    "[TT]*[tt] no files with matching names are present: new output files are written[PAR]",
 +    "[TT]*[tt] all files are present with names and checksums matching those stored",
 +    "in the checkpoint file: files are appended[PAR]",
 +    "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]",
 +    "With [TT]-noappend[tt] new output files are opened and the simulation",
 +    "part number is added to all output file names.",
 +    "Note that in all cases the checkpoint file itself is not renamed",
 +    "and will be overwritten, unless its name does not match",
 +    "the [TT]-cpo[tt] option.",
 +    "[PAR]",
 +    "With checkpointing the output is appended to previously written",
 +    "output files, unless [TT]-noappend[tt] is used or none of the previous",
 +    "output files are present (except for the checkpoint file).",
 +    "The integrity of the files to be appended is verified using checksums",
 +    "which are stored in the checkpoint file. This ensures that output can",
 +    "not be mixed up or corrupted due to file appending. When only some",
 +    "of the previous output files are present, a fatal error is generated",
 +    "and no old output files are modified and no new output files are opened.",
 +    "The result with appending will be the same as from a single run.",
 +    "The contents will be binary identical, unless you use a different number",
 +    "of nodes or dynamic load balancing or the FFT library uses optimizations",
 +    "through timing.",
 +    "[PAR]",
 +    "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint",
 +    "file is written at the first neighbor search step where the run time",
 +    "exceeds [TT]-maxh[tt]*0.99 hours.",
 +    "[PAR]",
 +    "When [TT]mdrun[tt] receives a TERM signal, it will set nsteps to the current",
 +    "step plus one. When [TT]mdrun[tt] receives an INT signal (e.g. when ctrl+C is",
 +    "pressed), it will stop after the next neighbor search step ",
 +    "(with nstlist=0 at the next step).",
 +    "In both cases all the usual output will be written to file.",
 +    "When running with MPI, a signal to one of the [TT]mdrun[tt] processes",
 +    "is sufficient, this signal should not be sent to mpirun or",
 +    "the [TT]mdrun[tt] process that is the parent of the others.",
 +    "[PAR]",
 +    "When [TT]mdrun[tt] is started with MPI, it does not run niced by default."
 +  };
 +  t_commrec    *cr;
 +  t_filenm fnm[] = {
 +    { efTPX, NULL,      NULL,       ffREAD },
 +    { efTRN, "-o",      NULL,       ffWRITE },
 +    { efXTC, "-x",      NULL,       ffOPTWR },
 +    { efCPT, "-cpi",    NULL,       ffOPTRD },
 +    { efCPT, "-cpo",    NULL,       ffOPTWR },
 +    { efSTO, "-c",      "confout",  ffWRITE },
 +    { efEDR, "-e",      "ener",     ffWRITE },
 +    { efLOG, "-g",      "md",       ffWRITE },
 +    { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
 +    { efXVG, "-field",  "field",    ffOPTWR },
 +    { efXVG, "-table",  "table",    ffOPTRD },
 +    { efXVG, "-tabletf", "tabletf",    ffOPTRD },
 +    { efXVG, "-tablep", "tablep",   ffOPTRD },
 +    { efXVG, "-tableb", "table",    ffOPTRD },
 +    { efTRX, "-rerun",  "rerun",    ffOPTRD },
 +    { efXVG, "-tpi",    "tpi",      ffOPTWR },
 +    { efXVG, "-tpid",   "tpidist",  ffOPTWR },
 +    { efEDI, "-ei",     "sam",      ffOPTRD },
 +    { efXVG, "-eo",     "edsam",    ffOPTWR },
 +    { efGCT, "-j",      "wham",     ffOPTRD },
 +    { efGCT, "-jo",     "bam",      ffOPTWR },
 +    { efXVG, "-ffout",  "gct",      ffOPTWR },
 +    { efXVG, "-devout", "deviatie", ffOPTWR },
 +    { efXVG, "-runav",  "runaver",  ffOPTWR },
 +    { efXVG, "-px",     "pullx",    ffOPTWR },
 +    { efXVG, "-pf",     "pullf",    ffOPTWR },
 +    { efXVG, "-ro",     "rotation", ffOPTWR },
 +    { efLOG, "-ra",     "rotangles",ffOPTWR },
 +    { efLOG, "-rs",     "rotslabs", ffOPTWR },
 +    { efLOG, "-rt",     "rottorque",ffOPTWR },
 +    { efMTX, "-mtx",    "nm",       ffOPTWR },
 +    { efNDX, "-dn",     "dipole",   ffOPTWR },
 +    { efRND, "-multidir",NULL,      ffOPTRDMULT},
 +    { efDAT, "-membed", "membed",   ffOPTRD },
 +    { efTOP, "-mp",     "membed",   ffOPTRD },
 +    { efNDX, "-mn",     "membed",   ffOPTRD }
 +  };
 +#define NFILE asize(fnm)
 +
 +  /* Command line options ! */
 +  gmx_bool bCart        = FALSE;
 +  gmx_bool bPPPME       = FALSE;
 +  gmx_bool bPartDec     = FALSE;
 +  gmx_bool bDDBondCheck = TRUE;
 +  gmx_bool bDDBondComm  = TRUE;
 +  gmx_bool bTunePME     = TRUE;
 +  gmx_bool bTestVerlet  = FALSE;
 +  gmx_bool bVerbose     = FALSE;
 +  gmx_bool bCompact     = TRUE;
 +  gmx_bool bSepPot      = FALSE;
 +  gmx_bool bRerunVSite  = FALSE;
 +  gmx_bool bIonize      = FALSE;
 +  gmx_bool bConfout     = TRUE;
 +  gmx_bool bReproducible = FALSE;
 +    
 +  int  npme=-1;
 +  int  nmultisim=0;
 +  int  nstglobalcomm=-1;
 +  int  repl_ex_nst=0;
 +  int  repl_ex_seed=-1;
 +  int  repl_ex_nex=0;
 +  int  nstepout=100;
 +  int  resetstep=-1;
 +  int  nsteps=-2; /* the value -2 means that the mdp option will be used */
 +  
 +  rvec realddxyz={0,0,0};
 +  const char *ddno_opt[ddnoNR+1] =
 +    { NULL, "interleave", "pp_pme", "cartesian", NULL };
 +  const char *dddlb_opt[] =
 +    { NULL, "auto", "no", "yes", NULL };
++  const char *thread_aff_opt[threadaffNR+1] =
++    { NULL, "auto", "on", "off", NULL };
 +  const char *nbpu_opt[] =
 +    { NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL };
 +  real rdd=0.0,rconstr=0.0,dlb_scale=0.8,pforce=-1;
 +  char *ddcsx=NULL,*ddcsy=NULL,*ddcsz=NULL;
 +  real cpt_period=15.0,max_hours=-1;
 +  gmx_bool bAppendFiles=TRUE;
 +  gmx_bool bKeepAndNumCPT=FALSE;
 +  gmx_bool bResetCountersHalfWay=FALSE;
 +  output_env_t oenv=NULL;
 +  const char *deviceOptions = "";
 +
-   gmx_hw_opt_t hw_opt={0,0,0,0,TRUE,FALSE,0,NULL};
++  gmx_hw_opt_t hw_opt={0,0,0,0,threadaffSEL,0,0,NULL};
 +
 +  t_pargs pa[] = {
 +
 +    { "-pd",      FALSE, etBOOL,{&bPartDec},
 +      "Use particle decompostion" },
 +    { "-dd",      FALSE, etRVEC,{&realddxyz},
 +      "Domain decomposition grid, 0 is optimize" },
 +    { "-ddorder", FALSE, etENUM, {ddno_opt},
 +      "DD node order" },
 +    { "-npme",    FALSE, etINT, {&npme},
 +      "Number of separate nodes to be used for PME, -1 is guess" },
 +    { "-nt",      FALSE, etINT, {&hw_opt.nthreads_tot},
 +      "Total number of threads to start (0 is guess)" },
 +    { "-ntmpi",   FALSE, etINT, {&hw_opt.nthreads_tmpi},
 +      "Number of thread-MPI threads to start (0 is guess)" },
 +    { "-ntomp",   FALSE, etINT, {&hw_opt.nthreads_omp},
 +      "Number of OpenMP threads per MPI process/thread to start (0 is guess)" },
 +    { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
 +      "Number of OpenMP threads per MPI process/thread to start (0 is -ntomp)" },
-     { "-pin",     FALSE, etBOOL, {&hw_opt.bThreadPinning},
-       "Pin OpenMP threads to cores" },
-     { "-pinht",   FALSE, etBOOL, {&hw_opt.bPinHyperthreading},
-       "Always pin threads to Hyper-Threading cores" },
++    { "-pin",     FALSE, etENUM, {thread_aff_opt},
++      "Fix threads (or processes) to specific cores" },
 +    { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset},
-       "Core offset for pinning (for running multiple mdrun processes on a single physical node)" },
++      "The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" },
++    { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride},
++      "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" },
 +    { "-gpu_id",  FALSE, etSTR, {&hw_opt.gpu_id},
 +      "List of GPU id's to use" },
 +    { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck},
 +      "Check for all bonded interactions with DD" },
 +    { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm},
 +      "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" },
 +    { "-rdd",     FALSE, etREAL, {&rdd},
 +      "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" },
 +    { "-rcon",    FALSE, etREAL, {&rconstr},
 +      "Maximum distance for P-LINCS (nm), 0 is estimate" },
 +    { "-dlb",     FALSE, etENUM, {dddlb_opt},
 +      "Dynamic load balancing (with DD)" },
 +    { "-dds",     FALSE, etREAL, {&dlb_scale},
 +      "Minimum allowed dlb scaling of the DD cell size" },
 +    { "-ddcsx",   FALSE, etSTR, {&ddcsx},
 +      "HIDDENThe DD cell sizes in x" },
 +    { "-ddcsy",   FALSE, etSTR, {&ddcsy},
 +      "HIDDENThe DD cell sizes in y" },
 +    { "-ddcsz",   FALSE, etSTR, {&ddcsz},
 +      "HIDDENThe DD cell sizes in z" },
 +    { "-gcom",    FALSE, etINT,{&nstglobalcomm},
 +      "Global communication frequency" },
 +    { "-nb",      FALSE, etENUM, {&nbpu_opt},
 +      "Calculate non-bonded interactions on" },
 +    { "-tunepme", FALSE, etBOOL, {&bTunePME},  
 +      "Optimize PME load between PP/PME nodes or GPU/CPU" },
 +    { "-testverlet", FALSE, etBOOL, {&bTestVerlet},
 +      "Test the Verlet non-bonded scheme" },
 +    { "-v",       FALSE, etBOOL,{&bVerbose},  
 +      "Be loud and noisy" },
 +    { "-compact", FALSE, etBOOL,{&bCompact},  
 +      "Write a compact log file" },
 +    { "-seppot",  FALSE, etBOOL, {&bSepPot},
 +      "Write separate V and dVdl terms for each interaction type and node to the log file(s)" },
 +    { "-pforce",  FALSE, etREAL, {&pforce},
 +      "Print all forces larger than this (kJ/mol nm)" },
 +    { "-reprod",  FALSE, etBOOL,{&bReproducible},  
 +      "Try to avoid optimizations that affect binary reproducibility" },
 +    { "-cpt",     FALSE, etREAL, {&cpt_period},
 +      "Checkpoint interval (minutes)" },
 +    { "-cpnum",   FALSE, etBOOL, {&bKeepAndNumCPT},
 +      "Keep and number checkpoint files" },
 +    { "-append",  FALSE, etBOOL, {&bAppendFiles},
 +      "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
 +    { "-nsteps",  FALSE, etINT, {&nsteps},
 +      "Run this number of steps, overrides .mdp file option" },
 +    { "-maxh",   FALSE, etREAL, {&max_hours},
 +      "Terminate after 0.99 times this time (hours)" },
 +    { "-multi",   FALSE, etINT,{&nmultisim}, 
 +      "Do multiple simulations in parallel" },
 +    { "-replex",  FALSE, etINT, {&repl_ex_nst}, 
 +      "Attempt replica exchange periodically with this period (steps)" },
 +    { "-nex",  FALSE, etINT, {&repl_ex_nex},
 +      "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion).  -nex zero or not specified gives neighbor replica exchange." },
 +    { "-reseed",  FALSE, etINT, {&repl_ex_seed}, 
 +      "Seed for replica exchange, -1 is generate a seed" },
 +    { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite},
 +      "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" },
 +    { "-ionize",  FALSE, etBOOL,{&bIonize},
 +      "Do a simulation including the effect of an X-Ray bombardment on your system" },
 +    { "-confout", FALSE, etBOOL, {&bConfout},
 +      "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" },
 +    { "-stepout", FALSE, etINT, {&nstepout},
 +      "HIDDENFrequency of writing the remaining runtime" },
 +    { "-resetstep", FALSE, etINT, {&resetstep},
 +      "HIDDENReset cycle counters after these many time steps" },
 +    { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
 +      "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" }
 +  };
 +  gmx_edsam_t  ed;
 +  unsigned long Flags, PCA_Flags;
 +  ivec     ddxyz;
 +  int      dd_node_order;
 +  gmx_bool     bAddPart;
 +  FILE     *fplog,*fpmulti;
 +  int      sim_part,sim_part_fn;
 +  const char *part_suffix=".part";
 +  char     suffix[STRLEN];
 +  int      rc;
 +  char **multidir=NULL;
 +
 +
 +  cr = init_par(&argc,&argv);
 +
 +  if (MASTER(cr))
 +    CopyRight(stderr, argv[0]);
 +
 +  PCA_Flags = (PCA_CAN_SET_DEFFNM | (MASTER(cr) ? 0 : PCA_QUIET));
 +  
 +  /* Comment this in to do fexist calls only on master
 +   * works not with rerun or tables at the moment
 +   * also comment out the version of init_forcerec in md.c 
 +   * with NULL instead of opt2fn
 +   */
 +  /*
 +     if (!MASTER(cr))
 +     {
 +     PCA_Flags |= PCA_NOT_READ_NODE;
 +     }
 +     */
 +
 +  parse_common_args(&argc,argv,PCA_Flags, NFILE,fnm,asize(pa),pa,
 +                    asize(desc),desc,0,NULL, &oenv);
 +
 +
- 
 +  /* we set these early because they might be used in init_multisystem() 
 +     Note that there is the potential for npme>nnodes until the number of
 +     threads is set later on, if there's thread parallelization. That shouldn't
 +     lead to problems. */ 
 +  dd_node_order = nenum(ddno_opt);
 +  cr->npmenodes = npme;
 +
++  hw_opt.thread_affinity = nenum(thread_aff_opt);
++
 +  /* now check the -multi and -multidir option */
 +  if (opt2bSet("-multidir", NFILE, fnm))
 +  {
 +      int i;
 +      if (nmultisim > 0)
 +      {
 +          gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive.");
 +      }
 +      nmultisim = opt2fns(&multidir, "-multidir", NFILE, fnm);
 +  }
 +
 +
 +  if (repl_ex_nst != 0 && nmultisim < 2)
 +      gmx_fatal(FARGS,"Need at least two replicas for replica exchange (option -multi)");
 +
 +  if (repl_ex_nex < 0)
 +      gmx_fatal(FARGS,"Replica exchange number of exchanges needs to be positive");
 +
 +  if (nmultisim > 1) {
 +#ifndef GMX_THREAD_MPI
 +    gmx_bool bParFn = (multidir == NULL);
 +    init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn);
 +#else
 +    gmx_fatal(FARGS,"mdrun -multi is not supported with the thread library.Please compile GROMACS with MPI support");
 +#endif
 +  }
 +
 +  bAddPart = !bAppendFiles;
 +
 +  /* Check if there is ANY checkpoint file available */	
 +  sim_part    = 1;
 +  sim_part_fn = sim_part;
 +  if (opt2bSet("-cpi",NFILE,fnm))
 +  {
 +      if (bSepPot && bAppendFiles)
 +      {
 +          gmx_fatal(FARGS,"Output file appending is not supported with -seppot");
 +      }
 +
 +      bAppendFiles =
 +                read_checkpoint_simulation_part(opt2fn_master("-cpi", NFILE,
 +                                                              fnm,cr),
 +                                                &sim_part_fn,NULL,cr,
 +                                                bAppendFiles,NFILE,fnm,
 +                                                part_suffix,&bAddPart);
 +      if (sim_part_fn==0 && MULTIMASTER(cr))
 +      {
 +          fprintf(stdout,"No previous checkpoint file present, assuming this is a new run.\n");
 +      }
 +      else
 +      {
 +          sim_part = sim_part_fn + 1;
 +      }
 +
 +      if (MULTISIM(cr) && MASTER(cr))
 +      {
 +          if (MULTIMASTER(cr))
 +          {
 +              /* Log file is not yet available, so if there's a
 +               * problem we can only write to stderr. */
 +              fpmulti = stderr;
 +          }
 +          else
 +          {
 +              fpmulti = NULL;
 +          }
 +          check_multi_int(fpmulti,cr->ms,sim_part,"simulation part",TRUE);
 +      }
 +  } 
 +  else
 +  {
 +      bAppendFiles = FALSE;
 +  }
 +
 +  if (!bAppendFiles)
 +  {
 +      sim_part_fn = sim_part;
 +  }
 +
 +  if (bAddPart)
 +  {
 +      /* Rename all output files (except checkpoint files) */
 +      /* create new part name first (zero-filled) */
 +      sprintf(suffix,"%s%04d",part_suffix,sim_part_fn);
 +
 +      add_suffix_to_output_names(fnm,NFILE,suffix);
 +      if (MULTIMASTER(cr))
 +      {
 +          fprintf(stdout,"Checkpoint file is from part %d, new output files will be suffixed '%s'.\n",sim_part-1,suffix);
 +      }
 +  }
 +
 +  Flags = opt2bSet("-rerun",NFILE,fnm) ? MD_RERUN : 0;
 +  Flags = Flags | (bSepPot       ? MD_SEPPOT       : 0);
 +  Flags = Flags | (bIonize       ? MD_IONIZE       : 0);
 +  Flags = Flags | (bPartDec      ? MD_PARTDEC      : 0);
 +  Flags = Flags | (bDDBondCheck  ? MD_DDBONDCHECK  : 0);
 +  Flags = Flags | (bDDBondComm   ? MD_DDBONDCOMM   : 0);
 +  Flags = Flags | (bTunePME      ? MD_TUNEPME      : 0);
 +  Flags = Flags | (bTestVerlet   ? MD_TESTVERLET   : 0);
 +  Flags = Flags | (bConfout      ? MD_CONFOUT      : 0);
 +  Flags = Flags | (bRerunVSite   ? MD_RERUN_VSITE  : 0);
 +  Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0);
 +  Flags = Flags | (bAppendFiles  ? MD_APPENDFILES  : 0); 
 +  Flags = Flags | (opt2parg_bSet("-append", asize(pa),pa) ? MD_APPENDFILESSET : 0); 
 +  Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0); 
 +  Flags = Flags | (sim_part>1    ? MD_STARTFROMCPT : 0); 
 +  Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0);
 +
 +
 +  /* We postpone opening the log file if we are appending, so we can 
 +     first truncate the old log file and append to the correct position 
 +     there instead.  */
 +  if ((MASTER(cr) || bSepPot) && !bAppendFiles) 
 +  {
 +      gmx_log_open(ftp2fn(efLOG,NFILE,fnm),cr,
 +                   !bSepPot,Flags & MD_APPENDFILES,&fplog);
 +      CopyRight(fplog,argv[0]);
 +      please_cite(fplog,"Hess2008b");
 +      please_cite(fplog,"Spoel2005a");
 +      please_cite(fplog,"Lindahl2001a");
 +      please_cite(fplog,"Berendsen95a");
 +  }
 +  else if (!MASTER(cr) && bSepPot)
 +  {
 +      gmx_log_open(ftp2fn(efLOG,NFILE,fnm),cr,!bSepPot,Flags,&fplog);
 +  }
 +  else
 +  {
 +      fplog = NULL;
 +  }
 +
 +  ddxyz[XX] = (int)(realddxyz[XX] + 0.5);
 +  ddxyz[YY] = (int)(realddxyz[YY] + 0.5);
 +  ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5);
 +
 +  rc = mdrunner(&hw_opt, fplog,cr,NFILE,fnm,oenv,bVerbose,bCompact,
 +                nstglobalcomm, ddxyz,dd_node_order,rdd,rconstr,
 +                dddlb_opt[0],dlb_scale,ddcsx,ddcsy,ddcsz,
 +                nbpu_opt[0],
 +                nsteps,nstepout,resetstep,
 +                nmultisim,repl_ex_nst,repl_ex_nex,repl_ex_seed,
 +                pforce, cpt_period,max_hours,deviceOptions,Flags);
 +
 +  gmx_finalize_par();
 +
 +  if (MULTIMASTER(cr)) {
 +      thanx(stderr);
 +  }
 +
 +  /* Log file has to be closed in mdrunner if we are appending to it 
 +     (fplog not set here) */
 +  if (MASTER(cr) && !bAppendFiles) 
 +  {
 +      gmx_log_close(fplog);
 +  }
 +
 +  return rc;
 +}
 +
diff --cc src/programs/mdrun/runner.c
index 40b48cd691,0000000000..6b712de3d3
mode 100644,000000..100644
--- a/src/programs/mdrun/runner.c
+++ b/src/programs/mdrun/runner.c
@@@ -1,1990 -1,0 +1,1683 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
- #if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
- #define _GNU_SOURCE
- #include <sched.h>
- #include <sys/syscall.h>
- #endif
 +#include <signal.h>
 +#include <stdlib.h>
 +#ifdef HAVE_UNISTD_H
 +#include <unistd.h>
 +#endif
 +#include <string.h>
 +#include <assert.h>
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "statutil.h"
 +#include "mdrun.h"
 +#include "md_logging.h"
 +#include "md_support.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "names.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "tpxio.h"
 +#include "txtdump.h"
 +#include "gmx_detect_hardware.h"
 +#include "gmx_omp_nthreads.h"
 +#include "pull_rotation.h"
 +#include "calc_verletbuf.h"
 +#include "../mdlib/nbnxn_search.h"
 +#include "../mdlib/nbnxn_consts.h"
 +#include "gmx_fatal_collective.h"
 +#include "membed.h"
 +#include "macros.h"
 +#include "gmx_omp.h"
- 
- #include "thread_mpi/threads.h"
++#include "gmx_thread_affinity.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +typedef struct { 
 +    gmx_integrator_t *func;
 +} gmx_intp_t;
 +
 +/* The array should match the eI array in include/types/enums.h */
 +const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
 +
 +gmx_large_int_t     deform_init_init_step_tpx;
 +matrix              deform_init_box_tpx;
 +#ifdef GMX_THREAD_MPI
 +tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
 +#endif
 +
 +
 +#ifdef GMX_THREAD_MPI
 +struct mdrunner_arglist
 +{
 +    gmx_hw_opt_t *hw_opt;
 +    FILE *fplog;
 +    t_commrec *cr;
 +    int nfile;
 +    const t_filenm *fnm;
 +    output_env_t oenv;
 +    gmx_bool bVerbose;
 +    gmx_bool bCompact;
 +    int nstglobalcomm;
 +    ivec ddxyz;
 +    int dd_node_order;
 +    real rdd;
 +    real rconstr;
 +    const char *dddlb_opt;
 +    real dlb_scale;
 +    const char *ddcsx;
 +    const char *ddcsy;
 +    const char *ddcsz;
 +    const char *nbpu_opt;
 +    int nsteps_cmdline;
 +    int nstepout;
 +    int resetstep;
 +    int nmultisim;
 +    int repl_ex_nst;
 +    int repl_ex_nex;
 +    int repl_ex_seed;
 +    real pforce;
 +    real cpt_period;
 +    real max_hours;
 +    const char *deviceOptions;
 +    unsigned long Flags;
 +    int ret; /* return value */
 +};
 +
 +
 +/* The function used for spawning threads. Extracts the mdrunner() 
 +   arguments from its one argument and calls mdrunner(), after making
 +   a commrec. */
 +static void mdrunner_start_fn(void *arg)
 +{
 +    struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
 +    struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure 
 +                                        that it's thread-local. This doesn't
 +                                        copy pointed-to items, of course,
 +                                        but those are all const. */
 +    t_commrec *cr;  /* we need a local version of this */
 +    FILE *fplog=NULL;
 +    t_filenm *fnm;
 +
 +    fnm = dup_tfn(mc.nfile, mc.fnm);
 +
 +    cr = init_par_threads(mc.cr);
 +
 +    if (MASTER(cr))
 +    {
 +        fplog=mc.fplog;
 +    }
 +
 +    mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv, 
 +                      mc.bVerbose, mc.bCompact, mc.nstglobalcomm, 
 +                      mc.ddxyz, mc.dd_node_order, mc.rdd,
 +                      mc.rconstr, mc.dddlb_opt, mc.dlb_scale, 
 +                      mc.ddcsx, mc.ddcsy, mc.ddcsz,
 +                      mc.nbpu_opt,
 +                      mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
 +                      mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce, 
 +                      mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
 +}
 +
 +/* called by mdrunner() to start a specific number of threads (including 
 +   the main thread) for thread-parallel runs. This in turn calls mdrunner()
 +   for each thread. 
 +   All options besides nthreads are the same as for mdrunner(). */
 +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt, 
 +              FILE *fplog,t_commrec *cr,int nfile, 
 +              const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +              gmx_bool bCompact, int nstglobalcomm,
 +              ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +              const char *dddlb_opt,real dlb_scale,
 +              const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +              const char *nbpu_opt,
 +              int nsteps_cmdline, int nstepout,int resetstep,
 +              int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
 +              real pforce,real cpt_period, real max_hours, 
 +              const char *deviceOptions, unsigned long Flags)
 +{
 +    int ret;
 +    struct mdrunner_arglist *mda;
 +    t_commrec *crn; /* the new commrec */
 +    t_filenm *fnmn;
 +
 +    /* first check whether we even need to start tMPI */
 +    if (hw_opt->nthreads_tmpi < 2)
 +    {
 +        return cr;
 +    }
 +
 +    /* a few small, one-time, almost unavoidable memory leaks: */
 +    snew(mda,1);
 +    fnmn=dup_tfn(nfile, fnm);
 +
 +    /* fill the data structure to pass as void pointer to thread start fn */
 +    mda->hw_opt=hw_opt;
 +    mda->fplog=fplog;
 +    mda->cr=cr;
 +    mda->nfile=nfile;
 +    mda->fnm=fnmn;
 +    mda->oenv=oenv;
 +    mda->bVerbose=bVerbose;
 +    mda->bCompact=bCompact;
 +    mda->nstglobalcomm=nstglobalcomm;
 +    mda->ddxyz[XX]=ddxyz[XX];
 +    mda->ddxyz[YY]=ddxyz[YY];
 +    mda->ddxyz[ZZ]=ddxyz[ZZ];
 +    mda->dd_node_order=dd_node_order;
 +    mda->rdd=rdd;
 +    mda->rconstr=rconstr;
 +    mda->dddlb_opt=dddlb_opt;
 +    mda->dlb_scale=dlb_scale;
 +    mda->ddcsx=ddcsx;
 +    mda->ddcsy=ddcsy;
 +    mda->ddcsz=ddcsz;
 +    mda->nbpu_opt=nbpu_opt;
 +    mda->nsteps_cmdline=nsteps_cmdline;
 +    mda->nstepout=nstepout;
 +    mda->resetstep=resetstep;
 +    mda->nmultisim=nmultisim;
 +    mda->repl_ex_nst=repl_ex_nst;
 +    mda->repl_ex_nex=repl_ex_nex;
 +    mda->repl_ex_seed=repl_ex_seed;
 +    mda->pforce=pforce;
 +    mda->cpt_period=cpt_period;
 +    mda->max_hours=max_hours;
 +    mda->deviceOptions=deviceOptions;
 +    mda->Flags=Flags;
 +
 +    /* now spawn new threads that start mdrunner_start_fn(), while 
-        the main thread returns */
-     ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
-                      (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
++       the main thread returns, we set thread affinity later */
++    ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
 +                     mdrunner_start_fn, (void*)(mda) );
 +    if (ret!=TMPI_SUCCESS)
 +        return NULL;
 +
 +    /* make a new comm_rec to reflect the new situation */
 +    crn=init_par_threads(cr);
 +    return crn;
 +}
 +
 +
 +static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
 +                                        const gmx_hw_opt_t *hw_opt,
 +                                        int nthreads_tot,
 +                                        int ngpu)
 +{
 +    int nthreads_tmpi;
 +
 +    /* There are no separate PME nodes here, as we ensured in
 +     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
 +     * and a conditional ensures we would not have ended up here.
 +     * Note that separate PME nodes might be switched on later.
 +     */
 +    if (ngpu > 0)
 +    {
 +        nthreads_tmpi = ngpu;
 +        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
 +        {
 +            nthreads_tmpi = nthreads_tot;
 +        }
 +    }
 +    else if (hw_opt->nthreads_omp > 0)
 +    {
 +        /* Here we could oversubscribe, when we do, we issue a warning later */
 +        nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
 +    }
 +    else
 +    {
 +        /* TODO choose nthreads_omp based on hardware topology
 +           when we have a hardware topology detection library */
 +        /* In general, when running up to 4 threads, OpenMP should be faster.
 +         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
 +         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
 +         * even on two CPUs it's usually faster (but with many OpenMP threads
 +         * it could be faster not to use HT, currently we always use HT).
 +         * On Nehalem/Westmere we want to avoid running 16 threads over
 +         * two CPUs with HT, so we need a limit<16; thus we use 12.
 +         * A reasonable limit for Intel Sandy and Ivy bridge,
 +         * not knowing the topology, is 16 threads.
 +         */
 +        const int nthreads_omp_always_faster             =  4;
 +        const int nthreads_omp_always_faster_Nehalem     = 12;
 +        const int nthreads_omp_always_faster_SandyBridge = 16;
 +        const int first_model_Nehalem     = 0x1A;
 +        const int first_model_SandyBridge = 0x2A;
 +        gmx_bool bIntel_Family6;
 +
 +        bIntel_Family6 =
 +            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
 +             gmx_cpuid_family(hwinfo->cpuid_info) == 6);
 +
 +        if (nthreads_tot <= nthreads_omp_always_faster ||
 +            (bIntel_Family6 &&
 +             ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
 +              (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
 +        {
 +            /* Use pure OpenMP parallelization */
 +            nthreads_tmpi = 1;
 +        }
 +        else
 +        {
 +            /* Don't use OpenMP parallelization */
 +            nthreads_tmpi = nthreads_tot;
 +        }
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +
 +
 +/* Get the number of threads to use for thread-MPI based on how many
 + * were requested, which algorithms we're using,
 + * and how many particles there are.
 + * At the point we have already called check_and_update_hw_opt.
 + * Thus all options should be internally consistent and consistent
 + * with the hardware, except that ntmpi could be larger than #GPU.
 + */
 +static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
 +                            gmx_hw_opt_t *hw_opt,
 +                            t_inputrec *inputrec, gmx_mtop_t *mtop,
 +                            const t_commrec *cr,
 +                            FILE *fplog)
 +{
 +    int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
 +    int min_atoms_per_mpi_thread;
 +    char *env;
 +    char sbuf[STRLEN];
 +    gmx_bool bCanUseGPU;
 +
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        /* Trivial, return right away */
 +        return hw_opt->nthreads_tmpi;
 +    }
 +
 +    nthreads_hw = hwinfo->nthreads_hw_avail;
 +
 +    /* How many total (#tMPI*#OpenMP) threads can we start? */ 
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        nthreads_tot_max = hw_opt->nthreads_tot;
 +    }
 +    else
 +    {
 +        nthreads_tot_max = nthreads_hw;
 +    }
 +
 +    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
 +    if (bCanUseGPU)
 +    {
 +        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +    }
 +    else
 +    {
 +        ngpu = 0;
 +    }
 +
 +    nthreads_tmpi =
 +        get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
 +
 +    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
 +    {
 +        /* Steps are divided over the nodes iso splitting the atoms */
 +        min_atoms_per_mpi_thread = 0;
 +    }
 +    else
 +    {
 +        if (bCanUseGPU)
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
 +        }
 +        else
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
 +        }
 +    }
 +
 +    /* Check if an algorithm does not support parallel simulation.  */
 +    if (nthreads_tmpi != 1 &&
 +        ( inputrec->eI == eiLBFGS ||
 +          inputrec->coulombtype == eelEWALD ) )
 +    {
 +        nthreads_tmpi = 1;
 +
 +        md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
 +        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
 +        {
 +            gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
 +        }
 +    }
 +    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
 +    {
 +        /* the thread number was chosen automatically, but there are too many
 +           threads (too few atoms per thread) */
 +        nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
 +
 +        /* Avoid partial use of Hyper-Threading */
 +        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
 +            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
 +        {
 +            nthreads_new = nthreads_hw/2;
 +        }
 +
 +        /* Avoid large prime numbers in the thread count */
 +        if (nthreads_new >= 6)
 +        {
 +            /* Use only 6,8,10 with additional factors of 2 */
 +            int fac;
 +
 +            fac = 2;
 +            while (3*fac*2 <= nthreads_new)
 +            {
 +                fac *= 2;
 +            }
 +
 +            nthreads_new = (nthreads_new/fac)*fac;
 +        }
 +        else
 +        {
 +            /* Avoid 5 */
 +            if (nthreads_new == 5)
 +            {
 +                nthreads_new = 4;
 +            }
 +        }
 +
 +        nthreads_tmpi = nthreads_new;
 +
 +        fprintf(stderr,"\n");
 +        fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
 +        fprintf(stderr,"      only starting %d thread-MPI threads.\n",nthreads_tmpi);
 +        fprintf(stderr,"      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +#endif /* GMX_THREAD_MPI */
 +
 +
 +/* Environment variable for setting nstlist */
 +static const char*  NSTLIST_ENVVAR          =  "GMX_NSTLIST";
 +/* Try to increase nstlist when using a GPU with nstlist less than this */
 +static const int    NSTLIST_GPU_ENOUGH      = 20;
 +/* Increase nstlist until the non-bonded cost increases more than this factor */
 +static const float  NBNXN_GPU_LIST_OK_FAC   = 1.25;
 +/* Don't increase nstlist beyond a non-bonded cost increases of this factor */
 +static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.40;
 +
 +/* Try to increase nstlist when running on a GPU */
 +static void increase_nstlist(FILE *fp,t_commrec *cr,
 +                             t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
 +{
 +    char *env;
 +    int  nstlist_orig,nstlist_prev;
 +    verletbuf_list_setup_t ls;
 +    real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
 +    int  i;
 +    t_state state_tmp;
 +    gmx_bool bBox,bDD,bCont;
 +    const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
 +    const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
 +    const char *box_err="Can not increase nstlist for GPU run because the box is too small";
 +    const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
 +    char buf[STRLEN];
 +
 +    /* Number of + nstlist alternative values to try when switching  */
 +    const int nstl[]={ 20, 25, 40, 50 };
 +#define NNSTL  sizeof(nstl)/sizeof(nstl[0])
 +
 +    env = getenv(NSTLIST_ENVVAR);
 +    if (env == NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,nstl_fmt,ir->nstlist);
 +        }
 +    }
 +
 +    if (ir->verletbuf_drift == 0)
 +    {
 +        gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
 +    }
 +
 +    if (ir->verletbuf_drift < 0)
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n",vbd_err);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n",vbd_err);
 +        }
 +
 +        return;
 +    }
 +
 +    nstlist_orig = ir->nstlist;
 +    if (env != NULL)
 +    {
 +        sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n",buf);
 +        }
 +        sscanf(env,"%d",&ir->nstlist);
 +    }
 +
 +    verletbuf_get_list_setup(TRUE,&ls);
 +
 +    /* Allow rlist to make the list double the size of the cut-off sphere */
 +    rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
 +    rlist_ok  = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
 +    rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
 +    if (debug)
 +    {
 +        fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
 +                rlist_inc,rlist_max);
 +    }
 +
 +    i = 0;
 +    nstlist_prev = nstlist_orig;
 +    rlist_prev   = ir->rlist;
 +    do
 +    {
 +        if (env == NULL)
 +        {
 +            ir->nstlist = nstl[i];
 +        }
 +
 +        /* Set the pair-list buffer size in ir */
 +        calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
 +                                NULL,&rlist_new);
 +
 +        /* Does rlist fit in the box? */
 +        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
 +        bDD  = TRUE;
 +        if (bBox && DOMAINDECOMP(cr))
 +        {
 +            /* Check if rlist fits in the domain decomposition */
 +            if (inputrec2nboundeddim(ir) < DIM)
 +            {
 +                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
 +            }
 +            copy_mat(box,state_tmp.box);
 +            bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
 +        }
 +
 +        bCont = FALSE;
 +
 +        if (env == NULL)
 +        {
 +            if (bBox && bDD && rlist_new <= rlist_max)
 +            {
 +                /* Increase nstlist */
 +                nstlist_prev = ir->nstlist;
 +                rlist_prev   = rlist_new;
 +                bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
 +            }
 +            else
 +            {
 +                /* Stick with the previous nstlist */
 +                ir->nstlist = nstlist_prev;
 +                rlist_new   = rlist_prev;
 +                bBox = TRUE;
 +                bDD  = TRUE;
 +            }
 +        }
 +
 +        i++;
 +    }
 +    while (bCont);
 +
 +    if (!bBox || !bDD)
 +    {
 +        gmx_warning(!bBox ? box_err : dd_err);
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
 +        }
 +        ir->nstlist = nstlist_orig;
 +    }
 +    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
 +    {
 +        sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
 +                nstlist_orig,ir->nstlist,
 +                ir->rlist,rlist_new);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n\n",buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n\n",buf);
 +        }
 +        ir->rlist     = rlist_new;
 +        ir->rlistlong = rlist_new;
 +    }
 +}
 +
 +static void prepare_verlet_scheme(FILE *fplog,
 +                                  gmx_hw_info_t *hwinfo,
 +                                  t_commrec *cr,
 +                                  gmx_hw_opt_t *hw_opt,
 +                                  const char *nbpu_opt,
 +                                  t_inputrec *ir,
 +                                  const gmx_mtop_t *mtop,
 +                                  matrix box,
 +                                  gmx_bool *bUseGPU)
 +{
 +    /* Here we only check for GPU usage on the MPI master process,
 +     * as here we don't know how many GPUs we will use yet.
 +     * We check for a GPU on all processes later.
 +     */
 +    *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
 +
 +    if (ir->verletbuf_drift > 0)
 +    {
 +        /* Update the Verlet buffer size for the current run setup */
 +        verletbuf_list_setup_t ls;
 +        real rlist_new;
 +
 +        /* Here we assume CPU acceleration is on. But as currently
 +         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
 +         * and 4x2 gives a larger buffer than 4x4, this is ok.
 +         */
 +        verletbuf_get_list_setup(*bUseGPU,&ls);
 +
 +        calc_verlet_buffer_size(mtop,det(box),ir,
 +                                ir->verletbuf_drift,&ls,
 +                                NULL,&rlist_new);
 +        if (rlist_new != ir->rlist)
 +        {
 +            if (fplog != NULL)
 +            {
 +                fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 +                        ir->rlist,rlist_new,
 +                        ls.cluster_size_i,ls.cluster_size_j);
 +            }
 +            ir->rlist     = rlist_new;
 +            ir->rlistlong = rlist_new;
 +        }
 +    }
 +
 +    /* With GPU or emulation we should check nstlist for performance */
 +    if ((EI_DYNAMICS(ir->eI) &&
 +         *bUseGPU &&
 +         ir->nstlist < NSTLIST_GPU_ENOUGH) ||
 +        getenv(NSTLIST_ENVVAR) != NULL)
 +    {
 +        /* Choose a better nstlist */
 +        increase_nstlist(fplog,cr,ir,mtop,box);
 +    }
 +}
 +
 +static void convert_to_verlet_scheme(FILE *fplog,
 +                                     t_inputrec *ir,
 +                                     gmx_mtop_t *mtop,real box_vol)
 +{
 +    char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
 +
 +    md_print_warn(NULL,fplog,"%s\n",conv_mesg);
 +
 +    ir->cutoff_scheme   = ecutsVERLET;
 +    ir->verletbuf_drift = 0.005;
 +
 +    if (ir->rcoulomb != ir->rvdw)
 +    {
 +        gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
 +    }
 +
 +    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
 +    {
 +        gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
 +    }
 +    else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
 +    {
 +        md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
 +
 +        if (EVDW_SWITCHED(ir->vdwtype))
 +        {
 +            ir->vdwtype = evdwCUT;
 +        }
 +        if (EEL_SWITCHED(ir->coulombtype))
 +        {
 +            if (EEL_FULL(ir->coulombtype))
 +            {
 +                /* With full electrostatic only PME can be switched */
 +                ir->coulombtype = eelPME;
 +            }
 +            else
 +            {
 +                md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
 +                ir->coulombtype = eelRF;
 +                ir->epsilon_rf  = 0.0;
 +            }
 +        }
 +
 +        /* We set the target energy drift to a small number.
 +         * Note that this is only for testing. For production the user
 +         * should think about this and set the mdp options.
 +         */
 +        ir->verletbuf_drift = 1e-4;
 +    }
 +
 +    if (inputrec2nboundeddim(ir) != 3)
 +    {
 +        gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
 +    }
 +
 +    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
 +    {
 +        gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
 +    }
 +
 +    if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
 +    {
 +        verletbuf_list_setup_t ls;
 +
 +        verletbuf_get_list_setup(FALSE,&ls);
 +        calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
 +                                NULL,&ir->rlist);
 +    }
 +    else
 +    {
 +        ir->verletbuf_drift = -1;
 +        ir->rlist           = 1.05*max(ir->rvdw,ir->rcoulomb);
 +    }
 +
 +    gmx_mtop_remove_chargegroups(mtop);
 +}
 +
- /* Check the process affinity mask. If it is non-zero, something
-  * else has set the affinity, and mdrun should honor that and
-  * not attempt to do its own thread pinning.
-  *
-  * This function should be called twice. Once before the OpenMP
-  * library gets initialized with bAfterOpenMPInit=FALSE (which will
-  * detect affinity set by external tools like taskset), and again
-  * later, after the OpenMP initialization, with bAfterOpenMPInit=TRUE
-  * (which will detect affinity changes made by the OpenMP library).
-  *
-  * Note that this will only work on Linux, because we use a GNU
-  * feature. */
- static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
-                                    gmx_hw_opt_t *hw_opt, int ncpus,
-                                    gmx_bool bAfterOpenmpInit)
- {
- #ifdef HAVE_SCHED_GETAFFINITY
-     cpu_set_t mask_current;
-     int       i, ret, cpu_count, cpu_set;
-     gmx_bool  bAllSet;
- 
-     assert(hw_opt);
-     if (!hw_opt->bThreadPinning)
-     {
-         /* internal affinity setting is off, don't bother checking process affinity */
-         return;
-     }
- 
-     CPU_ZERO(&mask_current);
-     if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
-     {
-         /* failed to query affinity mask, will just return */
-         if (debug)
-         {
-             fprintf(debug, "Failed to query affinity mask (error %d)", ret);
-         }
-         return;
-     }
- 
-     /* Before proceeding with the actual check, make sure that the number of
-      * detected CPUs is >= the CPUs in the current set.
-      * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
- #ifdef CPU_COUNT
-     if (ncpus < CPU_COUNT(&mask_current))
-     {
-         if (debug)
-         {
-             fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
-                     ncpus, CPU_COUNT(&mask_current));
-         }
-         return;
-     }
- #endif /* CPU_COUNT */
- 
-     bAllSet = TRUE;
-     for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
-     {
-         bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
-     }
- 
-     if (!bAllSet)
-     {
-         if (!bAfterOpenmpInit)
-         {
-             md_print_warn(cr, fplog,
-                           "%s detected a non-default process affinity, "
-                           "so it will not attempt to pin its threads", ShortProgram());
-         }
-         else
-         {
-             md_print_warn(cr, fplog,
-                           "%s detected a non-default process affinity, "
-                           "probably set by the OpenMP library, "
-                           "so it will not attempt to pin its threads", ShortProgram());
-         }
-         hw_opt->bThreadPinning = FALSE;
- 
-         if (debug)
-         {
-             fprintf(debug, "Non-default affinity mask found, mdrun will not pin threads\n");
-         }
-     }
-     else
-     {
-         if (debug)
-         {
-             fprintf(debug, "Default affinity mask found\n");
-         }
-     }
- #endif /* HAVE_SCHED_GETAFFINITY */
- }
- 
- /* Set CPU affinity. Can be important for performance.
-    On some systems (e.g. Cray) CPU Affinity is set by default.
-    But default assigning doesn't work (well) with only some ranks
-    having threads. This causes very low performance.
-    External tools have cumbersome syntax for setting affinity
-    in the case that only some ranks have threads.
-    Thus it is important that GROMACS sets the affinity internally
-    if only PME is using threads.
- */
- static void set_cpu_affinity(FILE *fplog,
-                              const t_commrec *cr,
-                              gmx_hw_opt_t *hw_opt,
-                              int nthreads_pme,
-                              const gmx_hw_info_t *hwinfo,
-                              const t_inputrec *inputrec)
- {
- #if defined GMX_THREAD_MPI
-     /* With the number of TMPI threads equal to the number of cores
-      * we already pinned in thread-MPI, so don't pin again here.
-      */
-     if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
-     {
-         return;
-     }
- #endif
- 
- #ifndef __APPLE__
-     /* If the tMPI thread affinity setting is not supported encourage the user
-      * to report it as it's either a bug or an exotic platform which we might
-      * want to support. */
-     if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
-     {
-         md_print_warn(NULL, fplog,
-                       "Can not set thread affinities on the current plarform. On NUMA systems this\n"
-                       "can cause performance degradation. If you think your platform should support\n"
-                       "setting affinities, contact the GROMACS developers.");
-         return;
-     }
- #endif /* __APPLE__ */
- 
-     if (hw_opt->bThreadPinning)
-     {
-         int nth_affinity_set, thread_id_node, thread_id,
-             nthread_local, nthread_node, nthread_hw_max, nphyscore;
-         int offset;
-         char *env;
- 
-         /* threads on this MPI process or TMPI thread */
-         if (cr->duty & DUTY_PP)
-         {
-             nthread_local = gmx_omp_nthreads_get(emntNonbonded);
-         }
-         else
-         {
-             nthread_local = gmx_omp_nthreads_get(emntPME);
-         }
- 
-         /* map the current process to cores */
-         thread_id_node = 0;
-         nthread_node = nthread_local;
- #ifdef GMX_MPI
-         if (PAR(cr) || MULTISIM(cr))
-         {
-             /* We need to determine a scan of the thread counts in this
-              * compute node.
-              */
-             MPI_Comm comm_intra;
- 
-             MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
-                            &comm_intra);
-             MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
-             /* MPI_Scan is inclusive, but here we need exclusive */
-             thread_id_node -= nthread_local;
-             /* Get the total number of threads on this physical node */
-             MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
-             MPI_Comm_free(&comm_intra);
-         }
- #endif
- 
-         offset = 0;
-         if (hw_opt->core_pinning_offset > 0)
-         {
-             offset = hw_opt->core_pinning_offset;
-             if (SIMMASTER(cr))
-             {
-                 fprintf(stderr, "Applying core pinning offset %d\n", offset);
-             }
-             if (fplog)
-             {
-                 fprintf(fplog, "Applying core pinning offset %d\n", offset);
-             }
-         }
- 
-         /* With Intel Hyper-Threading enabled, we want to pin consecutive
-          * threads to physical cores when using more threads than physical
-          * cores or when the user requests so.
-          */
-         nthread_hw_max = hwinfo->nthreads_hw_avail;
-         nphyscore = -1;
-         if (hw_opt->bPinHyperthreading ||
-             (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
-              nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
-         {
-             if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
-             {
-                 /* We print to stderr on all processes, as we might have
-                  * different settings on different physical nodes.
-                  */
-                 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
-                 {
-                     md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
-                                   "but non-Intel CPU detected (vendor: %s)\n",
-                                   gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
-                 }
-                 else
-                 {
-                     md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
-                                   "but the CPU detected does not have Intel Hyper-Threading support "
-                                   "(or it is turned off)\n");
-                 }
-             }
-             nphyscore = nthread_hw_max/2;
- 
-             if (SIMMASTER(cr))
-             {
-                 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
-                         nphyscore);
-             }
-             if (fplog)
-             {
-                 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
-                         nphyscore);
-             }
-         }
- 
-         /* Set the per-thread affinity. In order to be able to check the success
-          * of affinity settings, we will set nth_affinity_set to 1 on threads
-          * where the affinity setting succeded and to 0 where it failed.
-          * Reducing these 0/1 values over the threads will give the total number
-          * of threads on which we succeeded.
-          */
-          nth_affinity_set = 0;
- #pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
-                      reduction(+:nth_affinity_set)
-         {
-             int      core;
-             gmx_bool setaffinity_ret;
- 
-             thread_id       = gmx_omp_get_thread_num();
-             thread_id_node += thread_id;
-             if (nphyscore <= 0)
-             {
-                 core = offset + thread_id_node;
-             }
-             else
-             {
-                 /* Lock pairs of threads to the same hyperthreaded core */
-                 core = offset + thread_id_node/2 + (thread_id_node % 2)*nphyscore;
-             }
- 
-             setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
- 
-             /* store the per-thread success-values of the setaffinity */
-             nth_affinity_set = (setaffinity_ret == 0);
- 
-             if (debug)
-             {
-                 fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
-                         cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
-             }
-         }
- 
-         if (nth_affinity_set > nthread_local)
-         {
-             char msg[STRLEN];
- 
-             sprintf(msg, "Looks like we have set affinity for more threads than "
-                     "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
-             gmx_incons(msg);
-         }
-         else
-         {
-             /* check & warn if some threads failed to set their affinities */
-             if (nth_affinity_set != nthread_local)
-             {
-                 char sbuf1[STRLEN], sbuf2[STRLEN];
- 
-                 /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
-                 sbuf1[0] = sbuf2[0] = '\0';
- #ifdef GMX_MPI
- #ifdef GMX_THREAD_MPI
-                 sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
- #else /* GMX_LIB_MPI */
-                 sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
- #endif
- #endif /* GMX_MPI */
- 
-                 if (nthread_local > 1)
-                 {
-                     sprintf(sbuf2, "of %d/%d thread%s ",
-                             nthread_local - nth_affinity_set, nthread_local,
-                             (nthread_local - nth_affinity_set) > 1 ? "s" : "");
-                 }
- 
-                 md_print_warn(NULL, fplog,
-                               "NOTE: %sAffinity setting %sfailed.\n"
-                               "      This can cause performance degradation!",
-                               sbuf1, sbuf2);
-             }
-         }
-     }
- }
- 
- 
 +static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
 +                                    int cutoff_scheme,
 +                                    gmx_bool bIsSimMaster)
 +{
 +    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
 +
 +#ifndef GMX_THREAD_MPI
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 +    }
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 +    }
 +#endif
 +
 +    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
 +    {
 +        /* We have the same number of OpenMP threads for PP and PME processes,
 +         * thus we can perform several consistency checks.
 +         */
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
 +        }
 +
 +        if (hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +    }
 +
 +#ifndef GMX_OPENMP
 +    if (hw_opt->nthreads_omp > 1)
 +    {
 +        gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
 +    }
 +#endif
 +
 +    if (cutoff_scheme == ecutsGROUP)
 +    {
 +        /* We only have OpenMP support for PME only nodes */
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
 +                      ecutscheme_names[cutoff_scheme],
 +                      ecutscheme_names[ecutsVERLET]);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
 +    {
 +        gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
 +    }
 +
 +    if (hw_opt->nthreads_tot == 1)
 +    {
 +        hw_opt->nthreads_tmpi = 1;
 +
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
 +                      hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
 +    {
 +        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
 +                hw_opt->nthreads_tot,
 +                hw_opt->nthreads_tmpi,
 +                hw_opt->nthreads_omp,
 +                hw_opt->nthreads_omp_pme,
 +                hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
 +                
 +    }
 +}
 +
 +
 +/* Override the value in inputrec with value passed on the command line (if any) */
 +static void override_nsteps_cmdline(FILE *fplog,
 +                                    int nsteps_cmdline,
 +                                    t_inputrec *ir,
 +                                    const t_commrec *cr)
 +{
 +    assert(ir);
 +    assert(cr);
 +
 +    /* override with anything else than the default -2 */
 +    if (nsteps_cmdline > -2)
 +    {
 +        char stmp[STRLEN];
 +
 +        ir->nsteps = nsteps_cmdline;
 +        if (EI_DYNAMICS(ir->eI))
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
 +                    nsteps_cmdline, nsteps_cmdline*ir->delta_t);
 +        }
 +        else
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
 +                    nsteps_cmdline);
 +        }
 +
 +        md_print_warn(cr, fplog, "%s\n", stmp);
 +    }
 +}
 +
 +/* Data structure set by SIMMASTER which needs to be passed to all nodes
 + * before the other nodes have read the tpx file and called gmx_detect_hardware.
 + */
 +typedef struct {
 +    int cutoff_scheme; /* The cutoff scheme from inputrec_t */
 +    gmx_bool bUseGPU;       /* Use GPU or GPU emulation          */
 +} master_inf_t;
 +
 +int mdrunner(gmx_hw_opt_t *hw_opt,
 +             FILE *fplog,t_commrec *cr,int nfile,
 +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +             gmx_bool bCompact, int nstglobalcomm,
 +             ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +             const char *dddlb_opt,real dlb_scale,
 +             const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +             const char *nbpu_opt,
 +             int nsteps_cmdline, int nstepout,int resetstep,
 +             int nmultisim,int repl_ex_nst,int repl_ex_nex,
 +             int repl_ex_seed, real pforce,real cpt_period,real max_hours,
 +             const char *deviceOptions, unsigned long Flags)
 +{
 +    gmx_bool   bForceUseGPU,bTryUseGPU;
 +    double     nodetime=0,realtime;
 +    t_inputrec *inputrec;
 +    t_state    *state=NULL;
 +    matrix     box;
 +    gmx_ddbox_t ddbox={0};
 +    int        npme_major,npme_minor;
 +    real       tmpr1,tmpr2;
 +    t_nrnb     *nrnb;
 +    gmx_mtop_t *mtop=NULL;
 +    t_mdatoms  *mdatoms=NULL;
 +    t_forcerec *fr=NULL;
 +    t_fcdata   *fcd=NULL;
 +    real       ewaldcoeff=0;
 +    gmx_pme_t  *pmedata=NULL;
 +    gmx_vsite_t *vsite=NULL;
 +    gmx_constr_t constr;
 +    int        i,m,nChargePerturbed=-1,status,nalloc;
 +    char       *gro;
 +    gmx_wallcycle_t wcycle;
 +    gmx_bool       bReadRNG,bReadEkin;
 +    int        list;
 +    gmx_runtime_t runtime;
 +    int        rc;
 +    gmx_large_int_t reset_counters;
 +    gmx_edsam_t ed=NULL;
 +    t_commrec   *cr_old=cr; 
 +    int         nthreads_pme=1;
 +    int         nthreads_pp=1;
 +    gmx_membed_t membed=NULL;
 +    gmx_hw_info_t *hwinfo=NULL;
 +    master_inf_t minf={-1,FALSE};
 +
 +    /* CAUTION: threads may be started later on in this function, so
 +       cr doesn't reflect the final parallel state right now */
 +    snew(inputrec,1);
 +    snew(mtop,1);
 +    
 +    if (Flags & MD_APPENDFILES) 
 +    {
 +        fplog = NULL;
 +    }
 +
 +    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
 +    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
 +
 +    snew(state,1);
 +    if (SIMMASTER(cr)) 
 +    {
 +        /* Read (nearly) all data required for the simulation */
 +        read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
 +
 +        if (inputrec->cutoff_scheme != ecutsVERLET &&
 +            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
 +        {
 +            convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
 +        }
 +
 +        /* Detect hardware, gather information. With tMPI only thread 0 does it
 +         * and after threads are started broadcasts hwinfo around. */
 +        snew(hwinfo, 1);
 +        gmx_detect_hardware(fplog, hwinfo, cr,
 +                            bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
 +
 +        minf.cutoff_scheme = inputrec->cutoff_scheme;
 +        minf.bUseGPU       = FALSE;
 +
 +        if (inputrec->cutoff_scheme == ecutsVERLET)
 +        {
 +            prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
 +                                  inputrec,mtop,state->box,
 +                                  &minf.bUseGPU);
 +        }
 +        else if (hwinfo->bCanUseGPU)
 +        {
 +            md_print_warn(cr,fplog,
 +                          "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
 +                          "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
 +                          "      (for quick performance testing you can use the -testverlet option)\n");
 +
 +            if (bForceUseGPU)
 +            {
 +                gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
 +            }
 +        }
 +    }
 +#ifndef GMX_THREAD_MPI
 +    if (PAR(cr))
 +    {
 +        gmx_bcast_sim(sizeof(minf),&minf,cr);
 +    }
 +#endif
 +    if (minf.bUseGPU && cr->npmenodes == -1)
 +    {
 +        /* Don't automatically use PME-only nodes with GPUs */
 +        cr->npmenodes = 0;
 +    }
 +
 +    /* Check for externally set OpenMP affinity and turn off internal
 +     * pinning if any is found. We need to do this check early to tell
 +     * thread-MPI whether it should do pinning when spawning threads.
++     * TODO: the above no longer holds, we should move these checks down
 +     */
 +    gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
 +
 +#ifdef GMX_THREAD_MPI
 +    /* With thread-MPI inputrec is only set here on the master thread */
 +    if (SIMMASTER(cr))
 +#endif
 +    {
 +        check_and_update_hw_opt(hw_opt,minf.cutoff_scheme,SIMMASTER(cr));
 +
 +#ifdef GMX_THREAD_MPI
 +        /* Early check for externally set process affinity. Can't do over all
 +         * MPI processes because hwinfo is not available everywhere, but with
 +         * thread-MPI it's needed as pinning might get turned off which needs
 +         * to be known before starting thread-MPI. */
-         check_cpu_affinity_set(fplog,
-                                NULL,
-                                hw_opt, hwinfo->nthreads_hw_avail, FALSE);
++        gmx_check_thread_affinity_set(fplog,
++                                      NULL,
++                                      hw_opt, hwinfo->nthreads_hw_avail, FALSE);
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
 +        {
 +            gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
 +        }
 +#endif
 +
 +        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
 +            cr->npmenodes <= 0)
 +        {
 +            gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
 +        }
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    if (SIMMASTER(cr))
 +    {
 +        /* NOW the threads will be started: */
 +        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
 +                                                 hw_opt,
 +                                                 inputrec, mtop,
 +                                                 cr, fplog);
 +        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 1)
 +        {
 +            /* now start the threads. */
 +            cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm, 
 +                                      oenv, bVerbose, bCompact, nstglobalcomm, 
 +                                      ddxyz, dd_node_order, rdd, rconstr, 
 +                                      dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
 +                                      nbpu_opt,
 +                                      nsteps_cmdline, nstepout, resetstep, nmultisim, 
 +                                      repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
 +                                      cpt_period, max_hours, deviceOptions, 
 +                                      Flags);
 +            /* the main thread continues here with a new cr. We don't deallocate
 +               the old cr because other threads may still be reading it. */
 +            if (cr == NULL)
 +            {
 +                gmx_comm("Failed to spawn threads");
 +            }
 +        }
 +    }
 +#endif
 +    /* END OF CAUTION: cr is now reliable */
 +
 +    /* g_membed initialisation *
 +     * Because we change the mtop, init_membed is called before the init_parallel *
 +     * (in case we ever want to make it run in parallel) */
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"Initializing membed");
 +        }
 +        membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* now broadcast everything to the non-master nodes/threads: */
 +        init_parallel(fplog, cr, inputrec, mtop);
 +
 +        /* This check needs to happen after get_nthreads_mpi() */
 +        if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
 +                                 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
 +        }
 +    }
 +    if (fplog != NULL)
 +    {
 +        pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
 +    }
 +
 +#if defined GMX_THREAD_MPI
 +    /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
 +     * to the other threads  -- slightly uncool, but works fine, just need to
 +     * make sure that the data doesn't get freed twice. */
 +    if (cr->nnodes > 1)
 +    {
 +        if (!SIMMASTER(cr))
 +        {
 +            snew(hwinfo, 1);
 +        }
 +        gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
 +    }
 +#else
 +    if (PAR(cr) && !SIMMASTER(cr))
 +    {
 +        /* now we have inputrec on all nodes, can run the detection */
 +        /* TODO: perhaps it's better to propagate within a node instead? */
 +        snew(hwinfo, 1);
 +        gmx_detect_hardware(fplog, hwinfo, cr,
 +                                 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
 +    }
 +
 +    /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
-     check_cpu_affinity_set(fplog, cr,
-                            hw_opt, hwinfo->nthreads_hw_avail, FALSE);
++    gmx_check_thread_affinity_set(fplog, cr,
++                                  hw_opt, hwinfo->nthreads_hw_avail, FALSE);
 +#endif
 +
 +    /* now make sure the state is initialized and propagated */
 +    set_state_entries(state,inputrec,cr->nnodes);
 +
 +    /* A parallel command line option consistency check that we can
 +       only do after any threads have started. */
 +    if (!PAR(cr) &&
 +        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
 +    {
 +        gmx_fatal(FARGS,
 +                  "The -dd or -npme option request a parallel simulation, "
 +#ifndef GMX_MPI
 +                  "but %s was compiled without threads or MPI enabled"
 +#else
 +#ifdef GMX_THREAD_MPI
 +                  "but the number of threads (option -nt) is 1"
 +#else
 +                  "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
 +#endif
 +#endif
 +                  , ShortProgram()
 +            );
 +    }
 +
 +    if ((Flags & MD_RERUN) &&
 +        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 +    {
 +        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
 +    }
 +
 +    if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
 +    {
 +        /* All-vs-all loops do not work with domain decomposition */
 +        Flags |= MD_PARTDEC;
 +    }
 +
 +    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
 +    {
 +        if (cr->npmenodes > 0)
 +        {
 +            if (!EEL_PME(inputrec->coulombtype))
 +            {
 +                gmx_fatal_collective(FARGS,cr,NULL,
 +                                     "PME nodes are requested, but the system does not use PME electrostatics");
 +            }
 +            if (Flags & MD_PARTDEC)
 +            {
 +                gmx_fatal_collective(FARGS,cr,NULL,
 +                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
 +            }
 +        }
 +
 +        cr->npmenodes = 0;
 +    }
 +
 +#ifdef GMX_FAHCORE
 +    fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
 +#endif
 +
 +    /* NMR restraints must be initialized before load_checkpoint,
 +     * since with time averaging the history is added to t_state.
 +     * For proper consistency check we therefore need to extend
 +     * t_state here.
 +     * So the PME-only nodes (if present) will also initialize
 +     * the distance restraints.
 +     */
 +    snew(fcd,1);
 +
 +    /* This needs to be called before read_checkpoint to extend the state */
 +    init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state, repl_ex_nst > 0);
 +
 +    if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
 +    {
 +        if (PAR(cr) && !(Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
 +        }
 +        /* Orientation restraints */
 +        if (MASTER(cr))
 +        {
 +            init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
 +                        state);
 +        }
 +    }
 +
 +    if (DEFORM(*inputrec))
 +    {
 +        /* Store the deform reference box before reading the checkpoint */
 +        if (SIMMASTER(cr))
 +        {
 +            copy_mat(state->box,box);
 +        }
 +        if (PAR(cr))
 +        {
 +            gmx_bcast(sizeof(box),box,cr);
 +        }
 +        /* Because we do not have the update struct available yet
 +         * in which the reference values should be stored,
 +         * we store them temporarily in static variables.
 +         * This should be thread safe, since they are only written once
 +         * and with identical values.
 +         */
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        deform_init_init_step_tpx = inputrec->init_step;
 +        copy_mat(box,deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    if (opt2bSet("-cpi",nfile,fnm)) 
 +    {
 +        /* Check if checkpoint file exists before doing continuation.
 +         * This way we can use identical input options for the first and subsequent runs...
 +         */
 +        if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
 +        {
 +            load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
 +                            cr,Flags & MD_PARTDEC,ddxyz,
 +                            inputrec,state,&bReadRNG,&bReadEkin,
 +                            (Flags & MD_APPENDFILES),
 +                            (Flags & MD_APPENDFILESSET));
 +            
 +            if (bReadRNG)
 +            {
 +                Flags |= MD_READ_RNG;
 +            }
 +            if (bReadEkin)
 +            {
 +                Flags |= MD_READ_EKIN;
 +            }
 +        }
 +    }
 +
 +    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
 +#ifdef GMX_THREAD_MPI
 +        /* With thread MPI only the master node/thread exists in mdrun.c,
 +         * therefore non-master nodes need to open the "seppot" log file here.
 +         */
 +        || (!MASTER(cr) && (Flags & MD_SEPPOT))
 +#endif
 +        )
 +    {
 +        gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
 +                             Flags,&fplog);
 +    }
 +
 +    /* override nsteps with value from cmdline */
 +    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
 +
 +    if (SIMMASTER(cr)) 
 +    {
 +        copy_mat(state->box,box);
 +    }
 +
 +    if (PAR(cr)) 
 +    {
 +        gmx_bcast(sizeof(box),box,cr);
 +    }
 +
 +    /* Essential dynamics */
 +    if (opt2bSet("-ei",nfile,fnm))
 +    {
 +        /* Open input and output files, allocate space for ED data structure */
 +        ed = ed_open(mtop->natoms,&state->edsamstate,nfile,fnm,Flags,oenv,cr);
 +    }
 +
 +    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
 +                     EI_TPI(inputrec->eI) ||
 +                     inputrec->eI == eiNM))
 +    {
 +        cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
 +                                           dddlb_opt,dlb_scale,
 +                                           ddcsx,ddcsy,ddcsz,
 +                                           mtop,inputrec,
 +                                           box,state->x,
 +                                           &ddbox,&npme_major,&npme_minor);
 +
 +        make_dd_communicators(fplog,cr,dd_node_order);
 +
 +        /* Set overallocation to avoid frequent reallocation of arrays */
 +        set_over_alloc_dd(TRUE);
 +    }
 +    else
 +    {
 +        /* PME, if used, is done on all nodes with 1D decomposition */
 +        cr->npmenodes = 0;
 +        cr->duty = (DUTY_PP | DUTY_PME);
 +        npme_major = 1;
 +        npme_minor = 1;
 +        if (!EI_TPI(inputrec->eI))
 +        {
 +            npme_major = cr->nnodes;
 +        }
 +        
 +        if (inputrec->ePBC == epbcSCREW)
 +        {
 +            gmx_fatal(FARGS,
 +                      "pbc=%s is only implemented with domain decomposition",
 +                      epbc_names[inputrec->ePBC]);
 +        }
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* After possible communicator splitting in make_dd_communicators.
 +         * we can set up the intra/inter node communication.
 +         */
 +        gmx_setup_nodecomm(fplog,cr);
 +    }
 +
 +    /* Initialize per-physical-node MPI process/thread ID and counters. */
 +    gmx_init_intranode_counters(cr);
 +
 +#ifdef GMX_MPI
 +    md_print_info(cr,fplog,"Using %d MPI %s\n",
 +                  cr->nnodes,
 +#ifdef GMX_THREAD_MPI
 +                  cr->nnodes==1 ? "thread" : "threads"
 +#else
 +                  cr->nnodes==1 ? "process" : "processes"
 +#endif
 +                  );
 +    fflush(stderr);
 +#endif
 +
 +    gmx_omp_nthreads_init(fplog, cr,
 +                          hwinfo->nthreads_hw_avail,
 +                          hw_opt->nthreads_omp,
 +                          hw_opt->nthreads_omp_pme,
 +                          (cr->duty & DUTY_PP) == 0,
 +                          inputrec->cutoff_scheme == ecutsVERLET);
 +
 +    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
 +
 +    /* getting number of PP/PME threads
 +       PME: env variable should be read only on one node to make sure it is 
 +       identical everywhere;
 +     */
 +    /* TODO nthreads_pp is only used for pinning threads.
 +     * This is a temporary solution until we have a hw topology library.
 +     */
 +    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
 +    nthreads_pme = gmx_omp_nthreads_get(emntPME);
 +
 +    wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
 +
 +    if (PAR(cr))
 +    {
 +        /* Master synchronizes its value of reset_counters with all nodes 
 +         * including PME only nodes */
 +        reset_counters = wcycle_get_reset_counters(wcycle);
 +        gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
 +        wcycle_set_reset_counters(wcycle, reset_counters);
 +    }
 +
 +    snew(nrnb,1);
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* For domain decomposition we allocate dynamically
 +         * in dd_partition_system.
 +         */
 +        if (DOMAINDECOMP(cr))
 +        {
 +            bcast_state_setup(cr,state);
 +        }
 +        else
 +        {
 +            if (PAR(cr))
 +            {
 +                bcast_state(cr,state,TRUE);
 +            }
 +        }
 +
 +        /* Initiate forcerecord */
 +        fr = mk_forcerec();
 +        fr->hwinfo = hwinfo;
 +        init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +                      opt2fn("-table",nfile,fnm),
 +                      opt2fn("-tabletf",nfile,fnm),
 +                      opt2fn("-tablep",nfile,fnm),
 +                      opt2fn("-tableb",nfile,fnm),
 +                      nbpu_opt,
 +                      FALSE,pforce);
 +
 +        /* version for PCA_NOT_READ_NODE (see md.c) */
 +        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +          "nofile","nofile","nofile","nofile",FALSE,pforce);
 +          */        
 +        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
 +
 +        /* Initialize QM-MM */
 +        if(fr->bQMMM)
 +        {
 +            init_QMMMrec(cr,box,mtop,inputrec,fr);
 +        }
 +
 +        /* Initialize the mdatoms structure.
 +         * mdatoms is not filled with atom data,
 +         * as this can not be done now with domain decomposition.
 +         */
 +        mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
 +
 +        /* Initialize the virtual site communication */
 +        vsite = init_vsite(mtop,cr,FALSE);
 +
 +        calc_shifts(box,fr->shift_vec);
 +
 +        /* With periodic molecules the charge groups should be whole at start up
 +         * and the virtual sites should not be far from their proper positions.
 +         */
 +        if (!inputrec->bContinuation && MASTER(cr) &&
 +            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
 +        {
 +            /* Make molecules whole at start of run */
 +            if (fr->ePBC != epbcNONE)
 +            {
 +                do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
 +            }
 +            if (vsite)
 +            {
 +                /* Correct initial vsite positions are required
 +                 * for the initial distribution in the domain decomposition
 +                 * and for the initial shell prediction.
 +                 */
 +                construct_vsites_mtop(fplog,vsite,mtop,state->x);
 +            }
 +        }
 +
 +        if (EEL_PME(fr->eeltype))
 +        {
 +            ewaldcoeff = fr->ewaldcoeff;
 +            pmedata = &fr->pmedata;
 +        }
 +        else
 +        {
 +            pmedata = NULL;
 +        }
 +    }
 +    else
 +    {
 +        /* This is a PME only node */
 +
 +        /* We don't need the state */
 +        done_state(state);
 +
 +        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
 +        snew(pmedata,1);
 +    }
 +
-     /* Before setting affinity, check whether the affinity has changed
-      * - which indicates that probably the OpenMP library has changed it since
-      * we first checked). */
-     check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
++    if (hw_opt->thread_affinity != threadaffOFF)
++    {
++        /* Before setting affinity, check whether the affinity has changed
++         * - which indicates that probably the OpenMP library has changed it
++         * since we first checked).
++         */
++        gmx_check_thread_affinity_set(fplog, cr,
++                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);
 +
-     /* Set the CPU affinity */
-     set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
++        /* Set the CPU affinity */
++        gmx_set_thread_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
++    }
 +
 +    /* Initiate PME if necessary,
 +     * either on all nodes or on dedicated PME nodes only. */
 +    if (EEL_PME(inputrec->coulombtype))
 +    {
 +        if (mdatoms)
 +        {
 +            nChargePerturbed = mdatoms->nChargePerturbed;
 +        }
 +        if (cr->npmenodes > 0)
 +        {
 +            /* The PME only nodes need to know nChargePerturbed */
 +            gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
 +        }
 +
 +        if (cr->duty & DUTY_PME)
 +        {
 +            status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
 +                                  mtop ? mtop->natoms : 0,nChargePerturbed,
 +                                  (Flags & MD_REPRODUCIBLE),nthreads_pme);
 +            if (status != 0) 
 +            {
 +                gmx_fatal(FARGS,"Error %d initializing PME",status);
 +            }
 +        }
 +    }
 +
 +
 +    if (integrator[inputrec->eI].func == do_md)
 +    {
 +        /* Turn on signal handling on all nodes */
 +        /*
 +         * (A user signal from the PME nodes (if any)
 +         * is communicated to the PP nodes.
 +         */
 +        signal_handler_install();
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        if (inputrec->ePull != epullNO)
 +        {
 +            /* Initialize pull code */
 +            init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
 +                      EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +           /* Initialize enforced rotation code */
 +           init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
 +                    bVerbose,Flags);
 +        }
 +
 +        constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
 +                            Flags & MD_DDBONDCHECK,fr->cginfo_mb);
 +
 +            set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
 +
 +            setup_dd_grid(fplog,cr->dd);
 +        }
 +
 +        /* Now do whatever the user wants us to do (how flexible...) */
 +        integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
 +                                      oenv,bVerbose,bCompact,
 +                                      nstglobalcomm,
 +                                      vsite,constr,
 +                                      nstepout,inputrec,mtop,
 +                                      fcd,state,
 +                                      mdatoms,nrnb,wcycle,ed,fr,
 +                                      repl_ex_nst,repl_ex_nex,repl_ex_seed,
 +                                      membed,
 +                                      cpt_period,max_hours,
 +                                      deviceOptions,
 +                                      Flags,
 +                                      &runtime);
 +
 +        if (inputrec->ePull != epullNO)
 +        {
 +            finish_pull(fplog,inputrec->pull);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +            finish_rot(fplog,inputrec->rot);
 +        }
 +
 +    } 
 +    else 
 +    {
 +        /* do PME only */
 +        gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
 +    }
 +
 +    if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
 +    {
 +        /* Some timing stats */  
 +        if (SIMMASTER(cr))
 +        {
 +            if (runtime.proc == 0)
 +            {
 +                runtime.proc = runtime.real;
 +            }
 +        }
 +        else
 +        {
 +            runtime.real = 0;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle,ewcRUN);
 +
 +    /* Finish up, write some stuff
 +     * if rerunMD, don't write last frame again 
 +     */
 +    finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
 +               inputrec,nrnb,wcycle,&runtime,
 +               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
 +                 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
 +               nthreads_pp, 
 +               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
 +
 +    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
 +    {
 +        char gpu_err_str[STRLEN];
 +
 +        /* free GPU memory and uninitialize GPU (by destroying the context) */
 +        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
 +
 +        if (!free_gpu(gpu_err_str))
 +        {
 +            gmx_warning("On node %d failed to free GPU #%d: %s",
 +                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
 +        }
 +    }
 +
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +        sfree(membed);
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    if (PAR(cr) && SIMMASTER(cr))
 +#endif
 +    {
 +        gmx_hardware_info_free(hwinfo);
 +    }
 +
 +    /* Does what it says */  
 +    print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
 +
 +    /* Close logfile already here if we were appending to it */
 +    if (MASTER(cr) && (Flags & MD_APPENDFILES))
 +    {
 +        gmx_log_close(fplog);
 +    }	
 +
 +    rc=(int)gmx_get_stop_condition();
 +
 +#ifdef GMX_THREAD_MPI
 +    /* we need to join all threads. The sub-threads join when they
 +       exit this function, but the master thread needs to be told to 
 +       wait for that. */
 +    if (PAR(cr) && MASTER(cr))
 +    {
 +        tMPI_Finalize();
 +    }
 +#endif
 +
 +    return rc;
 +}
diff --cc src/tools/levenmar.c
index 63ff58fac7,1be1cc5262..0d96233b89
--- a/src/tools/levenmar.c
+++ b/src/tools/levenmar.c
@@@ -206,65 -254,106 +206,65 @@@ static void dump_mat(int n,real **a
  
  gmx_bool gaussj(real **a, int n, real **b, int m)
  {
 -    int *indxc, *indxr, *ipiv;
 -    int  i, icol = 0, irow = 0, j, k, l, ll;
 -    real big, dum, pivinv;
 -
 -    indxc = ivector(1, n);
 -    indxr = ivector(1, n);
 -    ipiv  = ivector(1, n);
 -    for (j = 1; j <= n; j++)
 -    {
 -        ipiv[j] = 0;
 -    }
 -    for (i = 1; i <= n; i++)
 -    {
 -        big = 0.0;
 -        for (j = 1; j <= n; j++)
 -        {
 -            if (ipiv[j] != 1)
 -            {
 -                for (k = 1; k <= n; k++)
 -                {
 -                    if (ipiv[k] == 0)
 -                    {
 -                        if (fabs(a[j][k]) >= big)
 -                        {
 -                            big  = fabs(a[j][k]);
 -                            irow = j;
 -                            icol = k;
 -                        }
 -                    }
 -                    else if (ipiv[k] > 1)
 -                    {
 -                        nrerror("GAUSSJ: Singular Matrix-1", FALSE);
 -                        return FALSE;
 -                    }
 -                }
 -            }
 -        }
 -        ++(ipiv[icol]);
 -        if (irow != icol)
 -        {
 -            for (l = 1; l <= n; l++)
 -            {
 -                SWAP(a[irow][l], a[icol][l]);
 -            }
 -            for (l = 1; l <= m; l++)
 -            {
 -                SWAP(b[irow][l], b[icol][l]);
 -            }
 -        }
 -        indxr[i] = irow;
 -        indxc[i] = icol;
 -        if (a[icol][icol] == 0.0)
 -        {
 -            fprintf(stderr, "irow = %d, icol = %d\n", irow, icol);
 -            dump_mat(n, a);
 -            nrerror("GAUSSJ: Singular Matrix-2", FALSE);
 -            return FALSE;
 -        }
 -        pivinv        = 1.0/a[icol][icol];
 -        a[icol][icol] = 1.0;
 -        for (l = 1; l <= n; l++)
 -        {
 -            a[icol][l] *= pivinv;
 -        }
 -        for (l = 1; l <= m; l++)
 -        {
 -            b[icol][l] *= pivinv;
 -        }
 -        for (ll = 1; ll <= n; ll++)
 -        {
 -            if (ll != icol)
 -            {
 -                dum         = a[ll][icol];
 -                a[ll][icol] = 0.0;
 -                for (l = 1; l <= n; l++)
 -                {
 -                    a[ll][l] -= a[icol][l]*dum;
 -                }
 -                for (l = 1; l <= m; l++)
 -                {
 -                    b[ll][l] -= b[icol][l]*dum;
 -                }
 -            }
 -        }
 -    }
 -    for (l = n; l >= 1; l--)
 -    {
 -        if (indxr[l] != indxc[l])
 -        {
 -            for (k = 1; k <= n; k++)
 -            {
 -                SWAP(a[k][indxr[l]], a[k][indxc[l]]);
 -            }
 -        }
 -    }
 -    free_ivector(ipiv, 1);
 -    free_ivector(indxr, 1);
 -    free_ivector(indxc, 1);
 -
 -    return TRUE;
 +  int *indxc,*indxr,*ipiv;
 +  int i,icol=0,irow=0,j,k,l,ll;
 +  real big,dum,pivinv;
 +  
 +  indxc=ivector(1,n);
 +  indxr=ivector(1,n);
 +  ipiv=ivector(1,n);
 +  for (j=1;j<=n;j++) ipiv[j]=0;
 +  for (i=1;i<=n;i++) {
 +    big=0.0;
 +    for (j=1;j<=n;j++)
 +      if (ipiv[j] != 1)
 +	for (k=1;k<=n;k++) {
 +	  if (ipiv[k] == 0) {
 +	    if (fabs(a[j][k]) >= big) {
 +	      big=fabs(a[j][k]);
 +	      irow=j;
 +	      icol=k;
 +	    }
 +	  } else if (ipiv[k] > 1) {
 +	    nrerror("GAUSSJ: Singular Matrix-1", FALSE);
 +	    return FALSE;
 +	  }
 +	}
 +    ++(ipiv[icol]);
 +    if (irow != icol) {
-       for (l=1;l<=n;l++) SWAP(a[irow][l],a[icol][l])
- 	for (l=1;l<=m;l++) SWAP(b[irow][l],b[icol][l])
++        for (l=1;l<=n;l++) SWAP(a[irow][l],a[icol][l]);
++        for (l=1;l<=m;l++) SWAP(b[irow][l],b[icol][l]);
 +    }
 +    indxr[i]=irow;
 +    indxc[i]=icol;
 +    if (a[icol][icol] == 0.0) {
 +      fprintf(stderr,"irow = %d, icol = %d\n",irow,icol);
 +      dump_mat(n,a);
 +      nrerror("GAUSSJ: Singular Matrix-2", FALSE);
 +      return FALSE;
 +    }
 +    pivinv=1.0/a[icol][icol];
 +    a[icol][icol]=1.0;
 +    for (l=1;l<=n;l++) a[icol][l] *= pivinv;
 +    for (l=1;l<=m;l++) b[icol][l] *= pivinv;
 +    for (ll=1;ll<=n;ll++)
 +      if (ll != icol) {
 +	dum=a[ll][icol];
 +	a[ll][icol]=0.0;
 +	for (l=1;l<=n;l++) a[ll][l] -= a[icol][l]*dum;
 +	for (l=1;l<=m;l++) b[ll][l] -= b[icol][l]*dum;
 +      }
 +  }
 +  for (l=n;l>=1;l--) {
 +    if (indxr[l] != indxc[l])
 +      for (k=1;k<=n;k++)
 +	SWAP(a[k][indxr[l]],a[k][indxc[l]]);
 +  }
 +  free_ivector(ipiv,1);
 +  free_ivector(indxr,1);
 +  free_ivector(indxc,1);
 +  
 +  return TRUE;
  }
  
  #undef SWAP
@@@ -272,152 -361,231 +272,152 @@@
  
  static void covsrt(real **covar, int ma, int lista[], int mfit)
  {
 -    int  i, j;
 -    real swap;
 -
 -    for (j = 1; j < ma; j++)
 -    {
 -        for (i = j+1; i <= ma; i++)
 -        {
 -            covar[i][j] = 0.0;
 -        }
 -    }
 -    for (i = 1; i < mfit; i++)
 -    {
 -        for (j = i+1; j <= mfit; j++)
 -        {
 -            if (lista[j] > lista[i])
 -            {
 -                covar[lista[j]][lista[i]] = covar[i][j];
 -            }
 -            else
 -            {
 -                covar[lista[i]][lista[j]] = covar[i][j];
 -            }
 -        }
 -    }
 -    swap = covar[1][1];
 -    for (j = 1; j <= ma; j++)
 -    {
 -        covar[1][j] = covar[j][j];
 -        covar[j][j] = 0.0;
 -    }
 -    covar[lista[1]][lista[1]] = swap;
 -    for (j = 2; j <= mfit; j++)
 -    {
 -        covar[lista[j]][lista[j]] = covar[1][j];
 -    }
 -    for (j = 2; j <= ma; j++)
 -    {
 -        for (i = 1; i <= j-1; i++)
 -        {
 -            covar[i][j] = covar[j][i];
 -        }
 -    }
 +  int i,j;
 +  real swap;
 +  
 +  for (j=1;j<ma;j++)
 +    for (i=j+1;i<=ma;i++) covar[i][j]=0.0;
 +  for (i=1;i<mfit;i++)
 +    for (j=i+1;j<=mfit;j++) {
 +      if (lista[j] > lista[i])
 +	covar[lista[j]][lista[i]]=covar[i][j];
 +      else
 +	covar[lista[i]][lista[j]]=covar[i][j];
 +    }
 +  swap=covar[1][1];
 +  for (j=1;j<=ma;j++) {
 +    covar[1][j]=covar[j][j];
 +    covar[j][j]=0.0;
 +  }
 +  covar[lista[1]][lista[1]]=swap;
 +  for (j=2;j<=mfit;j++) covar[lista[j]][lista[j]]=covar[1][j];
 +  for (j=2;j<=ma;j++)
 +    for (i=1;i<=j-1;i++) covar[i][j]=covar[j][i];
  }
  
 -#define SWAP(a, b) {swap = (a); (a) = (b); (b) = swap; }
 +#define SWAP(a,b) {swap=(a);(a)=(b);(b)=swap;}
  
 -static void covsrt_new(real **covar, int ma, int ia[], int mfit)
 -/* Expand in storage the covariance matrix covar, so as to take
 - * into account parameters that are being held fixed. (For the
 - * latter, return zero covariances.)
 - */
 +static void covsrt_new(real **covar,int ma, int ia[], int mfit)
 +     /* Expand in storage the covariance matrix covar, so as to take 
 +      * into account parameters that are being held fixed. (For the
 +      * latter, return zero covariances.)
 +      */
  {
 -    int  i, j, k;
 -    real swap;
 -    for (i = mfit+1; i <= ma; i++)
 -    {
 -        for (j = 1; j <= i; j++)
 -        {
 -            covar[i][j] = covar[j][i] = 0.0;
 -        }
 -    }
 -    k = mfit;
 -    for (j = ma; j >= 1; j--)
 -    {
 -        if (ia[j])
 -        {
 -            for (i = 1; i <= ma; i++)
 -            {
 -                SWAP(covar[i][k], covar[i][j]);
 -            }
 -            for (i = 1; i <= ma; i++)
 -            {
 -                SWAP(covar[k][i], covar[j][i]);
 -            }
 -            k--;
 -        }
 -    }
 +  int i,j,k;
 +  real swap;
 +  for (i=mfit+1;i<=ma;i++)
 +    for (j=1;j<=i;j++) covar[i][j]=covar[j][i]=0.0;
 +  k=mfit;
 +  for (j=ma;j>=1;j--) {
 +    if (ia[j]) {
-       for (i=1;i<=ma;i++) SWAP(covar[i][k],covar[i][j])
-       for (i=1;i<=ma;i++) SWAP(covar[k][i],covar[j][i])
++        for (i=1;i<=ma;i++) SWAP(covar[i][k],covar[i][j]);
++        for (i=1;i<=ma;i++) SWAP(covar[k][i],covar[j][i]);
 +      k--;
 +    }
 +  }
  }
  #undef SWAP
 -
 -static void mrqcof(real x[], real y[], real sig[], int ndata, real a[],
 -                   int ma, int lista[], int mfit,
 -                   real **alpha, real beta[], real *chisq,
 -                   void (*funcs)(real, real *, real *, real *))
 +	
 +static void mrqcof(real x[], real y[], real sig[], int ndata, real a[], 
 +		   int ma, int lista[], int mfit, 
 +		   real **alpha, real beta[], real *chisq,
 +		   void (*funcs)(real,real *,real *,real *)) 
  {
 -    int  k, j, i;
 -    real ymod, wt, sig2i, dy, *dyda;
 -
 -    dyda = rvector(1, ma);
 -    for (j = 1; j <= mfit; j++)
 -    {
 -        for (k = 1; k <= j; k++)
 -        {
 -            alpha[j][k] = 0.0;
 -        }
 -        beta[j] = 0.0;
 -    }
 -    *chisq = 0.0;
 -    for (i = 1; i <= ndata; i++)
 -    {
 -        (*funcs)(x[i], a, &ymod, dyda);
 -        sig2i = 1.0/(sig[i]*sig[i]);
 -        dy    = y[i]-ymod;
 -        for (j = 1; j <= mfit; j++)
 -        {
 -            wt = dyda[lista[j]]*sig2i;
 -            for (k = 1; k <= j; k++)
 -            {
 -                alpha[j][k] += wt*dyda[lista[k]];
 -            }
 -            beta[j] += dy*wt;
 -        }
 -        (*chisq) += dy*dy*sig2i;
 -    }
 -    for (j = 2; j <= mfit; j++)
 -    {
 -        for (k = 1; k <= j-1; k++)
 -        {
 -            alpha[k][j] = alpha[j][k];
 -        }
 -    }
 -    free_vector(dyda, 1);
 +  int k,j,i;
 +  real ymod,wt,sig2i,dy,*dyda;
 +  
 +  dyda=rvector(1,ma);
 +  for (j=1;j<=mfit;j++) {
 +    for (k=1;k<=j;k++) alpha[j][k]=0.0;
 +    beta[j]=0.0;
 +  }
 +  *chisq=0.0;
 +  for (i=1;i<=ndata;i++) {
 +    (*funcs)(x[i],a,&ymod,dyda);
 +    sig2i=1.0/(sig[i]*sig[i]);
 +    dy=y[i]-ymod;
 +    for (j=1;j<=mfit;j++) {
 +      wt=dyda[lista[j]]*sig2i;
 +      for (k=1;k<=j;k++)
 +	alpha[j][k] += wt*dyda[lista[k]];
 +      beta[j] += dy*wt;
 +    }
 +    (*chisq) += dy*dy*sig2i;
 +  }
 +  for (j=2;j<=mfit;j++)
 +    for (k=1;k<=j-1;k++) alpha[k][j]=alpha[j][k];
 +  free_vector(dyda,1);
  }
  
 -
 -gmx_bool mrqmin(real x[], real y[], real sig[], int ndata, real a[],
 -                int ma, int lista[], int mfit,
 -                real **covar, real **alpha, real *chisq,
 -                void (*funcs)(real, real *, real *, real *),
 -                real *alamda)
 +	
 +gmx_bool mrqmin(real x[], real y[], real sig[], int ndata, real a[], 
 +	    int ma, int lista[], int mfit, 
 +	    real **covar, real **alpha, real *chisq,
 +	    void (*funcs)(real,real *,real *,real *),
 +	    real *alamda) 
  {
 -    int          k, kk, j, ihit;
 -    static real *da, *atry, **oneda, *beta, ochisq;
 -
 -    if (*alamda < 0.0)
 -    {
 -        oneda = matrix1(1, mfit, 1, 1);
 -        atry  = rvector(1, ma);
 -        da    = rvector(1, ma);
 -        beta  = rvector(1, ma);
 -        kk    = mfit+1;
 -        for (j = 1; j <= ma; j++)
 -        {
 -            ihit = 0;
 -            for (k = 1; k <= mfit; k++)
 -            {
 -                if (lista[k] == j)
 -                {
 -                    ihit++;
 -                }
 -            }
 -            if (ihit == 0)
 -            {
 -                lista[kk++] = j;
 -            }
 -            else if (ihit > 1)
 -            {
 -                nrerror("Bad LISTA permutation in MRQMIN-1", FALSE);
 -                return FALSE;
 -            }
 -        }
 -        if (kk != ma+1)
 -        {
 -            nrerror("Bad LISTA permutation in MRQMIN-2", FALSE);
 -            return FALSE;
 -        }
 -        *alamda = 0.001;
 -        mrqcof(x, y, sig, ndata, a, ma, lista, mfit, alpha, beta, chisq, funcs);
 -        ochisq = (*chisq);
 -    }
 -    for (j = 1; j <= mfit; j++)
 -    {
 -        for (k = 1; k <= mfit; k++)
 -        {
 -            covar[j][k] = alpha[j][k];
 -        }
 -        covar[j][j] = alpha[j][j]*(1.0+(*alamda));
 -        oneda[j][1] = beta[j];
 -    }
 -    if (!gaussj(covar, mfit, oneda, 1))
 -    {
 -        return FALSE;
 -    }
 -    for (j = 1; j <= mfit; j++)
 -    {
 -        da[j] = oneda[j][1];
 -    }
 -    if (*alamda == 0.0)
 -    {
 -        covsrt(covar, ma, lista, mfit);
 -        free_vector(beta, 1);
 -        free_vector(da, 1);
 -        free_vector(atry, 1);
 -        free_matrix(oneda, 1, mfit, 1);
 -        return TRUE;
 -    }
 -    for (j = 1; j <= ma; j++)
 -    {
 -        atry[j] = a[j];
 -    }
 -    for (j = 1; j <= mfit; j++)
 -    {
 -        atry[lista[j]] = a[lista[j]]+da[j];
 -    }
 -    mrqcof(x, y, sig, ndata, atry, ma, lista, mfit, covar, da, chisq, funcs);
 -    if (*chisq < ochisq)
 -    {
 -        *alamda *= 0.1;
 -        ochisq   = (*chisq);
 -        for (j = 1; j <= mfit; j++)
 -        {
 -            for (k = 1; k <= mfit; k++)
 -            {
 -                alpha[j][k] = covar[j][k];
 -            }
 -            beta[j]     = da[j];
 -            a[lista[j]] = atry[lista[j]];
 -        }
 -    }
 -    else
 -    {
 -        *alamda *= 10.0;
 -        *chisq   = ochisq;
 -    }
 +  int k,kk,j,ihit;
 +  static real *da,*atry,**oneda,*beta,ochisq;
 +  
 +  if (*alamda < 0.0) {
 +    oneda=matrix1(1,mfit,1,1);
 +    atry=rvector(1,ma);
 +    da=rvector(1,ma);
 +    beta=rvector(1,ma);
 +    kk=mfit+1;
 +    for (j=1;j<=ma;j++) {
 +      ihit=0;
 +      for (k=1;k<=mfit;k++)
 +	if (lista[k] == j) ihit++;
 +      if (ihit == 0)
 +	lista[kk++]=j;
 +      else if (ihit > 1) {
 +	nrerror("Bad LISTA permutation in MRQMIN-1", FALSE);
 +	return FALSE;
 +      }
 +    }
 +    if (kk != ma+1) {
 +      nrerror("Bad LISTA permutation in MRQMIN-2", FALSE);
 +      return FALSE;
 +    }
 +    *alamda=0.001;
 +    mrqcof(x,y,sig,ndata,a,ma,lista,mfit,alpha,beta,chisq,funcs);
 +    ochisq=(*chisq);
 +  }
 +  for (j=1;j<=mfit;j++) {
 +    for (k=1;k<=mfit;k++) covar[j][k]=alpha[j][k];
 +    covar[j][j]=alpha[j][j]*(1.0+(*alamda));
 +    oneda[j][1]=beta[j];
 +  }
 +  if (!gaussj(covar,mfit,oneda,1))
 +    return FALSE;
 +  for (j=1;j<=mfit;j++)
 +    da[j]=oneda[j][1];
 +  if (*alamda == 0.0) {
 +    covsrt(covar,ma,lista,mfit);
 +    free_vector(beta,1);
 +    free_vector(da,1);
 +    free_vector(atry,1);
 +    free_matrix(oneda,1,mfit,1);
      return TRUE;
 +  }
 +  for (j=1;j<=ma;j++) atry[j]=a[j];
 +  for (j=1;j<=mfit;j++)
 +    atry[lista[j]] = a[lista[j]]+da[j];
 +  mrqcof(x,y,sig,ndata,atry,ma,lista,mfit,covar,da,chisq,funcs);
 +  if (*chisq < ochisq) {
 +    *alamda *= 0.1;
 +    ochisq=(*chisq);
 +    for (j=1;j<=mfit;j++) {
 +      for (k=1;k<=mfit;k++) alpha[j][k]=covar[j][k];
 +      beta[j]=da[j];
 +      a[lista[j]]=atry[lista[j]];
 +    }
 +  } else {
 +    *alamda *= 10.0;
 +    *chisq=ochisq;
 +  }
 +  return TRUE;
  }