From: Roland Schulz Date: Mon, 21 Jan 2013 01:12:10 +0000 (-0500) Subject: Merge release-4-6 into master X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=e8fdd15ab94a0315827c35d16c5946e1a7b86a02;p=alexxy%2Fgromacs.git Merge release-4-6 into master Conflicts: CMakeLists.txt (just version) src/gmxlib/confio.c (just whitspace) Moved gmx_thread_affinity.[ch]. Removed from it the visibility. Doesn't include copyright and unrustify commits Made by: git merge -s ours 95761d63fb705b4334acbe2ce6ae2f384b4bcadb git merge 66ff3d6e8bf65cce7b3008e65719e333da5e83e3 git merge -s ours 3339ffd6be2736541b928446de39ce51293449c6 git merge origin/release-4-6 tree=$(git log -1 HEAD --pretty=%T) git reset --hard $(git cat-file commit HEAD | sed '1,/^$/d' | \ git commit-tree $tree -p 03d5ecd -p bb6ea98) Change-Id: Ibce56da97e270cf3f399cfdc89684917afc96a33 --- e8fdd15ab94a0315827c35d16c5946e1a7b86a02 diff --cc src/gromacs/gmxlib/confio.c index 0af17e2bb4,0000000000..4167a51b50 mode 100644,000000..100644 --- a/src/gromacs/gmxlib/confio.c +++ b/src/gromacs/gmxlib/confio.c @@@ -1,1449 -1,0 +1,1450 @@@ +/* + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * GROningen Mixture of Alchemy and Childrens' Stories + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include "sysstuff.h" +#include "typedefs.h" +#include "smalloc.h" +#include "sysstuff.h" +#include +#include "macros.h" +#include "string2.h" +#include "confio.h" +#include "vec.h" +#include "symtab.h" +#include "futil.h" +#include "xdrf.h" +#include "filenm.h" +#include "pdbio.h" +#include "tpxio.h" +#include "gmx_fatal.h" +#include "copyrite.h" +#include "filenm.h" +#include "statutil.h" +#include "pbc.h" +#include "mtop_util.h" +#include "gmxfio.h" + +#define CHAR_SHIFT 24 + +static int read_g96_pos(char line[],t_symtab *symtab, + FILE *fp,const char *infile, + t_trxframe *fr) +{ + t_atoms *atoms; + gmx_bool bEnd; + int nwanted,natoms,atnr,resnr=0,oldres,newres,shift; + char anm[STRLEN],resnm[STRLEN]; + char c1,c2; + double db1,db2,db3; + + nwanted = fr->natoms; + + atoms = fr->atoms; + + natoms = 0; + + if (fr->bX) { + if (fr->bAtoms) + shift = CHAR_SHIFT; + else + shift = 0; + newres = -1; + oldres = -666; /* Unlikely number for the first residue! */ + bEnd = FALSE; + while (!bEnd && fgets2(line,STRLEN,fp)) { + bEnd = (strncmp(line,"END",3) == 0); + if (!bEnd && (line[0] != '#')) { + if (sscanf(line+shift,"%15lf%15lf%15lf",&db1,&db2,&db3) != 3) + gmx_fatal(FARGS,"Did not find 3 coordinates for atom %d in %s\n", + natoms+1,infile); + if ((nwanted != -1) && (natoms >= nwanted)) + gmx_fatal(FARGS, + "Found more coordinates (%d) in %s than expected %d\n", + natoms,infile,nwanted); + if (atoms) { + if (fr->bAtoms && + (sscanf(line,"%5d%c%5s%c%5s%7d",&resnr,&c1,resnm,&c2,anm,&atnr) + != 6)) { + if (oldres>=0) + resnr = oldres; + else { + resnr = 1; + strncpy(resnm,"???",sizeof(resnm)-1); + } + strncpy(anm,"???",sizeof(anm)-1); + } + atoms->atomname[natoms]=put_symtab(symtab,anm); + if (resnr != oldres) { + oldres = resnr; + newres++; + if (newres >= atoms->nr) + gmx_fatal(FARGS,"More residues than atoms in %s (natoms = %d)", + infile,atoms->nr); + atoms->atom[natoms].resind = newres; + if (newres+1 > atoms->nres) { + atoms->nres = newres+1; + } + t_atoms_set_resinfo(atoms,natoms,symtab,resnm,resnr,' ',0,' '); + } else { + atoms->atom[natoms].resind = newres; + } + } + if (fr->x) { + fr->x[natoms][0] = db1; + fr->x[natoms][1] = db2; + fr->x[natoms][2] = db3; + } + natoms++; + } + } + if ((nwanted != -1) && natoms != nwanted) + fprintf(stderr, + "Warning: found less coordinates (%d) in %s than expected %d\n", + natoms,infile,nwanted); + } + + fr->natoms = natoms; + + return natoms; +} + +static int read_g96_vel(char line[],FILE *fp,const char *infile, + t_trxframe *fr) +{ + gmx_bool bEnd; + int nwanted,natoms=-1,shift; + double db1,db2,db3; + + nwanted = fr->natoms; + + if (fr->v && fr->bV) { + if (strcmp(line,"VELOCITYRED") == 0) + shift = 0; + else + shift = CHAR_SHIFT; + natoms = 0; + bEnd = FALSE; + while (!bEnd && fgets2(line,STRLEN,fp)) { + bEnd = (strncmp(line,"END",3) == 0); + if (!bEnd && (line[0] != '#')) { + if (sscanf(line+shift,"%15lf%15lf%15lf",&db1,&db2,&db3) != 3) + gmx_fatal(FARGS,"Did not find 3 velocities for atom %d in %s", + natoms+1,infile); + if ((nwanted != -1) && (natoms >= nwanted)) + gmx_fatal(FARGS,"Found more velocities (%d) in %s than expected %d\n", + natoms,infile,nwanted); + if (fr->v) { + fr->v[natoms][0] = db1; + fr->v[natoms][1] = db2; + fr->v[natoms][2] = db3; + } + natoms++; + } + } + if ((nwanted != -1) && (natoms != nwanted)) + fprintf(stderr, + "Warning: found less velocities (%d) in %s than expected %d\n", + natoms,infile,nwanted); + } + + return natoms; +} + +int read_g96_conf(FILE *fp,const char *infile,t_trxframe *fr, char *line) +{ + t_symtab *symtab=NULL; + gmx_bool bAtStart,bTime,bAtoms,bPos,bVel,bBox,bEnd,bFinished; + int natoms,nbp; + double db1,db2,db3,db4,db5,db6,db7,db8,db9; + + bAtStart = (ftell(fp) == 0); + + clear_trxframe(fr,FALSE); + + if (!symtab) { + snew(symtab,1); + open_symtab(symtab); + } + + natoms=0; + + if (bAtStart) { + while ( !fr->bTitle && fgets2(line,STRLEN,fp)) + fr->bTitle = (strcmp(line,"TITLE") == 0); + if (fr->title == NULL) { + fgets2(line,STRLEN,fp); + fr->title = strdup(line); + } + bEnd = FALSE; + while (!bEnd && fgets2(line,STRLEN,fp)) + bEnd = (strcmp(line,"END") == 0); + fgets2(line,STRLEN,fp); + } + + /* Do not get a line if we are not at the start of the file, * + * because without a parameter file we don't know what is in * + * the trajectory and we have already read the line in the * + * previous call (VERY DIRTY). */ + bFinished = FALSE; + do { + bTime = (strcmp(line,"TIMESTEP") == 0); + bAtoms = (strcmp(line,"POSITION") == 0); + bPos = (bAtoms || (strcmp(line,"POSITIONRED") == 0)); + bVel = (strncmp(line,"VELOCITY",8) == 0); + bBox = (strcmp(line,"BOX") == 0); + if (bTime) { + if (!fr->bTime && !fr->bX) { + fr->bStep = bTime; + fr->bTime = bTime; + do + bFinished = (fgets2(line,STRLEN,fp) == NULL); + while (!bFinished && (line[0] == '#')); + sscanf(line,"%15d%15lf",&(fr->step),&db1); + fr->time = db1; + } else + bFinished = TRUE; + } + if (bPos) { + if (!fr->bX) { + fr->bAtoms = bAtoms; + fr->bX = bPos; + natoms = read_g96_pos(line,symtab,fp,infile,fr); + } else + bFinished = TRUE; + } + if (fr->v && bVel) { + fr->bV = bVel; + natoms = read_g96_vel(line,fp,infile,fr); + } + if (bBox) { + fr->bBox = bBox; + clear_mat(fr->box); + bEnd = FALSE; + while (!bEnd && fgets2(line,STRLEN,fp)) { + bEnd = (strncmp(line,"END",3) == 0); + if (!bEnd && (line[0] != '#')) { + nbp = sscanf(line,"%15lf%15lf%15lf%15lf%15lf%15lf%15lf%15lf%15lf", + &db1,&db2,&db3,&db4,&db5,&db6,&db7,&db8,&db9); + if (nbp < 3) + gmx_fatal(FARGS,"Found a BOX line, but no box in %s",infile); + fr->box[XX][XX] = db1; + fr->box[YY][YY] = db2; + fr->box[ZZ][ZZ] = db3; + if (nbp == 9) { + fr->box[XX][YY] = db4; + fr->box[XX][ZZ] = db5; + fr->box[YY][XX] = db6; + fr->box[YY][ZZ] = db7; + fr->box[ZZ][XX] = db8; + fr->box[ZZ][YY] = db9; + } + } + } + bFinished = TRUE; + } + } while (!bFinished && fgets2(line,STRLEN,fp)); + + free_symtab(symtab); + + fr->natoms = natoms; + + return natoms; +} + +void write_g96_conf(FILE *out,t_trxframe *fr, + int nindex,const atom_id *index) +{ + t_atoms *atoms; + int nout,i,a; + + atoms = fr->atoms; + + if (index) + nout = nindex; + else + nout = fr->natoms; + + if (fr->bTitle) + fprintf(out,"TITLE\n%s\nEND\n",fr->title); + if (fr->bStep || fr->bTime) + /* Officially the time format is %15.9, which is not enough for 10 ns */ + fprintf(out,"TIMESTEP\n%15d%15.6f\nEND\n",fr->step,fr->time); + if (fr->bX) { + if (fr->bAtoms) { + fprintf(out,"POSITION\n"); + for(i=0; iresinfo[atoms->atom[a].resind].nr) % 100000, + *atoms->resinfo[atoms->atom[a].resind].name, + *atoms->atomname[a],(i+1) % 10000000, + fr->x[a][XX],fr->x[a][YY],fr->x[a][ZZ]); + } + } else { + fprintf(out,"POSITIONRED\n"); + for(i=0; ix[a][XX],fr->x[a][YY],fr->x[a][ZZ]); + } + } + fprintf(out,"END\n"); + } + if (fr->bV) { + if (fr->bAtoms) { + fprintf(out,"VELOCITY\n"); + for(i=0; iresinfo[atoms->atom[a].resind].nr) % 100000, + *atoms->resinfo[atoms->atom[a].resind].name, + *atoms->atomname[a],(i+1) % 10000000, + fr->v[a][XX],fr->v[a][YY],fr->v[a][ZZ]); + } + } else { + fprintf(out,"VELOCITYRED\n"); + for(i=0; iv[a][XX],fr->v[a][YY],fr->v[a][ZZ]); + } + } + fprintf(out,"END\n"); + } + if (fr->bBox) { + fprintf(out,"BOX\n"); + fprintf(out,"%15.9f%15.9f%15.9f", + fr->box[XX][XX],fr->box[YY][YY],fr->box[ZZ][ZZ]); + if (fr->box[XX][YY] || fr->box[XX][ZZ] || fr->box[YY][XX] || + fr->box[YY][ZZ] || fr->box[ZZ][XX] ||fr->box[ZZ][YY]) + fprintf(out,"%15.9f%15.9f%15.9f%15.9f%15.9f%15.9f", + fr->box[XX][YY],fr->box[XX][ZZ],fr->box[YY][XX], + fr->box[YY][ZZ],fr->box[ZZ][XX],fr->box[ZZ][YY]); + fprintf(out,"\n"); + fprintf(out,"END\n"); + } +} + +static int get_espresso_word(FILE *fp,char word[]) +{ + int ret,nc,i; + + ret = 0; + nc = 0; + + do { + i = fgetc(fp); + if (i != EOF) { + if (i == ' ' || i == '\n' || i == '\t') { + if (nc > 0) + ret = 1; + } else if (i == '{') { + if (nc == 0) + word[nc++] = '{'; + ret = 2; + } else if (i == '}') { + if (nc == 0) + word[nc++] = '}'; + ret = 3; + } else { + word[nc++] = (char)i; + } + } + } while (i != EOF && ret == 0); + + word[nc] = '\0'; + + /* printf("'%s'\n",word); */ + + return ret; +} + +static int check_open_parenthesis(FILE *fp,int r, + const char *infile,const char *keyword) +{ + int level_inc; + char word[STRLEN]; + + level_inc = 0; + if (r == 2) { + level_inc++; + } else { + r = get_espresso_word(fp,word); + if (r == 2) + level_inc++; + else + gmx_fatal(FARGS,"Expected '{' after '%s' in file '%s'", + keyword,infile); + } + + return level_inc; +} + +static int check_close_parenthesis(FILE *fp,int r, + const char *infile,const char *keyword) +{ + int level_inc; + char word[STRLEN]; + + level_inc = 0; + if (r == 3) { + level_inc--; + } else { + r = get_espresso_word(fp,word); + if (r == 3) + level_inc--; + else + gmx_fatal(FARGS,"Expected '}' after section '%s' in file '%s'", + keyword,infile); + } + + return level_inc; +} + +enum { espID, espPOS, espTYPE, espQ, espV, espF, espMOLECULE, espNR }; +const char *esp_prop[espNR] = { "id", "pos", "type", "q", "v", "f", + "molecule" }; + +static void read_espresso_conf(const char *infile, + t_atoms *atoms,rvec x[],rvec *v,matrix box) +{ + t_symtab *symtab=NULL; + FILE *fp; + char word[STRLEN],buf[STRLEN]; + int natoms,level,npar,r,nprop,p,i,m,molnr; + int prop[32]; + double d; + gmx_bool bFoundParticles,bFoundProp,bFoundVariable,bMol; + + if (!symtab) { + snew(symtab,1); + open_symtab(symtab); + } + + clear_mat(box); + + fp = gmx_fio_fopen(infile,"r"); + + bFoundParticles = FALSE; + bFoundVariable = FALSE; + bMol = FALSE; + level = 0; + while ((r=get_espresso_word(fp,word))) { + if (level==1 && strcmp(word,"particles")==0 && !bFoundParticles) { + bFoundParticles = TRUE; + level += check_open_parenthesis(fp,r,infile,"particles"); + nprop = 0; + while (level == 2 && (r=get_espresso_word(fp,word))) { + bFoundProp = FALSE; + for(p=0; p 0 && (r=get_espresso_word(fp,word))) { + if (r == 2) { + level++; + } else if (r == 3) { + level--; + } + if (level == 2) { + for(p=0; patom[i].type = strtol(word, NULL, 10); + break; + case espQ: + r = get_espresso_word(fp,word); + sscanf(word,"%lf",&d); + atoms->atom[i].q = d; + break; + case espV: + for(m=0; m<3; m++) { + r = get_espresso_word(fp,word); + sscanf(word,"%lf",&d); + v[i][m] = d; + } + break; + case espF: + for(m=0; m<3; m++) { + r = get_espresso_word(fp,word); + /* not used */ + } + break; + case espMOLECULE: + r = get_espresso_word(fp,word); + molnr = strtol(word, NULL, 10); + if (i == 0 || + atoms->resinfo[atoms->atom[i-1].resind].nr != molnr) { + atoms->atom[i].resind = + (i == 0 ? 0 : atoms->atom[i-1].resind+1); + atoms->resinfo[atoms->atom[i].resind].nr = molnr; + atoms->resinfo[atoms->atom[i].resind].ic = ' '; + atoms->resinfo[atoms->atom[i].resind].chainid = ' '; + atoms->resinfo[atoms->atom[i].resind].chainnum = molnr; /* Not sure if this is right? */ + } else { + atoms->atom[i].resind = atoms->atom[i-1].resind; + } + break; + } + } + /* Generate an atom name from the particle type */ + sprintf(buf,"T%d",atoms->atom[i].type); + atoms->atomname[i] = put_symtab(symtab,buf); + if (bMol) { + if (i == 0 || atoms->atom[i].resind != atoms->atom[i-1].resind) { + atoms->resinfo[atoms->atom[i].resind].name = + put_symtab(symtab,"MOL"); + } + } else { + /* Residue number is the atom number */ + atoms->atom[i].resind = i; + /* Generate an residue name from the particle type */ + if (atoms->atom[i].type < 26) { + sprintf(buf,"T%c",'A'+atoms->atom[i].type); + } else { + sprintf(buf,"T%c%c", + 'A'+atoms->atom[i].type/26,'A'+atoms->atom[i].type%26); + } + t_atoms_set_resinfo(atoms,i,symtab,buf,i,' ',0,' '); + } + + if (r == 3) + level--; + i++; + } + } + atoms->nres = atoms->nr; + + if (i != atoms->nr) { + gmx_fatal(FARGS,"Internal inconsistency in Espresso routines, read %d atoms, expected %d atoms",i,atoms->nr); + } + } else if (level==1 && strcmp(word,"variable")==0 && !bFoundVariable) { + bFoundVariable = TRUE; + level += check_open_parenthesis(fp,r,infile,"variable"); + while (level==2 && (r=get_espresso_word(fp,word))) { + if (level==2 && strcmp(word,"box_l") == 0) { + for(m=0; m<3; m++) { + r = get_espresso_word(fp,word); + sscanf(word,"%lf",&d); + box[m][m] = d; + } + level += check_close_parenthesis(fp,r,infile,"box_l"); + } + } + } else if (r == 2) { + level++; + } else if (r == 3) { + level--; + } + } + + if (!bFoundParticles) { + fprintf(stderr,"Did not find a particles section in Espresso file '%s'\n", + infile); + } + + gmx_fio_fclose(fp); +} + +static int get_espresso_coordnum(const char *infile) +{ + FILE *fp; + char word[STRLEN]; + int natoms,level,r; + gmx_bool bFoundParticles; + + natoms = 0; + + fp = gmx_fio_fopen(infile,"r"); + + bFoundParticles = FALSE; + level = 0; + while ((r=get_espresso_word(fp,word)) && !bFoundParticles) { + if (level==1 && strcmp(word,"particles")==0 && !bFoundParticles) { + bFoundParticles = TRUE; + level += check_open_parenthesis(fp,r,infile,"particles"); + while (level > 0 && (r=get_espresso_word(fp,word))) { + if (r == 2) { + level++; + if (level == 2) + natoms++; + } else if (r == 3) { + level--; + } + } + } else if (r == 2) { + level++; + } else if (r == 3) { + level--; + } + } + if (!bFoundParticles) { + fprintf(stderr,"Did not find a particles section in Espresso file '%s'\n", + infile); + } + + gmx_fio_fclose(fp); + + return natoms; +} + +static void write_espresso_conf_indexed(FILE *out,const char *title, + t_atoms *atoms,int nx,atom_id *index, + rvec *x,rvec *v,matrix box) +{ + int i,j; + + fprintf(out,"# %s\n",title); + if (TRICLINIC(box)) { + gmx_warning("The Espresso format does not support triclinic unit-cells"); + } + fprintf(out,"{variable {box_l %f %f %f}}\n",box[0][0],box[1][1],box[2][2]); + + fprintf(out,"{particles {id pos type q%s}\n",v ? " v" : ""); + for(i=0; iatom[j].type,atoms->atom[j].q); + if (v) + fprintf(out," %f %f %f",v[j][XX],v[j][YY],v[j][ZZ]); + fprintf(out,"}\n"); + } + fprintf(out,"}\n"); +} + +static void get_coordnum_fp (FILE *in, char *title, int *natoms) +{ + char line[STRLEN+1]; + + fgets2 (title,STRLEN,in); + fgets2 (line,STRLEN,in); + if (sscanf (line,"%d",natoms) != 1) { + gmx_fatal(FARGS,"gro file does not have the number of atoms on the second line"); + } +} + +static void get_coordnum (const char *infile,int *natoms) +{ + FILE *in; + char title[STRLEN]; + + in=gmx_fio_fopen(infile,"r"); + get_coordnum_fp(in,title,natoms); + gmx_fio_fclose (in); +} + +static gmx_bool get_w_conf(FILE *in, const char *infile, char *title, + t_symtab *symtab, t_atoms *atoms, int *ndec, + rvec x[], rvec *v, matrix box) +{ + char name[6]; + char line[STRLEN+1],*ptr; + char buf[256]; + double x1,y1,z1,x2,y2,z2; + rvec xmin,xmax; + int natoms,i,m,resnr,newres,oldres,ddist,c; + gmx_bool bFirst,bVel; + char *p1,*p2,*p3; + + newres = -1; + oldres = NOTSET; /* Unlikely number for the first residue! */ + ddist = 0; + + /* Read the title and number of atoms */ + get_coordnum_fp(in,title,&natoms); + + if (natoms > atoms->nr) + gmx_fatal(FARGS,"gro file contains more atoms (%d) than expected (%d)", + natoms,atoms->nr); + else if (natoms < atoms->nr) + fprintf(stderr,"Warning: gro file contains less atoms (%d) than expected" + " (%d)\n",natoms,atoms->nr); + + bFirst=TRUE; + + bVel = FALSE; + + /* just pray the arrays are big enough */ + for (i=0; (i < natoms) ; i++) { + if ((fgets2 (line,STRLEN,in)) == NULL) { + unexpected_eof(infile,i+2); + } + if (strlen(line) < 39) + gmx_fatal(FARGS,"Invalid line in %s for atom %d:\n%s",infile,i+1,line); + + /* determine read precision from distance between periods + (decimal points) */ + if (bFirst) { + bFirst=FALSE; + p1=strchr(line,'.'); + if (p1 == NULL) + gmx_fatal(FARGS,"A coordinate in file %s does not contain a '.'",infile); + p2=strchr(&p1[1],'.'); + if (p2 == NULL) + gmx_fatal(FARGS,"A coordinate in file %s does not contain a '.'",infile); + ddist = p2 - p1; + *ndec = ddist - 5; + + p3=strchr(&p2[1],'.'); + if (p3 == NULL) + gmx_fatal(FARGS,"A coordinate in file %s does not contain a '.'",infile); + + if (p3 - p2 != ddist) + gmx_fatal(FARGS,"The spacing of the decimal points in file %s is not consistent for x, y and z",infile); + } + + /* residue number*/ + memcpy(name,line,5); + name[5]='\0'; + sscanf(name,"%d",&resnr); + memcpy(name,line+5,5); + name[5]='\0'; + if (resnr != oldres) { + oldres = resnr; + newres++; + if (newres >= natoms) + gmx_fatal(FARGS,"More residues than atoms in %s (natoms = %d)", + infile,natoms); + atoms->atom[i].resind = newres; + t_atoms_set_resinfo(atoms,i,symtab,name,resnr,' ',0,' '); + } else { + atoms->atom[i].resind = newres; + } + + /* atomname */ + memcpy(name,line+10,5); + atoms->atomname[i]=put_symtab(symtab,name); + + /* eventueel controle atomnumber met i+1 */ + + /* coordinates (start after residue shit) */ + ptr = line + 20; + /* Read fixed format */ + for(m=0; mnres = newres + 1; + + /* box */ + fgets2 (line,STRLEN,in); + if (sscanf (line,"%lf%lf%lf",&x1,&y1,&z1) != 3) { + gmx_warning("Bad box in file %s",infile); + + /* Generate a cubic box */ + for(m=0; (mnr); i++) + for(m=0; (mnatoms; + snew(atoms.atom,fr->natoms); + atoms.nres=fr->natoms; + snew(atoms.resinfo,fr->natoms); + snew(atoms.atomname,fr->natoms); + + fr->bV = get_w_conf(status,title,title,&symtab,&atoms,&ndec,fr->x,fr->v,fr->box); + fr->bPrec = TRUE; + fr->prec = 1; + /* prec = 10^ndec: */ + for(i=0; iprec *= 10; + fr->title = title; + fr->bTitle = TRUE; + fr->bX = TRUE; + fr->bBox = TRUE; + + sfree(atoms.atom); + sfree(atoms.resinfo); + sfree(atoms.atomname); + done_symtab(&symtab); + + if ((p=strstr(title,"t=")) != NULL) { + p+=2; + if (sscanf(p,"%lf",&tt)==1) { + fr->time = tt; + fr->bTime = TRUE; + } else { + fr->time = 0; + fr->bTime = FALSE; + } + } + + if (atoms.nr != fr->natoms) + gmx_fatal(FARGS,"Number of atoms in gro frame (%d) doesn't match the number in the previous frame (%d)",atoms.nr,fr->natoms); + + return TRUE; +} + +int gro_first_x_or_v(FILE *status,t_trxframe *fr) +{ + char title[STRLEN]; + + frewind(status); + fprintf(stderr,"Reading frames from gro file"); + get_coordnum_fp(status, title, &fr->natoms); + frewind(status); + fprintf(stderr," '%s', %d atoms.\n",title, fr->natoms); + fr->bTitle = TRUE; + fr->title = title; + if (fr->natoms==0) + gmx_file("No coordinates in gro file"); + + snew(fr->x,fr->natoms); + snew(fr->v,fr->natoms); + gro_next_x_or_v(status, fr); + + return fr->natoms; +} + +static void make_hconf_format(int pr,gmx_bool bVel,char format[]) +{ + int l,vpr; + + /* build format string for printing, + something like "%8.3f" for x and "%8.4f" for v */ + if (pr<0) + pr=0; + if (pr>30) + pr=30; + l=pr+5; + vpr=pr+1; + if (bVel) + sprintf(format,"%%%d.%df%%%d.%df%%%d.%df%%%d.%df%%%d.%df%%%d.%df\n", + l,pr,l,pr,l,pr,l,vpr,l,vpr,l,vpr); + else + sprintf(format,"%%%d.%df%%%d.%df%%%d.%df\n",l,pr,l,pr,l,pr); + +} + +static void write_hconf_box(FILE *out,int pr,matrix box) +{ + char format[100]; + int l; + + if (pr<5) + pr=5; + l=pr+5; + + if (box[XX][YY] || box[XX][ZZ] || box[YY][XX] || box[YY][ZZ] || + box[ZZ][XX] || box[ZZ][YY]) { + sprintf(format,"%%%d.%df%%%d.%df%%%d.%df" + "%%%d.%df%%%d.%df%%%d.%df%%%d.%df%%%d.%df%%%d.%df\n", + l,pr,l,pr,l,pr,l,pr,l,pr,l,pr,l,pr,l,pr,l,pr); + fprintf(out,format, + box[XX][XX],box[YY][YY],box[ZZ][ZZ], + box[XX][YY],box[XX][ZZ],box[YY][XX], + box[YY][ZZ],box[ZZ][XX],box[ZZ][YY]); + } else { + sprintf(format,"%%%d.%df%%%d.%df%%%d.%df\n",l,pr,l,pr,l,pr); + fprintf(out,format, + box[XX][XX],box[YY][YY],box[ZZ][ZZ]); + } +} + +void write_hconf_indexed_p(FILE *out,const char *title,t_atoms *atoms, + int nx,const atom_id index[], int pr, + rvec *x,rvec *v,matrix box) +{ + char resnm[6],nm[6],format[100]; + int ai,i,resind,resnr; + + bromacs(format,99); + fprintf (out,"%s\n",(title && title[0])?title:format); + fprintf (out,"%5d\n",nx); + + make_hconf_format(pr,v!=NULL,format); + + for (i=0; (iatom[ai].resind; + strncpy(resnm," ??? ",sizeof(resnm)-1); + if (resind < atoms->nres) { + strncpy(resnm,*atoms->resinfo[resind].name,sizeof(resnm)-1); + resnr = atoms->resinfo[resind].nr; + } else { + strncpy(resnm," ??? ",sizeof(resnm)-1); + resnr = resind + 1; + } + + if (atoms->atom) + strncpy(nm,*atoms->atomname[ai],sizeof(nm)-1); + else + strncpy(nm," ??? ",sizeof(nm)-1); + + fprintf(out,"%5d%-5.5s%5.5s%5d",resnr%100000,resnm,nm,(ai+1)%100000); + /* next fprintf uses built format string */ + if (v) + fprintf(out,format, + x[ai][XX], x[ai][YY], x[ai][ZZ], v[ai][XX],v[ai][YY],v[ai][ZZ]); + else + fprintf(out,format, + x[ai][XX], x[ai][YY], x[ai][ZZ]); + } + + write_hconf_box(out,pr,box); + + fflush(out); +} + +static void write_hconf_mtop(FILE *out,const char *title,gmx_mtop_t *mtop, + int pr, + rvec *x,rvec *v,matrix box) +{ + char format[100]; + int i,resnr; + gmx_mtop_atomloop_all_t aloop; + t_atom *atom; + char *atomname,*resname; + + bromacs(format,99); + fprintf (out,"%s\n",(title && title[0])?title:format); + fprintf (out,"%5d\n",mtop->natoms); + + make_hconf_format(pr,v!=NULL,format); + + aloop = gmx_mtop_atomloop_all_init(mtop); + while (gmx_mtop_atomloop_all_next(aloop,&i,&atom)) { + gmx_mtop_atomloop_all_names(aloop,&atomname,&resnr,&resname); + + fprintf(out,"%5d%-5.5s%5.5s%5d", + resnr%100000,resname,atomname,(i+1)%100000); + /* next fprintf uses built format string */ + if (v) + fprintf(out,format, + x[i][XX], x[i][YY], x[i][ZZ], v[i][XX],v[i][YY],v[i][ZZ]); + else + fprintf(out,format, + x[i][XX], x[i][YY], x[i][ZZ]); + } + + write_hconf_box(out,pr,box); + + fflush(out); +} + +void write_hconf_p(FILE *out,const char *title,t_atoms *atoms, int pr, + rvec *x,rvec *v,matrix box) +{ + atom_id *aa; + int i; + + snew(aa,atoms->nr); + for(i=0; (inr); i++) + aa[i]=i; + write_hconf_indexed_p(out,title,atoms,atoms->nr,aa,pr,x,v,box); + sfree(aa); +} + +void write_conf_p(const char *outfile, const char *title, + t_atoms *atoms, int pr, + rvec *x, rvec *v,matrix box) +{ + FILE *out; + + out=gmx_fio_fopen(outfile,"w"); + write_hconf_p(out,title,atoms,pr,x,v,box); + + gmx_fio_fclose (out); +} + +static void write_conf(const char *outfile, const char *title, t_atoms *atoms, + rvec *x, rvec *v,matrix box) +{ + write_conf_p(outfile, title, atoms, 3, x, v, box); +} + +void write_sto_conf_indexed(const char *outfile,const char *title, + t_atoms *atoms, + rvec x[],rvec *v,int ePBC,matrix box, + atom_id nindex,atom_id index[]) +{ + FILE *out; + int ftp; + t_trxframe fr; + + ftp=fn2ftp(outfile); + switch (ftp) { + case efGRO: + out=gmx_fio_fopen(outfile,"w"); + write_hconf_indexed_p(out, title, atoms, nindex, index, 3, x, v, box); + gmx_fio_fclose(out); + break; + case efG96: + clear_trxframe(&fr,TRUE); + fr.bTitle = TRUE; + fr.title = title; + fr.natoms = atoms->nr; + fr.bAtoms = TRUE; + fr.atoms = atoms; + fr.bX = TRUE; + fr.x = x; + if (v) { + fr.bV = TRUE; + fr.v = v; + } + fr.bBox = TRUE; + copy_mat(box,fr.box); + out=gmx_fio_fopen(outfile,"w"); + write_g96_conf(out, &fr, nindex, index); + gmx_fio_fclose(out); + break; + case efPDB: + case efBRK: + case efENT: + case efPQR: + out=gmx_fio_fopen(outfile,"w"); + write_pdbfile_indexed(out,title,atoms,x,ePBC,box,' ',-1,nindex,index,NULL,TRUE); + gmx_fio_fclose(out); + break; + case efESP: + out=gmx_fio_fopen(outfile,"w"); + write_espresso_conf_indexed(out, title, atoms, nindex, index, x, v, box); + gmx_fio_fclose(out); + break; + case efTPR: + case efTPB: + case efTPA: + gmx_fatal(FARGS,"Sorry, can not write a topology to %s",outfile); + break; + default: + gmx_incons("Not supported in write_sto_conf_indexed"); + } +} + +static void write_xyz_conf(const char *outfile,const char *title, + t_atoms *atoms,rvec *x) +{ + FILE *fp; + int i,anr; + real value; + char *ptr,*name; + gmx_atomprop_t aps=gmx_atomprop_init(); + + fp = gmx_fio_fopen(outfile,"w"); + fprintf(fp,"%3d\n",atoms->nr); + fprintf(fp,"%s\n",title); + for(i=0; (inr); i++) { + anr = atoms->atom[i].atomnumber; + name = *atoms->atomname[i]; + if (anr == NOTSET) { + if (gmx_atomprop_query(aps,epropElement,"???",name,&value)) + anr = gmx_nint(value); + } + if ((ptr = gmx_atomprop_element(aps,anr)) == NULL) + ptr = name; + fprintf(fp,"%3s%10.5f%10.5f%10.5f\n",ptr, + 10*x[i][XX],10*x[i][YY],10*x[i][ZZ]); + } + gmx_fio_fclose(fp); + gmx_atomprop_destroy(aps); +} + +void write_sto_conf(const char *outfile,const char *title,t_atoms *atoms, + rvec x[],rvec *v,int ePBC,matrix box) +{ + FILE *out; + int ftp; + t_trxframe fr; + + ftp=fn2ftp(outfile); + switch (ftp) { + case efGRO: + write_conf(outfile, title, atoms, x, v, box); + break; + case efG96: + clear_trxframe(&fr,TRUE); + fr.bTitle = TRUE; + fr.title = title; + fr.natoms = atoms->nr; + fr.bAtoms = TRUE; + fr.atoms = atoms; + fr.bX = TRUE; + fr.x = x; + if (v) { + fr.bV = TRUE; + fr.v = v; + } + fr.bBox = TRUE; + copy_mat(box,fr.box); + out=gmx_fio_fopen(outfile,"w"); + write_g96_conf(out, &fr, -1, NULL); + gmx_fio_fclose(out); + break; + case efXYZ: + write_xyz_conf(outfile,(strlen(title) > 0) ? title : outfile,atoms,x); + break; + case efPDB: + case efBRK: + case efENT: + out=gmx_fio_fopen(outfile,"w"); + write_pdbfile(out, title, atoms, x, ePBC, box, ' ', -1,NULL,TRUE); + gmx_fio_fclose(out); + break; + case efESP: + out=gmx_fio_fopen(outfile,"w"); + write_espresso_conf_indexed(out, title, atoms, atoms->nr, NULL, x, v, box); + gmx_fio_fclose(out); + break; + case efTPR: + case efTPB: + case efTPA: + gmx_fatal(FARGS,"Sorry, can not write a topology to %s",outfile); + break; + default: + gmx_incons("Not supported in write_sto_conf"); + } +} + +void write_sto_conf_mtop(const char *outfile,const char *title, + gmx_mtop_t *mtop, + rvec x[],rvec *v,int ePBC,matrix box) +{ + int ftp; + FILE *out; + t_atoms atoms; + + ftp=fn2ftp(outfile); + switch (ftp) { + case efGRO: + out = gmx_fio_fopen(outfile,"w"); + write_hconf_mtop(out,title,mtop,3,x,v,box); + gmx_fio_fclose(out); + break; + default: + /* This is a brute force approach which requires a lot of memory. + * We should implement mtop versions of all writing routines. + */ + atoms = gmx_mtop_global_atoms(mtop); + + write_sto_conf(outfile,title,&atoms,x,v,ePBC,box); + + done_atom(&atoms); + break; + } +} + +static int get_xyz_coordnum(const char *infile) +{ + FILE *fp; + int n; + + fp = gmx_fio_fopen(infile,"r"); + if (fscanf(fp,"%d",&n) != 1) + gmx_fatal(FARGS,"Can not read number of atoms from %s",infile); + gmx_fio_fclose(fp); + + return n; +} + +static void read_xyz_conf(const char *infile,char *title, + t_atoms *atoms,rvec *x) +{ + FILE *fp; + int i,n; + double xx,yy,zz; + t_symtab *tab; + char atomnm[32],buf[STRLEN]; + + snew(tab,1); + fp = gmx_fio_fopen(infile,"r"); + fgets2(buf,STRLEN-1,fp); + if (sscanf(buf,"%d",&n) != 1) + gmx_fatal(FARGS,"Can not read number of atoms from %s",infile); + fgets2(buf,STRLEN-1,fp); + strcpy(title,buf); + for(i=0; (iatomname[i] = put_symtab(tab,atomnm); + x[i][XX] = xx*0.1; + x[i][YY] = yy*0.1; + x[i][ZZ] = zz*0.1; + } + gmx_fio_fclose(fp); +} + +void get_stx_coordnum(const char *infile,int *natoms) +{ + FILE *in; + int ftp,tpxver,tpxgen; + t_trxframe fr; + char g96_line[STRLEN+1]; + + ftp=fn2ftp(infile); + range_check(ftp,0,efNR); + switch (ftp) { + case efGRO: + get_coordnum(infile, natoms); + break; + case efG96: + in=gmx_fio_fopen(infile,"r"); + fr.title = NULL; + fr.natoms = -1; + fr.atoms = NULL; + fr.x = NULL; + fr.v = NULL; + fr.f = NULL; + *natoms=read_g96_conf(in,infile,&fr,g96_line); + gmx_fio_fclose(in); + break; + case efXYZ: + *natoms = get_xyz_coordnum(infile); + break; + case efPDB: + case efBRK: + case efENT: + in=gmx_fio_fopen(infile,"r"); + get_pdb_coordnum(in, natoms); + gmx_fio_fclose(in); + break; + case efESP: + *natoms = get_espresso_coordnum(infile); + break; + case efTPA: + case efTPB: + case efTPR: { + t_tpxheader tpx; + + read_tpxheader(infile,&tpx,TRUE,&tpxver,&tpxgen); + *natoms = tpx.natoms; + break; + } + default: + gmx_fatal(FARGS,"File type %s not supported in get_stx_coordnum", + ftp2ext(ftp)); + } +} + +void read_stx_conf(const char *infile,char *title,t_atoms *atoms, + rvec x[],rvec *v,int *ePBC,matrix box) +{ + FILE *in; + char buf[256]; + gmx_mtop_t *mtop; + t_topology top; + t_trxframe fr; + int i,ftp,natoms; + real d; + char g96_line[STRLEN+1]; + + if (atoms->nr == 0) + fprintf(stderr,"Warning: Number of atoms in %s is 0\n",infile); + else if (atoms->atom == NULL) + gmx_mem("Uninitialized array atom"); + + if (ePBC) + *ePBC = -1; + + ftp=fn2ftp(infile); + switch (ftp) { + case efGRO: + read_whole_conf(infile, title, atoms, x, v, box); + break; + case efXYZ: + read_xyz_conf(infile,title,atoms,x); + break; + case efG96: - fr.title = title; ++ fr.title = NULL; + fr.natoms = atoms->nr; + fr.atoms = atoms; + fr.x = x; + fr.v = v; + fr.f = NULL; + in = gmx_fio_fopen(infile,"r"); + read_g96_conf(in, infile, &fr, g96_line); + gmx_fio_fclose(in); + copy_mat(fr.box,box); ++ strncpy(title, fr.title, STRLEN); + break; + case efPDB: + case efBRK: + case efENT: + read_pdb_conf(infile, title, atoms, x, ePBC, box, TRUE, NULL); + break; + case efESP: + read_espresso_conf(infile,atoms,x,v,box); + break; + case efTPR: + case efTPB: + case efTPA: + snew(mtop,1); + i = read_tpx(infile,NULL,box,&natoms,x,v,NULL,mtop); + if (ePBC) + *ePBC = i; + + strcpy(title,*(mtop->name)); + + /* Free possibly allocated memory */ + done_atom(atoms); + + *atoms = gmx_mtop_global_atoms(mtop); + top = gmx_mtop_t_to_t_topology(mtop); + tpx_make_chain_identifiers(atoms,&top.mols); + + sfree(mtop); + /* The strings in the symtab are still in use in the returned t_atoms + * structure, so we should not free them. But there is no place to put the + * symbols; the only choice is to leak the memory... + * So we clear the symbol table before freeing the topology structure. */ + free_symtab(&top.symtab); + done_top(&top); + + break; + default: + gmx_incons("Not supported in read_stx_conf"); + } +} + diff --cc src/gromacs/gmxlib/gmx_omp.c index 29622e9a7e,0000000000..a06c43d902 mode 100644,000000..100644 --- a/src/gromacs/gmxlib/gmx_omp.c +++ b/src/gromacs/gmxlib/gmx_omp.c @@@ -1,179 -1,0 +1,182 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * Written by the Gromacs development team under coordination of + * David van der Spoel, Berk Hess, and Erik Lindahl. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org + * + * And Hey: + * Gnomes, ROck Monsters And Chili Sauce + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifdef GMX_OPENMP +#include +#endif + +#include + +#include "md_logging.h" +#include "gmx_fatal.h" +#include "statutil.h" +#include "string2.h" +#include "gmx_omp.h" + +int gmx_omp_get_max_threads(void) +{ +#ifdef GMX_OPENMP + return omp_get_max_threads(); +#else + return 1; +#endif +} + +int gmx_omp_get_num_procs(void) +{ +#ifdef GMX_OPENMP + return omp_get_num_procs(); +#else + return 1; +#endif +} + +int gmx_omp_get_thread_num(void) +{ +#ifdef GMX_OPENMP + return omp_get_thread_num(); +#else + return 0; +#endif +} + +void gmx_omp_set_num_threads(int num_threads) +{ +#ifdef GMX_OPENMP + omp_set_num_threads(num_threads); +#else + return; +#endif +} + +/*! + * Thread affinity set by the OpenMP library can conflict with the GROMACS + * internal affinity setting. + * + * While GNU OpenMP does not set affinity by default, the Intel OpenMP library + * does. This conflicts with the internal affinity (especially thread-MPI) + * setting, results in incorrectly locked threads, and causes dreadful performance. + * + * The KMP_AFFINITY environment variable is used by Intel, GOMP_CPU_AFFINITY + * by the GNU compilers (Intel also honors it well). If any of the variables + * is set, we honor it, disable the internal pinning, and warn the user. + * When using Intel OpenMP, we will disable affinity if the user did not set it + * anually through one of the aforementioned environment variables. + * + * Note that the Intel OpenMP affinity disabling iwll only take effect if this + * function is called before the OpenMP library gets initialized which happens + * when the first call is made into a compilation unit that contains OpenMP + * pragmas. + */ +void gmx_omp_check_thread_affinity(FILE *fplog, const t_commrec *cr, + gmx_hw_opt_t *hw_opt) +{ + gmx_bool bKmpAffinitySet, bGompCpuAffinitySet; + char *kmp_env, *gomp_env; + + /* no need to worry if internal thread pinning is turned off */ - if (!hw_opt->bThreadPinning) ++ if (hw_opt->thread_affinity == threadaffOFF) + { + return; + } + +#if defined(GMX_OPENMP) + + /* We assume that the affinity setting is available on all platforms + * gcc supports. Even if this is not the case (e.g. Mac OS) the user + * will only get a warning.*/ + bGompCpuAffinitySet = FALSE; + gomp_env = NULL; +#if defined(__GNUC__) + gomp_env = getenv("GOMP_CPU_AFFINITY"); + bGompCpuAffinitySet = (gomp_env != NULL); +#endif /* __GNUC__ */ + + bKmpAffinitySet = FALSE; +#if defined(__INTEL_COMPILER) + kmp_env = getenv("KMP_AFFINITY"); + bKmpAffinitySet = (kmp_env != NULL); + + /* disable Intel OpenMP affinity if neither KMP_AFFINITY nor + * GOMP_CPU_AFFINITY is set (Intel uses the GNU env. var as well) */ + if (!bKmpAffinitySet && !bGompCpuAffinitySet) + { + int retval; + +#ifdef _MSC_VER + /* Windows not POSIX */ + retval = _putenv_s("KMP_AFFINITY", "disabled"); +#else + /* POSIX */ + retval = setenv("KMP_AFFINITY", "disabled", 0); +#endif /* _MSC_VER */ + + if (debug) + { + fprintf(debug, "Disabling Intel OpenMP affinity by setting the KMP_AFFINITY=disabled env. var.\n"); + } + + if (retval != 0) + { + gmx_warning("Disabling Intel OpenMp affinity setting failed!"); + } + } + + /* turn off internal pinning KMP_AFFINITY != "disabled" */ + if (bKmpAffinitySet && (gmx_strncasecmp(kmp_env, "disabled", 8) != 0)) + { - md_print_warn(cr, fplog, "WARNING: KMP_AFFINITY set, will turn off %s internal affinity\n" - " setting as the two can conflict and cause performance degradation.\n" - " To keep using the %s internal affinity setting, set the\n" - " KMP_AFFINITY=disabled environment variable.", ++ /* TODO: with -pin auto we should only warn when using all cores */ ++ md_print_warn(cr, fplog, ++ "NOTE: KMP_AFFINITY set, will turn off %s internal affinity\n" ++ " setting as the two can conflict and cause performance degradation.\n" ++ " To keep using the %s internal affinity setting, set the\n" ++ " KMP_AFFINITY=disabled environment variable.", + ShortProgram(), ShortProgram()); + - hw_opt->bThreadPinning = FALSE; ++ hw_opt->thread_affinity = threadaffOFF; + } +#endif /* __INTEL_COMPILER */ + +#if defined(__INTEL_COMPILER) || defined(__GNUC__) + /* turn off internal pinning f GOMP_CPU_AFFINITY is set & non-empty */ + if (bGompCpuAffinitySet && gomp_env != NULL && gomp_env != '\0') + { ++ /* TODO: with -pin auto we should only warn when using all cores */ + md_print_warn(cr, fplog, - "WARNING: GOMP_CPU_AFFINITY set, will turn off %s internal affinity\n" - " setting as the two can conflict and cause performance degradation.\n" - " To keep using the %s internal affinity setting, unset the\n" - " GOMP_CPU_AFFINITY environment variable.", ++ "NOTE: GOMP_CPU_AFFINITY set, will turn off %s internal affinity\n" ++ " setting as the two can conflict and cause performance degradation.\n" ++ " To keep using the %s internal affinity setting, unset the\n" ++ " GOMP_CPU_AFFINITY environment variable.", + ShortProgram(), ShortProgram()); + - hw_opt->bThreadPinning = FALSE; ++ hw_opt->thread_affinity = threadaffOFF; + } +#endif /* __INTEL_COMPILER || __GNUC__ */ + +#endif /* GMX_OPENMP */ +} diff --cc src/gromacs/gmxlib/gmx_thread_affinity.c index 0000000000,0000000000..c67b961a2a new file mode 100644 --- /dev/null +++ b/src/gromacs/gmxlib/gmx_thread_affinity.c @@@ -1,0 -1,0 +1,430 @@@ ++/* ++ * This file is part of the GROMACS molecular simulation package. ++ * ++ * Copyright (c) 2012, by the GROMACS development team, led by ++ * David van der Spoel, Berk Hess, Erik Lindahl, and including many ++ * others, as listed in the AUTHORS file in the top-level source ++ * directory and at http://www.gromacs.org. ++ * ++ * GROMACS is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public License ++ * as published by the Free Software Foundation; either version 2.1 ++ * of the License, or (at your option) any later version. ++ * ++ * GROMACS is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with GROMACS; if not, see ++ * http://www.gnu.org/licenses, or write to the Free Software Foundation, ++ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * If you want to redistribute modifications to GROMACS, please ++ * consider that scientific software is very special. Version ++ * control is crucial - bugs must be traceable. We will be happy to ++ * consider code for inclusion in the official distribution, but ++ * derived work must not be called official GROMACS. Details are found ++ * in the README & COPYING files - if they are missing, get the ++ * official version at http://www.gromacs.org. ++ * ++ * To help us fund GROMACS development, we humbly ask that you cite ++ * the research papers on the package. Check out http://www.gromacs.org. ++ */ ++#ifdef HAVE_CONFIG_H ++#include ++#endif ++#if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY) ++#define _GNU_SOURCE ++#include ++#include ++#endif ++#include ++#include ++#include "typedefs.h" ++#include "types/commrec.h" ++#include "types/hw_info.h" ++#include "gmx_cpuid.h" ++#include "gmx_omp.h" ++#include "gmx_omp_nthreads.h" ++#include "mdrun.h" ++#include "md_logging.h" ++#include "statutil.h" ++#include "gmx_thread_affinity.h" ++ ++#include "thread_mpi/threads.h" ++ ++ ++static int ++get_thread_affinity_layout(FILE *fplog, ++ const t_commrec *cr, ++ const gmx_hw_info_t * hwinfo, ++ int nthreads, ++ int pin_offset, int * pin_stride, ++ const int **locality_order) ++{ ++ int nhwthreads,npkg,ncores,nhwthreads_per_core,rc; ++ const int * pkg_id; ++ const int * core_id; ++ const int * hwthread_id; ++ ++ if (pin_offset < 0) ++ { ++ gmx_fatal(FARGS,"Negative thread pinning offset requested"); ++ } ++ if (*pin_stride < 0) ++ { ++ gmx_fatal(FARGS,"Negative thread pinning stride requested"); ++ } ++ ++ rc = gmx_cpuid_topology(hwinfo->cpuid_info, &nhwthreads, &npkg, &ncores, ++ &nhwthreads_per_core, ++ &pkg_id, &core_id, &hwthread_id, locality_order); ++ ++ if (rc != 0) ++ { ++ nhwthreads = hwinfo->nthreads_hw_avail; ++ *locality_order = NULL; ++ ++ if (nhwthreads <= 0) ++ { ++ /* We don't know anything about the hardware, don't pin */ ++ md_print_warn(cr, fplog, ++ "We don't know how many logical cores we have, will not pin threads"); ++ ++ return -1; ++ } ++ } ++ ++ if (pin_offset + nthreads > nhwthreads) ++ { ++ /* We are oversubscribing, don't pin */ ++ md_print_warn(NULL, fplog, ++ "More threads requested than available logical cores, will not pin threads"); ++ ++ return -1; ++ } ++ ++ /* Check if we need to choose the pinning stride */ ++ if (*pin_stride == 0) ++ { ++ if (rc == 0 && pin_offset + nthreads*nhwthreads_per_core <= nhwthreads) ++ { ++ /* Put one thread on each physical core */ ++ *pin_stride = nhwthreads_per_core; ++ } ++ else ++ { ++ /* We don't know if we have SMT, and if we do, we don't know ++ * if hw threads in the same physical core are consecutive. ++ * Without SMT the pinning layout should not matter too much. ++ * so we assume a consecutive layout and maximally spread out" ++ * the threads at equal threads per core. ++ * Note that IBM is the major non-x86 case with cpuid support ++ * and probably threads are already pinned by the queuing system, ++ * so we wouldn't end up here in the first place. ++ */ ++ *pin_stride = (nhwthreads - pin_offset)/nthreads; ++ } ++ ++ if (fplog != NULL) ++ { ++ fprintf(fplog,"Pinning threads with a logical core stride of %d\n", ++ *pin_stride); ++ } ++ } ++ else ++ { ++ if (pin_offset + nthreads*(*pin_stride) > nhwthreads) ++ { ++ /* We are oversubscribing, don't pin */ ++ md_print_warn(NULL, fplog, ++ "The requested pinning stride is too large for the available logical cores, will not pin threads"); ++ ++ return -1; ++ } ++ } ++ ++ return 0; ++} ++ ++/* Set CPU affinity. Can be important for performance. ++ On some systems (e.g. Cray) CPU Affinity is set by default. ++ But default assigning doesn't work (well) with only some ranks ++ having threads. This causes very low performance. ++ External tools have cumbersome syntax for setting affinity ++ in the case that only some ranks have threads. ++ Thus it is important that GROMACS sets the affinity internally ++ if only PME is using threads. ++*/ ++void ++gmx_set_thread_affinity(FILE *fplog, ++ const t_commrec *cr, ++ gmx_hw_opt_t *hw_opt, ++ int nthreads_pme, ++ const gmx_hw_info_t *hwinfo, ++ const t_inputrec *inputrec) ++{ ++ int nth_affinity_set, thread_id_node, thread_id, ++ nthread_local, nthread_node, nthread_hw_max, nphyscore; ++ int offset; ++ const int *locality_order; ++ int rc; ++ ++ if (hw_opt->thread_affinity == threadaffOFF) ++ { ++ /* Nothing to do */ ++ return; ++ } ++ ++#ifndef __APPLE__ ++ /* If the tMPI thread affinity setting is not supported encourage the user ++ * to report it as it's either a bug or an exotic platform which we might ++ * want to support. */ ++ if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES) ++ { ++ md_print_warn(NULL, fplog, ++ "Can not set thread affinities on the current platform. On NUMA systems this\n" ++ "can cause performance degradation. If you think your platform should support\n" ++ "setting affinities, contact the GROMACS developers."); ++ return; ++ } ++#endif /* __APPLE__ */ ++ ++ /* threads on this MPI process or TMPI thread */ ++ if (cr->duty & DUTY_PP) ++ { ++ nthread_local = gmx_omp_nthreads_get(emntNonbonded); ++ } ++ else ++ { ++ nthread_local = gmx_omp_nthreads_get(emntPME); ++ } ++ ++ /* map the current process to cores */ ++ thread_id_node = 0; ++ nthread_node = nthread_local; ++#ifdef GMX_MPI ++ if (PAR(cr) || MULTISIM(cr)) ++ { ++ /* We need to determine a scan of the thread counts in this ++ * compute node. ++ */ ++ MPI_Comm comm_intra; ++ ++ MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode, ++ &comm_intra); ++ MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra); ++ /* MPI_Scan is inclusive, but here we need exclusive */ ++ thread_id_node -= nthread_local; ++ /* Get the total number of threads on this physical node */ ++ MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra); ++ MPI_Comm_free(&comm_intra); ++ } ++#endif ++ ++ if (hw_opt->thread_affinity == threadaffAUTO && ++ nthread_node != hwinfo->nthreads_hw_avail) ++ { ++ if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail) ++ { ++ md_print_warn(cr, fplog, ++ "NOTE: The number of threads is not equal to the number of (logical) cores\n" ++ " and the -pin option is set to auto: will not pin thread to cores.\n" ++ " This can lead to significant performance degradation.\n" ++ " Consider using -pin on (and -pinoffset in case you run multiple jobs).\n"); ++ } ++ ++ return; ++ } ++ ++ offset = 0; ++ if (hw_opt->core_pinning_offset != 0) ++ { ++ offset = hw_opt->core_pinning_offset; ++ md_print_info(cr,fplog,"Applying core pinning offset %d\n", offset); ++ } ++ ++ rc = get_thread_affinity_layout(fplog, cr, hwinfo, ++ nthread_node, ++ offset, &hw_opt->core_pinning_stride, ++ &locality_order); ++ if (rc != 0) ++ { ++ /* Incompatible layout, don't pin, warning was already issued */ ++ return; ++ } ++ ++ /* Set the per-thread affinity. In order to be able to check the success ++ * of affinity settings, we will set nth_affinity_set to 1 on threads ++ * where the affinity setting succeded and to 0 where it failed. ++ * Reducing these 0/1 values over the threads will give the total number ++ * of threads on which we succeeded. ++ */ ++ nth_affinity_set = 0; ++#pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \ ++ reduction(+:nth_affinity_set) ++ { ++ int index,core; ++ gmx_bool setaffinity_ret; ++ ++ thread_id = gmx_omp_get_thread_num(); ++ thread_id_node += thread_id; ++ index = offset + thread_id_node*hw_opt->core_pinning_stride; ++ if (locality_order != NULL) ++ { ++ core = locality_order[index]; ++ } ++ else ++ { ++ core = index; ++ } ++ ++ setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core); ++ ++ /* store the per-thread success-values of the setaffinity */ ++ nth_affinity_set = (setaffinity_ret == 0); ++ ++ if (debug) ++ { ++ fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n", ++ cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret); ++ } ++ } ++ ++ if (nth_affinity_set > nthread_local) ++ { ++ char msg[STRLEN]; ++ ++ sprintf(msg, "Looks like we have set affinity for more threads than " ++ "we have (%d > %d)!\n", nth_affinity_set, nthread_local); ++ gmx_incons(msg); ++ } ++ else ++ { ++ /* check & warn if some threads failed to set their affinities */ ++ if (nth_affinity_set != nthread_local) ++ { ++ char sbuf1[STRLEN], sbuf2[STRLEN]; ++ ++ /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */ ++ sbuf1[0] = sbuf2[0] = '\0'; ++#ifdef GMX_MPI ++#ifdef GMX_THREAD_MPI ++ sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid); ++#else /* GMX_LIB_MPI */ ++ sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid); ++#endif ++#endif /* GMX_MPI */ ++ ++ if (nthread_local > 1) ++ { ++ sprintf(sbuf2, "of %d/%d thread%s ", ++ nthread_local - nth_affinity_set, nthread_local, ++ (nthread_local - nth_affinity_set) > 1 ? "s" : ""); ++ } ++ ++ md_print_warn(NULL, fplog, ++ "NOTE: %sAffinity setting %sfailed.\n" ++ " This can cause performance degradation!", ++ sbuf1, sbuf2); ++ } ++ } ++} ++ ++/* Check the process affinity mask and if it is found to be non-zero, ++ * will honor it and disable mdrun internal affinity setting. ++ * Note that this will only work on Linux as we use a GNU feature. ++ */ ++void ++gmx_check_thread_affinity_set(FILE *fplog, const t_commrec *cr, ++ gmx_hw_opt_t *hw_opt, int ncpus, ++ gmx_bool bAfterOpenmpInit) ++{ ++#ifdef HAVE_SCHED_GETAFFINITY ++ cpu_set_t mask_current; ++ int i, ret, cpu_count, cpu_set; ++ gmx_bool bAllSet; ++ ++ assert(hw_opt); ++ if (hw_opt->thread_affinity == threadaffOFF) ++ { ++ /* internal affinity setting is off, don't bother checking process affinity */ ++ return; ++ } ++ ++ CPU_ZERO(&mask_current); ++ if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0) ++ { ++ /* failed to query affinity mask, will just return */ ++ if (debug) ++ { ++ fprintf(debug, "Failed to query affinity mask (error %d)", ret); ++ } ++ return; ++ } ++ ++ /* Before proceeding with the actual check, make sure that the number of ++ * detected CPUs is >= the CPUs in the current set. ++ * We need to check for CPU_COUNT as it was added only in glibc 2.6. */ ++#ifdef CPU_COUNT ++ if (ncpus < CPU_COUNT(&mask_current)) ++ { ++ if (debug) ++ { ++ fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT", ++ ncpus, CPU_COUNT(&mask_current)); ++ } ++ return; ++ } ++#endif /* CPU_COUNT */ ++ ++ bAllSet = TRUE; ++ for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++) ++ { ++ bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0); ++ } ++ ++ if (!bAllSet) ++ { ++ if (hw_opt->thread_affinity == threadaffAUTO) ++ { ++ if (!bAfterOpenmpInit) ++ { ++ md_print_warn(cr, fplog, ++ "Non-default thread affinity set, disabling internal thread affinity"); ++ } ++ else ++ { ++ md_print_warn(cr, fplog, ++ "Non-default thread affinity set probably by the OpenMP library,\n" ++ "disabling internal thread affinity"); ++ } ++ hw_opt->thread_affinity = threadaffOFF; ++ } ++ else ++ { ++ /* Only warn once, at the last check (bAfterOpenmpInit==TRUE) */ ++ if (bAfterOpenmpInit) ++ { ++ md_print_warn(cr, fplog, ++ "Overriding thread affinity set outside %s\n", ++ ShortProgram()); ++ } ++ } ++ ++ if (debug) ++ { ++ fprintf(debug, "Non-default affinity mask found\n"); ++ } ++ } ++ else ++ { ++ if (debug) ++ { ++ fprintf(debug, "Default affinity mask found\n"); ++ } ++ } ++#endif /* HAVE_SCHED_GETAFFINITY */ ++} diff --cc src/gromacs/legacyheaders/gmx_thread_affinity.h index 0000000000,0000000000..c8de7bf3e1 new file mode 100644 --- /dev/null +++ b/src/gromacs/legacyheaders/gmx_thread_affinity.h @@@ -1,0 -1,0 +1,77 @@@ ++/* ++ * This file is part of the GROMACS molecular simulation package. ++ * ++ * Copyright (c) 2012, by the GROMACS development team, led by ++ * David van der Spoel, Berk Hess, Erik Lindahl, and including many ++ * others, as listed in the AUTHORS file in the top-level source ++ * directory and at http://www.gromacs.org. ++ * ++ * GROMACS is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public License ++ * as published by the Free Software Foundation; either version 2.1 ++ * of the License, or (at your option) any later version. ++ * ++ * GROMACS is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with GROMACS; if not, see ++ * http://www.gnu.org/licenses, or write to the Free Software Foundation, ++ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * If you want to redistribute modifications to GROMACS, please ++ * consider that scientific software is very special. Version ++ * control is crucial - bugs must be traceable. We will be happy to ++ * consider code for inclusion in the official distribution, but ++ * derived work must not be called official GROMACS. Details are found ++ * in the README & COPYING files - if they are missing, get the ++ * official version at http://www.gromacs.org. ++ * ++ * To help us fund GROMACS development, we humbly ask that you cite ++ * the research papers on the package. Check out http://www.gromacs.org. ++ */ ++#ifndef GMX_THREAD_AFFINITY_H_ ++#define GMX_THREAD_AFFINITY_H_ ++#include "types/commrec.h" ++#include "typedefs.h" ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++#if 0 ++} /* fixes auto-indentation problems */ ++#endif ++ ++/* Sets the thread affinity using the requested setting stored in hw_opt. ++ * The hardware topologu is requested from hwinfo, when present. ++ */ ++void ++gmx_set_thread_affinity(FILE *fplog, ++ const t_commrec *cr, ++ gmx_hw_opt_t *hw_opt, ++ int nthreads_pme, ++ const gmx_hw_info_t *hwinfo, ++ const t_inputrec *inputrec); ++ ++/* Check the process affinity mask and if it is found to be non-zero, ++ * will honor it and disable mdrun internal affinity setting. ++ * This function should be called first before the OpenMP library gets ++ * initialized with the last argument FALSE (which will detect affinity ++ * set by external tools like taskset), and later, after the OpenMP ++ * initialization, with the last argument TRUE to detect affinity changes ++ * made by the OpenMP library. ++ * ++ * Note that this will only work on Linux as we use a GNU feature. ++ */ ++void ++gmx_check_thread_affinity_set(FILE *fplog, const t_commrec *cr, ++ gmx_hw_opt_t *hw_opt, int ncpus, ++ gmx_bool bAfterOpenmpInit); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif /* GMX_THREAD_AFFINITY_H_ */ diff --cc src/gromacs/legacyheaders/mdrun.h index 3c802d57f3,0000000000..3fb195f297 mode 100644,000000..100644 --- a/src/gromacs/legacyheaders/mdrun.h +++ b/src/gromacs/legacyheaders/mdrun.h @@@ -1,206 -1,0 +1,212 @@@ +/* + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gromacs Runs On Most of All Computer Systems + */ + +#ifndef _mdrun_h +#define _mdrun_h + +#include +#include +#include "typedefs.h" +#include "network.h" +#include "sim_util.h" +#include "tgroup.h" +#include "filenm.h" +#include "mshift.h" +#include "force.h" +#include "edsam.h" +#include "mdebin.h" +#include "vcm.h" +#include "vsite.h" +#include "pull.h" +#include "update.h" +#include "types/membedt.h" +#include "types/globsig.h" + + +#ifdef GMX_THREAD_MPI +#include "thread_mpi/threads.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define MD_POLARISE (1<<2) +#define MD_IONIZE (1<<3) +#define MD_RERUN (1<<4) +#define MD_RERUN_VSITE (1<<5) +#define MD_FFSCAN (1<<6) +#define MD_SEPPOT (1<<7) +#define MD_PARTDEC (1<<9) +#define MD_DDBONDCHECK (1<<10) +#define MD_DDBONDCOMM (1<<11) +#define MD_CONFOUT (1<<12) +#define MD_REPRODUCIBLE (1<<13) +#define MD_READ_RNG (1<<14) +#define MD_APPENDFILES (1<<15) +#define MD_APPENDFILESSET (1<<21) +#define MD_KEEPANDNUMCPT (1<<16) +#define MD_READ_EKIN (1<<17) +#define MD_STARTFROMCPT (1<<18) +#define MD_RESETCOUNTERSHALFWAY (1<<19) +#define MD_TUNEPME (1<<20) +#define MD_TESTVERLET (1<<22) + ++/* The options for the domain decomposition MPI task ordering */ +enum { + ddnoSEL, ddnoINTERLEAVE, ddnoPP_PME, ddnoCARTESIAN, ddnoNR +}; + ++/* The options for the thread affinity setting, default: auto */ ++enum { ++ threadaffSEL, threadaffAUTO, threadaffON, threadaffOFF, threadaffNR ++}; ++ +typedef struct { + int nthreads_tot; /* Total number of threads requested (TMPI) */ + int nthreads_tmpi; /* Number of TMPI threads requested */ + int nthreads_omp; /* Number of OpenMP threads requested */ + int nthreads_omp_pme; /* As nthreads_omp, but for PME only nodes */ - gmx_bool bThreadPinning; /* Pin OpenMP threads to cores? */ - gmx_bool bPinHyperthreading; /* Pin pairs of threads to physical cores */ - int core_pinning_offset; /* Physical core pinning offset */ ++ int thread_affinity; /* Thread affinity switch, see enum above */ ++ int core_pinning_stride; /* Logical core pinning stride */ ++ int core_pinning_offset; /* Logical core pinning offset */ + char *gpu_id; /* GPU id's to use, each specified as chars */ +} gmx_hw_opt_t; + +/* Variables for temporary use with the deform option, + * used in runner.c and md.c. + * (These variables should be stored in the tpx file.) + */ +extern gmx_large_int_t deform_init_init_step_tpx; +extern matrix deform_init_box_tpx; +#ifdef GMX_THREAD_MPI +extern tMPI_Thread_mutex_t deform_init_box_mutex; + +/* The minimum number of atoms per tMPI thread. With fewer atoms than this, + * the number of threads will get lowered. + */ +#define MIN_ATOMS_PER_MPI_THREAD 90 +#define MIN_ATOMS_PER_GPU 900 +#endif + + +typedef double gmx_integrator_t(FILE *log,t_commrec *cr, + int nfile,const t_filenm fnm[], + const output_env_t oenv, gmx_bool bVerbose, + gmx_bool bCompact, int nstglobalcomm, + gmx_vsite_t *vsite,gmx_constr_t constr, + int stepout, + t_inputrec *inputrec, + gmx_mtop_t *mtop,t_fcdata *fcd, + t_state *state, + t_mdatoms *mdatoms, + t_nrnb *nrnb,gmx_wallcycle_t wcycle, + gmx_edsam_t ed, + t_forcerec *fr, + int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, + gmx_membed_t membed, + real cpt_period,real max_hours, + const char *deviceOptions, + unsigned long Flags, + gmx_runtime_t *runtime); + +/* ROUTINES from md.c */ + +gmx_integrator_t do_md; + + +/* ROUTINES from minimize.c */ + +gmx_integrator_t do_steep; +/* Do steepest descents EM */ + +gmx_integrator_t do_cg; +/* Do conjugate gradient EM */ + +gmx_integrator_t do_lbfgs; +/* Do conjugate gradient L-BFGS */ + +gmx_integrator_t do_nm; +/* Do normal mode analysis */ + +/* ROUTINES from tpi.c */ + +gmx_integrator_t do_tpi; +/* Do test particle insertion */ + +void init_npt_masses(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bInit); + +int ExpandedEnsembleDynamics(FILE *log,t_inputrec *ir, gmx_enerdata_t *enerd, + t_state *state, t_extmass *MassQ, df_history_t *dfhist, + gmx_large_int_t step, gmx_rng_t mcrng, + rvec *v, t_mdatoms *mdatoms); + +void PrintFreeEnergyInfoToFile(FILE *outfile, t_lambda *fep, t_expanded *expand, t_simtemp *simtemp, df_history_t *dfhist, + int nlam, int frequency, gmx_large_int_t step); + +void get_mc_state(gmx_rng_t rng,t_state *state); + +void set_mc_state(gmx_rng_t rng,t_state *state); + +/* check the version */ +void check_ir_old_tpx_versions(t_commrec *cr,FILE *fplog, + t_inputrec *ir,gmx_mtop_t *mtop); + +/* Allocate and initialize node-local state entries. */ +void set_state_entries(t_state *state,const t_inputrec *ir,int nnodes); + +/* Broadcast the data for a simulation, and allocate node-specific settings + such as rng generators. */ +void init_parallel(FILE *log, t_commrec *cr, t_inputrec *inputrec, + gmx_mtop_t *mtop); + +int mdrunner(gmx_hw_opt_t *hw_opt, + FILE *fplog,t_commrec *cr,int nfile, + const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, + gmx_bool bCompact, int nstglobalcomm, ivec ddxyz,int dd_node_order, + real rdd, real rconstr, const char *dddlb_opt,real dlb_scale, + const char *ddcsx,const char *ddcsy,const char *ddcsz, + const char *nbpu_opt, + int nsteps_cmdline, int nstepout, int resetstep, + int nmultisim, int repl_ex_nst, int repl_ex_nex, + int repl_ex_seed, real pforce,real cpt_period,real max_hours, + const char *deviceOptions, unsigned long Flags); +/* Driver routine, that calls the different methods */ + +#ifdef __cplusplus +} +#endif + +#endif /* _mdrun_h */ diff --cc src/programs/mdrun/mdrun.c index c801a2f684,0000000000..6286afeade mode 100644,000000..100644 --- a/src/programs/mdrun/mdrun.c +++ b/src/programs/mdrun/mdrun.c @@@ -1,740 -1,0 +1,751 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "typedefs.h" +#include "macros.h" +#include "copyrite.h" +#include "main.h" +#include "statutil.h" +#include "smalloc.h" +#include "futil.h" +#include "smalloc.h" +#include "edsam.h" +#include "mdrun.h" +#include "xmdrun.h" +#include "checkpoint.h" +#ifdef GMX_THREAD_MPI +#include "thread_mpi.h" +#endif + +/* afm stuf */ +#include "pull.h" + +int cmain(int argc,char *argv[]) +{ + const char *desc[] = { + "The [TT]mdrun[tt] program is the main computational chemistry engine", + "within GROMACS. Obviously, it performs Molecular Dynamics simulations,", + "but it can also perform Stochastic Dynamics, Energy Minimization,", + "test particle insertion or (re)calculation of energies.", + "Normal mode analysis is another option. In this case [TT]mdrun[tt]", + "builds a Hessian matrix from single conformation.", + "For usual Normal Modes-like calculations, make sure that", + "the structure provided is properly energy-minimized.", + "The generated matrix can be diagonalized by [TT]g_nmeig[tt].[PAR]", + "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])", + "and distributes the topology over nodes if needed.", + "[TT]mdrun[tt] produces at least four output files.", + "A single log file ([TT]-g[tt]) is written, unless the option", + "[TT]-seppot[tt] is used, in which case each node writes a log file.", + "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and", + "optionally forces.", + "The structure file ([TT]-c[tt]) contains the coordinates and", + "velocities of the last step.", + "The energy file ([TT]-e[tt]) contains energies, the temperature,", + "pressure, etc, a lot of these things are also printed in the log file.", + "Optionally coordinates can be written to a compressed trajectory file", + "([TT]-x[tt]).[PAR]", + "The option [TT]-dhdl[tt] is only used when free energy calculation is", + "turned on.[PAR]", + "A simulation can be run in parallel using two different parallelization", + "schemes: MPI parallelization and/or OpenMP thread parallelization.", + "The MPI parallelization uses multiple processes when [TT]mdrun[tt] is", + "compiled with a normal MPI library or threads when [TT]mdrun[tt] is", + "compiled with the GROMACS built-in thread-MPI library. OpenMP threads", + "are supported when mdrun is compiled with OpenMP. Full OpenMP support", + "is only available with the Verlet cut-off scheme, with the (older)", + "group scheme only PME-only processes can use OpenMP parallelization.", + "In all cases [TT]mdrun[tt] will by default try to use all the available", + "hardware resources. With a normal MPI library only the options", + "[TT]-ntomp[tt] (with the Verlet cut-off scheme) and [TT]-ntomp_pme[tt],", + "for PME-only processes, can be used to control the number of threads.", + "With thread-MPI there are additional options [TT]-nt[tt], which sets", + "the total number of threads, and [TT]-ntmpi[tt], which sets the number", + "of thread-MPI threads.", + "Note that using combined MPI+OpenMP parallelization is almost always", + "slower than single parallelization, except at the scaling limit, where", + "especially OpenMP parallelization of PME reduces the communication cost.", + "OpenMP-only parallelization is much faster than MPI-only parallelization", + "on a single CPU(-die). Since we currently don't have proper hardware", + "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only", + "automatically use OpenMP-only parallelization when you use up to 4", + "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16", + "threads with Intel Sandy Bridge or newer CPUs. Otherwise MPI-only", + "parallelization is used (except with GPUs, see below).", + "[PAR]", + "To quickly test the performance of the new Verlet cut-off scheme", + "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use", + "the [TT]-testverlet[tt] option. This should not be used for production,", + "since it can slightly modify potentials and it will remove charge groups", + "making analysis difficult, as the [TT].tpr[tt] file will still contain", + "charge groups. For production simulations it is highly recommended", + "to specify [TT]cutoff-scheme = Verlet[tt] in the [TT].mdp[tt] file.", + "[PAR]", + "With GPUs (only supported with the Verlet cut-off scheme), the number", + "of GPUs should match the number of MPI processes or MPI threads,", + "excluding PME-only processes/threads. With thread-MPI the number", + "of MPI threads will automatically be set to the number of GPUs detected.", + "When you want to use a subset of the available GPUs, you can use", + "the [TT]-gpu_id[tt] option, where GPU id's are passed as a string,", + "e.g. 02 for using GPUs 0 and 2. When you want different GPU id's", + "on different nodes of a compute cluster, use the GMX_GPU_ID environment", + "variable instead. The format for GMX_GPU_ID is identical to ", + "[TT]-gpu_id[tt], but an environment variable can have different values", + "on different nodes of a cluster.", + "[PAR]", + "When using PME with separate PME nodes or with a GPU, the two major", + "compute tasks, the non-bonded force calculation and the PME calculation", + "run on different compute resources. If this load is not balanced,", + "some of the resources will be idle part of time. With the Verlet", + "cut-off scheme this load is automatically balanced when the PME load", + "is too high (but not when it is too low). This is done by scaling", + "the Coulomb cut-off and PME grid spacing by the same amount. In the first", + "few hundred steps different settings are tried and the fastest is chosen", + "for the rest of the simulation. This does not affect the accuracy of", + "the results, but it does affect the decomposition of the Coulomb energy", + "into particle and mesh contributions. The auto-tuning can be turned off", + "with the option [TT]-notunepme[tt].", + "[PAR]", - "When compiled with OpenMP on Linux, [TT]mdrun[tt] pins threads to cores,", ++ "[TT]mdrun[tt] pins (sets affinity of) threads to specific cores,", ++ "when all (logical) cores on a compute node are used by [TT]mdrun[tt],", ++ "even when no multi-threading is used,", + "as this usually results in significantly better performance.", - "If you don't want this, use [TT]-nopin[tt].", - "With Intel CPUs with hyper-threading enabled, you should pin", - "consecutive threads to the same physical core for optimal", - "performance when you use virtual cores. This is done automatically", - "when you use more than half of the virtual cores. It can also be set", - "manually with [TT]-pinht[tt], e.g. for running multiple simulations", - "on one compute node.", ++ "If the queuing systems or the OpenMP library pinned threads, we honor", ++ "this and don't pin again, even though the layout may be sub-optimal.", ++ "If you want to have [TT]mdrun[tt] override an already set thread affinity", ++ "or pin threads when using less cores, use [TT]-pin on[tt].", ++ "With SMT (simultaneous multithreading), e.g. Intel Hyper-Threading,", ++ "there are multiple logical cores per physical core.", ++ "The option [TT]-pinstride[tt] sets the stride in logical cores for", ++ "pinning consecutive threads. Without SMT, 1 is usually the best choice.", ++ "With Intel Hyper-Threading 2 is best when using half or less of the", ++ "logical cores, 1 otherwise. The default value of 0 do exactly that:", ++ "it minimizes the threads per logical core, to optimize performance.", ++ "If you want to run multiple mdrun jobs on the same physical node," ++ "you should set [TT]-pinstride[tt] to 1 when using all logical cores.", + "When running multiple mdrun (or other) simulations on the same physical", + "node, some simulations need to start pinning from a non-zero core", + "to avoid overloading cores; with [TT]-pinoffset[tt] you can specify", - "the offset in (physical) cores for pinning.", ++ "the offset in logical cores for pinning.", + "[PAR]", + "When [TT]mdrun[tt] is started using MPI with more than 1 process", + "or with thread-MPI with more than 1 thread, MPI parallelization is used.", + "By default domain decomposition is used, unless the [TT]-pd[tt]", + "option is set, which selects particle decomposition.", + "[PAR]", + "With domain decomposition, the spatial decomposition can be set", + "with option [TT]-dd[tt]. By default [TT]mdrun[tt] selects a good decomposition.", + "The user only needs to change this when the system is very inhomogeneous.", + "Dynamic load balancing is set with the option [TT]-dlb[tt],", + "which can give a significant performance improvement,", + "especially for inhomogeneous systems. The only disadvantage of", + "dynamic load balancing is that runs are no longer binary reproducible,", + "but in most cases this is not important.", + "By default the dynamic load balancing is automatically turned on", + "when the measured performance loss due to load imbalance is 5% or more.", + "At low parallelization these are the only important options", + "for domain decomposition.", + "At high parallelization the options in the next two sections", + "could be important for increasing the performace.", + "[PAR]", + "When PME is used with domain decomposition, separate nodes can", + "be assigned to do only the PME mesh calculation;", + "this is computationally more efficient starting at about 12 nodes.", + "The number of PME nodes is set with option [TT]-npme[tt],", + "this can not be more than half of the nodes.", + "By default [TT]mdrun[tt] makes a guess for the number of PME", + "nodes when the number of nodes is larger than 11 or performance wise", + "not compatible with the PME grid x dimension.", + "But the user should optimize npme. Performance statistics on this issue", + "are written at the end of the log file.", + "For good load balancing at high parallelization, the PME grid x and y", + "dimensions should be divisible by the number of PME nodes", + "(the simulation will run correctly also when this is not the case).", + "[PAR]", + "This section lists all options that affect the domain decomposition.", + "[PAR]", + "Option [TT]-rdd[tt] can be used to set the required maximum distance", + "for inter charge-group bonded interactions.", + "Communication for two-body bonded interactions below the non-bonded", + "cut-off distance always comes for free with the non-bonded communication.", + "Atoms beyond the non-bonded cut-off are only communicated when they have", + "missing bonded interactions; this means that the extra cost is minor", + "and nearly indepedent of the value of [TT]-rdd[tt].", + "With dynamic load balancing option [TT]-rdd[tt] also sets", + "the lower limit for the domain decomposition cell sizes.", + "By default [TT]-rdd[tt] is determined by [TT]mdrun[tt] based on", + "the initial coordinates. The chosen value will be a balance", + "between interaction range and communication cost.", + "[PAR]", + "When inter charge-group bonded interactions are beyond", + "the bonded cut-off distance, [TT]mdrun[tt] terminates with an error message.", + "For pair interactions and tabulated bonds", + "that do not generate exclusions, this check can be turned off", + "with the option [TT]-noddcheck[tt].", + "[PAR]", + "When constraints are present, option [TT]-rcon[tt] influences", + "the cell size limit as well.", + "Atoms connected by NC constraints, where NC is the LINCS order plus 1,", + "should not be beyond the smallest cell size. A error message is", + "generated when this happens and the user should change the decomposition", + "or decrease the LINCS order and increase the number of LINCS iterations.", + "By default [TT]mdrun[tt] estimates the minimum cell size required for P-LINCS", + "in a conservative fashion. For high parallelization it can be useful", + "to set the distance required for P-LINCS with the option [TT]-rcon[tt].", + "[PAR]", + "The [TT]-dds[tt] option sets the minimum allowed x, y and/or z scaling", + "of the cells with dynamic load balancing. [TT]mdrun[tt] will ensure that", + "the cells can scale down by at least this factor. This option is used", + "for the automated spatial decomposition (when not using [TT]-dd[tt])", + "as well as for determining the number of grid pulses, which in turn", + "sets the minimum allowed cell size. Under certain circumstances", + "the value of [TT]-dds[tt] might need to be adjusted to account for", + "high or low spatial inhomogeneity of the system.", + "[PAR]", + "The option [TT]-gcom[tt] can be used to only do global communication", + "every n steps.", + "This can improve performance for highly parallel simulations", + "where this global communication step becomes the bottleneck.", + "For a global thermostat and/or barostat the temperature", + "and/or pressure will also only be updated every [TT]-gcom[tt] steps.", + "By default it is set to the minimum of nstcalcenergy and nstlist.[PAR]", + "With [TT]-rerun[tt] an input trajectory can be given for which ", + "forces and energies will be (re)calculated. Neighbor searching will be", + "performed for every frame, unless [TT]nstlist[tt] is zero", + "(see the [TT].mdp[tt] file).[PAR]", + "ED (essential dynamics) sampling and/or additional flooding potentials", + "are switched on by using the [TT]-ei[tt] flag followed by an [TT].edi[tt]", + "file. The [TT].edi[tt] file can be produced with the [TT]make_edi[tt] tool", + "or by using options in the essdyn menu of the WHAT IF program.", + "[TT]mdrun[tt] produces a [TT].xvg[tt] output file that", + "contains projections of positions, velocities and forces onto selected", + "eigenvectors.[PAR]", + "When user-defined potential functions have been selected in the", + "[TT].mdp[tt] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]", + "a formatted table with potential functions. The file is read from", + "either the current directory or from the [TT]GMXLIB[tt] directory.", + "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,", + "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with", + "normal Coulomb.", + "When pair interactions are present, a separate table for pair interaction", + "functions is read using the [TT]-tablep[tt] option.[PAR]", + "When tabulated bonded functions are present in the topology,", + "interaction functions are read using the [TT]-tableb[tt] option.", + "For each different tabulated interaction type the table file name is", + "modified in a different way: before the file extension an underscore is", + "appended, then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals", + "and finally the table number of the interaction type.[PAR]", + "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM", + "coordinates and forces when pulling is selected", + "in the [TT].mdp[tt] file.[PAR]", + "With [TT]-multi[tt] or [TT]-multidir[tt], multiple systems can be ", + "simulated in parallel.", + "As many input files/directories are required as the number of systems. ", + "The [TT]-multidir[tt] option takes a list of directories (one for each ", + "system) and runs in each of them, using the input/output file names, ", + "such as specified by e.g. the [TT]-s[tt] option, relative to these ", + "directories.", + "With [TT]-multi[tt], the system number is appended to the run input ", + "and each output filename, for instance [TT]topol.tpr[tt] becomes", + "[TT]topol0.tpr[tt], [TT]topol1.tpr[tt] etc.", + "The number of nodes per system is the total number of nodes", + "divided by the number of systems.", + "One use of this option is for NMR refinement: when distance", + "or orientation restraints are present these can be ensemble averaged", + "over all the systems.[PAR]", + "With [TT]-replex[tt] replica exchange is attempted every given number", + "of steps. The number of replicas is set with the [TT]-multi[tt] or ", + "[TT]-multidir[tt] option, described above.", + "All run input files should use a different coupling temperature,", + "the order of the files is not important. The random seed is set with", + "[TT]-reseed[tt]. The velocities are scaled and neighbor searching", + "is performed after every exchange.[PAR]", + "Finally some experimental algorithms can be tested when the", + "appropriate options have been given. Currently under", + "investigation are: polarizability and X-ray bombardments.", + "[PAR]", + "The option [TT]-membed[tt] does what used to be g_membed, i.e. embed", + "a protein into a membrane. The data file should contain the options", + "that where passed to g_membed before. The [TT]-mn[tt] and [TT]-mp[tt]", + "both apply to this as well.", + "[PAR]", + "The option [TT]-pforce[tt] is useful when you suspect a simulation", + "crashes due to too large forces. With this option coordinates and", + "forces of atoms with a force larger than a certain value will", + "be printed to stderr.", + "[PAR]", + "Checkpoints containing the complete state of the system are written", + "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],", + "unless option [TT]-cpt[tt] is set to -1.", + "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to", + "make sure that a recent state of the system is always available,", + "even when the simulation is terminated while writing a checkpoint.", + "With [TT]-cpnum[tt] all checkpoint files are kept and appended", + "with the step number.", + "A simulation can be continued by reading the full state from file", + "with option [TT]-cpi[tt]. This option is intelligent in the way that", + "if no checkpoint file is found, Gromacs just assumes a normal run and", + "starts from the first step of the [TT].tpr[tt] file. By default the output", + "will be appending to the existing output files. The checkpoint file", + "contains checksums of all output files, such that you will never", + "loose data when some output files are modified, corrupt or removed.", + "There are three scenarios with [TT]-cpi[tt]:[PAR]", + "[TT]*[tt] no files with matching names are present: new output files are written[PAR]", + "[TT]*[tt] all files are present with names and checksums matching those stored", + "in the checkpoint file: files are appended[PAR]", + "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]", + "With [TT]-noappend[tt] new output files are opened and the simulation", + "part number is added to all output file names.", + "Note that in all cases the checkpoint file itself is not renamed", + "and will be overwritten, unless its name does not match", + "the [TT]-cpo[tt] option.", + "[PAR]", + "With checkpointing the output is appended to previously written", + "output files, unless [TT]-noappend[tt] is used or none of the previous", + "output files are present (except for the checkpoint file).", + "The integrity of the files to be appended is verified using checksums", + "which are stored in the checkpoint file. This ensures that output can", + "not be mixed up or corrupted due to file appending. When only some", + "of the previous output files are present, a fatal error is generated", + "and no old output files are modified and no new output files are opened.", + "The result with appending will be the same as from a single run.", + "The contents will be binary identical, unless you use a different number", + "of nodes or dynamic load balancing or the FFT library uses optimizations", + "through timing.", + "[PAR]", + "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint", + "file is written at the first neighbor search step where the run time", + "exceeds [TT]-maxh[tt]*0.99 hours.", + "[PAR]", + "When [TT]mdrun[tt] receives a TERM signal, it will set nsteps to the current", + "step plus one. When [TT]mdrun[tt] receives an INT signal (e.g. when ctrl+C is", + "pressed), it will stop after the next neighbor search step ", + "(with nstlist=0 at the next step).", + "In both cases all the usual output will be written to file.", + "When running with MPI, a signal to one of the [TT]mdrun[tt] processes", + "is sufficient, this signal should not be sent to mpirun or", + "the [TT]mdrun[tt] process that is the parent of the others.", + "[PAR]", + "When [TT]mdrun[tt] is started with MPI, it does not run niced by default." + }; + t_commrec *cr; + t_filenm fnm[] = { + { efTPX, NULL, NULL, ffREAD }, + { efTRN, "-o", NULL, ffWRITE }, + { efXTC, "-x", NULL, ffOPTWR }, + { efCPT, "-cpi", NULL, ffOPTRD }, + { efCPT, "-cpo", NULL, ffOPTWR }, + { efSTO, "-c", "confout", ffWRITE }, + { efEDR, "-e", "ener", ffWRITE }, + { efLOG, "-g", "md", ffWRITE }, + { efXVG, "-dhdl", "dhdl", ffOPTWR }, + { efXVG, "-field", "field", ffOPTWR }, + { efXVG, "-table", "table", ffOPTRD }, + { efXVG, "-tabletf", "tabletf", ffOPTRD }, + { efXVG, "-tablep", "tablep", ffOPTRD }, + { efXVG, "-tableb", "table", ffOPTRD }, + { efTRX, "-rerun", "rerun", ffOPTRD }, + { efXVG, "-tpi", "tpi", ffOPTWR }, + { efXVG, "-tpid", "tpidist", ffOPTWR }, + { efEDI, "-ei", "sam", ffOPTRD }, + { efXVG, "-eo", "edsam", ffOPTWR }, + { efGCT, "-j", "wham", ffOPTRD }, + { efGCT, "-jo", "bam", ffOPTWR }, + { efXVG, "-ffout", "gct", ffOPTWR }, + { efXVG, "-devout", "deviatie", ffOPTWR }, + { efXVG, "-runav", "runaver", ffOPTWR }, + { efXVG, "-px", "pullx", ffOPTWR }, + { efXVG, "-pf", "pullf", ffOPTWR }, + { efXVG, "-ro", "rotation", ffOPTWR }, + { efLOG, "-ra", "rotangles",ffOPTWR }, + { efLOG, "-rs", "rotslabs", ffOPTWR }, + { efLOG, "-rt", "rottorque",ffOPTWR }, + { efMTX, "-mtx", "nm", ffOPTWR }, + { efNDX, "-dn", "dipole", ffOPTWR }, + { efRND, "-multidir",NULL, ffOPTRDMULT}, + { efDAT, "-membed", "membed", ffOPTRD }, + { efTOP, "-mp", "membed", ffOPTRD }, + { efNDX, "-mn", "membed", ffOPTRD } + }; +#define NFILE asize(fnm) + + /* Command line options ! */ + gmx_bool bCart = FALSE; + gmx_bool bPPPME = FALSE; + gmx_bool bPartDec = FALSE; + gmx_bool bDDBondCheck = TRUE; + gmx_bool bDDBondComm = TRUE; + gmx_bool bTunePME = TRUE; + gmx_bool bTestVerlet = FALSE; + gmx_bool bVerbose = FALSE; + gmx_bool bCompact = TRUE; + gmx_bool bSepPot = FALSE; + gmx_bool bRerunVSite = FALSE; + gmx_bool bIonize = FALSE; + gmx_bool bConfout = TRUE; + gmx_bool bReproducible = FALSE; + + int npme=-1; + int nmultisim=0; + int nstglobalcomm=-1; + int repl_ex_nst=0; + int repl_ex_seed=-1; + int repl_ex_nex=0; + int nstepout=100; + int resetstep=-1; + int nsteps=-2; /* the value -2 means that the mdp option will be used */ + + rvec realddxyz={0,0,0}; + const char *ddno_opt[ddnoNR+1] = + { NULL, "interleave", "pp_pme", "cartesian", NULL }; + const char *dddlb_opt[] = + { NULL, "auto", "no", "yes", NULL }; ++ const char *thread_aff_opt[threadaffNR+1] = ++ { NULL, "auto", "on", "off", NULL }; + const char *nbpu_opt[] = + { NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL }; + real rdd=0.0,rconstr=0.0,dlb_scale=0.8,pforce=-1; + char *ddcsx=NULL,*ddcsy=NULL,*ddcsz=NULL; + real cpt_period=15.0,max_hours=-1; + gmx_bool bAppendFiles=TRUE; + gmx_bool bKeepAndNumCPT=FALSE; + gmx_bool bResetCountersHalfWay=FALSE; + output_env_t oenv=NULL; + const char *deviceOptions = ""; + - gmx_hw_opt_t hw_opt={0,0,0,0,TRUE,FALSE,0,NULL}; ++ gmx_hw_opt_t hw_opt={0,0,0,0,threadaffSEL,0,0,NULL}; + + t_pargs pa[] = { + + { "-pd", FALSE, etBOOL,{&bPartDec}, + "Use particle decompostion" }, + { "-dd", FALSE, etRVEC,{&realddxyz}, + "Domain decomposition grid, 0 is optimize" }, + { "-ddorder", FALSE, etENUM, {ddno_opt}, + "DD node order" }, + { "-npme", FALSE, etINT, {&npme}, + "Number of separate nodes to be used for PME, -1 is guess" }, + { "-nt", FALSE, etINT, {&hw_opt.nthreads_tot}, + "Total number of threads to start (0 is guess)" }, + { "-ntmpi", FALSE, etINT, {&hw_opt.nthreads_tmpi}, + "Number of thread-MPI threads to start (0 is guess)" }, + { "-ntomp", FALSE, etINT, {&hw_opt.nthreads_omp}, + "Number of OpenMP threads per MPI process/thread to start (0 is guess)" }, + { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme}, + "Number of OpenMP threads per MPI process/thread to start (0 is -ntomp)" }, - { "-pin", FALSE, etBOOL, {&hw_opt.bThreadPinning}, - "Pin OpenMP threads to cores" }, - { "-pinht", FALSE, etBOOL, {&hw_opt.bPinHyperthreading}, - "Always pin threads to Hyper-Threading cores" }, ++ { "-pin", FALSE, etENUM, {thread_aff_opt}, ++ "Fix threads (or processes) to specific cores" }, + { "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset}, - "Core offset for pinning (for running multiple mdrun processes on a single physical node)" }, ++ "The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" }, ++ { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride}, ++ "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" }, + { "-gpu_id", FALSE, etSTR, {&hw_opt.gpu_id}, + "List of GPU id's to use" }, + { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck}, + "Check for all bonded interactions with DD" }, + { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm}, + "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" }, + { "-rdd", FALSE, etREAL, {&rdd}, + "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" }, + { "-rcon", FALSE, etREAL, {&rconstr}, + "Maximum distance for P-LINCS (nm), 0 is estimate" }, + { "-dlb", FALSE, etENUM, {dddlb_opt}, + "Dynamic load balancing (with DD)" }, + { "-dds", FALSE, etREAL, {&dlb_scale}, + "Minimum allowed dlb scaling of the DD cell size" }, + { "-ddcsx", FALSE, etSTR, {&ddcsx}, + "HIDDENThe DD cell sizes in x" }, + { "-ddcsy", FALSE, etSTR, {&ddcsy}, + "HIDDENThe DD cell sizes in y" }, + { "-ddcsz", FALSE, etSTR, {&ddcsz}, + "HIDDENThe DD cell sizes in z" }, + { "-gcom", FALSE, etINT,{&nstglobalcomm}, + "Global communication frequency" }, + { "-nb", FALSE, etENUM, {&nbpu_opt}, + "Calculate non-bonded interactions on" }, + { "-tunepme", FALSE, etBOOL, {&bTunePME}, + "Optimize PME load between PP/PME nodes or GPU/CPU" }, + { "-testverlet", FALSE, etBOOL, {&bTestVerlet}, + "Test the Verlet non-bonded scheme" }, + { "-v", FALSE, etBOOL,{&bVerbose}, + "Be loud and noisy" }, + { "-compact", FALSE, etBOOL,{&bCompact}, + "Write a compact log file" }, + { "-seppot", FALSE, etBOOL, {&bSepPot}, + "Write separate V and dVdl terms for each interaction type and node to the log file(s)" }, + { "-pforce", FALSE, etREAL, {&pforce}, + "Print all forces larger than this (kJ/mol nm)" }, + { "-reprod", FALSE, etBOOL,{&bReproducible}, + "Try to avoid optimizations that affect binary reproducibility" }, + { "-cpt", FALSE, etREAL, {&cpt_period}, + "Checkpoint interval (minutes)" }, + { "-cpnum", FALSE, etBOOL, {&bKeepAndNumCPT}, + "Keep and number checkpoint files" }, + { "-append", FALSE, etBOOL, {&bAppendFiles}, + "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" }, + { "-nsteps", FALSE, etINT, {&nsteps}, + "Run this number of steps, overrides .mdp file option" }, + { "-maxh", FALSE, etREAL, {&max_hours}, + "Terminate after 0.99 times this time (hours)" }, + { "-multi", FALSE, etINT,{&nmultisim}, + "Do multiple simulations in parallel" }, + { "-replex", FALSE, etINT, {&repl_ex_nst}, + "Attempt replica exchange periodically with this period (steps)" }, + { "-nex", FALSE, etINT, {&repl_ex_nex}, + "Number of random exchanges to carry out each exchange interval (N^3 is one suggestion). -nex zero or not specified gives neighbor replica exchange." }, + { "-reseed", FALSE, etINT, {&repl_ex_seed}, + "Seed for replica exchange, -1 is generate a seed" }, + { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite}, + "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" }, + { "-ionize", FALSE, etBOOL,{&bIonize}, + "Do a simulation including the effect of an X-Ray bombardment on your system" }, + { "-confout", FALSE, etBOOL, {&bConfout}, + "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" }, + { "-stepout", FALSE, etINT, {&nstepout}, + "HIDDENFrequency of writing the remaining runtime" }, + { "-resetstep", FALSE, etINT, {&resetstep}, + "HIDDENReset cycle counters after these many time steps" }, + { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay}, + "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" } + }; + gmx_edsam_t ed; + unsigned long Flags, PCA_Flags; + ivec ddxyz; + int dd_node_order; + gmx_bool bAddPart; + FILE *fplog,*fpmulti; + int sim_part,sim_part_fn; + const char *part_suffix=".part"; + char suffix[STRLEN]; + int rc; + char **multidir=NULL; + + + cr = init_par(&argc,&argv); + + if (MASTER(cr)) + CopyRight(stderr, argv[0]); + + PCA_Flags = (PCA_CAN_SET_DEFFNM | (MASTER(cr) ? 0 : PCA_QUIET)); + + /* Comment this in to do fexist calls only on master + * works not with rerun or tables at the moment + * also comment out the version of init_forcerec in md.c + * with NULL instead of opt2fn + */ + /* + if (!MASTER(cr)) + { + PCA_Flags |= PCA_NOT_READ_NODE; + } + */ + + parse_common_args(&argc,argv,PCA_Flags, NFILE,fnm,asize(pa),pa, + asize(desc),desc,0,NULL, &oenv); + + - + /* we set these early because they might be used in init_multisystem() + Note that there is the potential for npme>nnodes until the number of + threads is set later on, if there's thread parallelization. That shouldn't + lead to problems. */ + dd_node_order = nenum(ddno_opt); + cr->npmenodes = npme; + ++ hw_opt.thread_affinity = nenum(thread_aff_opt); ++ + /* now check the -multi and -multidir option */ + if (opt2bSet("-multidir", NFILE, fnm)) + { + int i; + if (nmultisim > 0) + { + gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive."); + } + nmultisim = opt2fns(&multidir, "-multidir", NFILE, fnm); + } + + + if (repl_ex_nst != 0 && nmultisim < 2) + gmx_fatal(FARGS,"Need at least two replicas for replica exchange (option -multi)"); + + if (repl_ex_nex < 0) + gmx_fatal(FARGS,"Replica exchange number of exchanges needs to be positive"); + + if (nmultisim > 1) { +#ifndef GMX_THREAD_MPI + gmx_bool bParFn = (multidir == NULL); + init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn); +#else + gmx_fatal(FARGS,"mdrun -multi is not supported with the thread library.Please compile GROMACS with MPI support"); +#endif + } + + bAddPart = !bAppendFiles; + + /* Check if there is ANY checkpoint file available */ + sim_part = 1; + sim_part_fn = sim_part; + if (opt2bSet("-cpi",NFILE,fnm)) + { + if (bSepPot && bAppendFiles) + { + gmx_fatal(FARGS,"Output file appending is not supported with -seppot"); + } + + bAppendFiles = + read_checkpoint_simulation_part(opt2fn_master("-cpi", NFILE, + fnm,cr), + &sim_part_fn,NULL,cr, + bAppendFiles,NFILE,fnm, + part_suffix,&bAddPart); + if (sim_part_fn==0 && MULTIMASTER(cr)) + { + fprintf(stdout,"No previous checkpoint file present, assuming this is a new run.\n"); + } + else + { + sim_part = sim_part_fn + 1; + } + + if (MULTISIM(cr) && MASTER(cr)) + { + if (MULTIMASTER(cr)) + { + /* Log file is not yet available, so if there's a + * problem we can only write to stderr. */ + fpmulti = stderr; + } + else + { + fpmulti = NULL; + } + check_multi_int(fpmulti,cr->ms,sim_part,"simulation part",TRUE); + } + } + else + { + bAppendFiles = FALSE; + } + + if (!bAppendFiles) + { + sim_part_fn = sim_part; + } + + if (bAddPart) + { + /* Rename all output files (except checkpoint files) */ + /* create new part name first (zero-filled) */ + sprintf(suffix,"%s%04d",part_suffix,sim_part_fn); + + add_suffix_to_output_names(fnm,NFILE,suffix); + if (MULTIMASTER(cr)) + { + fprintf(stdout,"Checkpoint file is from part %d, new output files will be suffixed '%s'.\n",sim_part-1,suffix); + } + } + + Flags = opt2bSet("-rerun",NFILE,fnm) ? MD_RERUN : 0; + Flags = Flags | (bSepPot ? MD_SEPPOT : 0); + Flags = Flags | (bIonize ? MD_IONIZE : 0); + Flags = Flags | (bPartDec ? MD_PARTDEC : 0); + Flags = Flags | (bDDBondCheck ? MD_DDBONDCHECK : 0); + Flags = Flags | (bDDBondComm ? MD_DDBONDCOMM : 0); + Flags = Flags | (bTunePME ? MD_TUNEPME : 0); + Flags = Flags | (bTestVerlet ? MD_TESTVERLET : 0); + Flags = Flags | (bConfout ? MD_CONFOUT : 0); + Flags = Flags | (bRerunVSite ? MD_RERUN_VSITE : 0); + Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0); + Flags = Flags | (bAppendFiles ? MD_APPENDFILES : 0); + Flags = Flags | (opt2parg_bSet("-append", asize(pa),pa) ? MD_APPENDFILESSET : 0); + Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0); + Flags = Flags | (sim_part>1 ? MD_STARTFROMCPT : 0); + Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0); + + + /* We postpone opening the log file if we are appending, so we can + first truncate the old log file and append to the correct position + there instead. */ + if ((MASTER(cr) || bSepPot) && !bAppendFiles) + { + gmx_log_open(ftp2fn(efLOG,NFILE,fnm),cr, + !bSepPot,Flags & MD_APPENDFILES,&fplog); + CopyRight(fplog,argv[0]); + please_cite(fplog,"Hess2008b"); + please_cite(fplog,"Spoel2005a"); + please_cite(fplog,"Lindahl2001a"); + please_cite(fplog,"Berendsen95a"); + } + else if (!MASTER(cr) && bSepPot) + { + gmx_log_open(ftp2fn(efLOG,NFILE,fnm),cr,!bSepPot,Flags,&fplog); + } + else + { + fplog = NULL; + } + + ddxyz[XX] = (int)(realddxyz[XX] + 0.5); + ddxyz[YY] = (int)(realddxyz[YY] + 0.5); + ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5); + + rc = mdrunner(&hw_opt, fplog,cr,NFILE,fnm,oenv,bVerbose,bCompact, + nstglobalcomm, ddxyz,dd_node_order,rdd,rconstr, + dddlb_opt[0],dlb_scale,ddcsx,ddcsy,ddcsz, + nbpu_opt[0], + nsteps,nstepout,resetstep, + nmultisim,repl_ex_nst,repl_ex_nex,repl_ex_seed, + pforce, cpt_period,max_hours,deviceOptions,Flags); + + gmx_finalize_par(); + + if (MULTIMASTER(cr)) { + thanx(stderr); + } + + /* Log file has to be closed in mdrunner if we are appending to it + (fplog not set here) */ + if (MASTER(cr) && !bAppendFiles) + { + gmx_log_close(fplog); + } + + return rc; +} + diff --cc src/programs/mdrun/runner.c index 40b48cd691,0000000000..6b712de3d3 mode 100644,000000..100644 --- a/src/programs/mdrun/runner.c +++ b/src/programs/mdrun/runner.c @@@ -1,1990 -1,0 +1,1683 @@@ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- + * + * + * This source code is part of + * + * G R O M A C S + * + * GROningen MAchine for Chemical Simulations + * + * VERSION 3.2.0 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team, + * check out http://www.gromacs.org for more information. + + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * If you want to redistribute modifications, please consider that + * scientific software is very special. Version control is crucial - + * bugs must be traceable. We will be happy to consider code for + * inclusion in the official distribution, but derived work must not + * be called official GROMACS. Details are found in the README & COPYING + * files - if they are missing, get the official version at www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the papers on the package - you can find them in the top README file. + * + * For more info, check our website at http://www.gromacs.org + * + * And Hey: + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon + */ +#ifdef HAVE_CONFIG_H +#include +#endif - #if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY) - #define _GNU_SOURCE - #include - #include - #endif +#include +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#include +#include + +#include "typedefs.h" +#include "smalloc.h" +#include "sysstuff.h" +#include "statutil.h" +#include "mdrun.h" +#include "md_logging.h" +#include "md_support.h" +#include "network.h" +#include "pull.h" +#include "pull_rotation.h" +#include "names.h" +#include "disre.h" +#include "orires.h" +#include "pme.h" +#include "mdatoms.h" +#include "repl_ex.h" +#include "qmmm.h" +#include "domdec.h" +#include "partdec.h" +#include "coulomb.h" +#include "constr.h" +#include "mvdata.h" +#include "checkpoint.h" +#include "mtop_util.h" +#include "sighandler.h" +#include "tpxio.h" +#include "txtdump.h" +#include "gmx_detect_hardware.h" +#include "gmx_omp_nthreads.h" +#include "pull_rotation.h" +#include "calc_verletbuf.h" +#include "../mdlib/nbnxn_search.h" +#include "../mdlib/nbnxn_consts.h" +#include "gmx_fatal_collective.h" +#include "membed.h" +#include "macros.h" +#include "gmx_omp.h" - - #include "thread_mpi/threads.h" ++#include "gmx_thread_affinity.h" + +#ifdef GMX_LIB_MPI +#include +#endif +#ifdef GMX_THREAD_MPI +#include "tmpi.h" +#endif + +#ifdef GMX_FAHCORE +#include "corewrap.h" +#endif + +#include "gpu_utils.h" +#include "nbnxn_cuda_data_mgmt.h" + +typedef struct { + gmx_integrator_t *func; +} gmx_intp_t; + +/* The array should match the eI array in include/types/enums.h */ +const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}}; + +gmx_large_int_t deform_init_init_step_tpx; +matrix deform_init_box_tpx; +#ifdef GMX_THREAD_MPI +tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER; +#endif + + +#ifdef GMX_THREAD_MPI +struct mdrunner_arglist +{ + gmx_hw_opt_t *hw_opt; + FILE *fplog; + t_commrec *cr; + int nfile; + const t_filenm *fnm; + output_env_t oenv; + gmx_bool bVerbose; + gmx_bool bCompact; + int nstglobalcomm; + ivec ddxyz; + int dd_node_order; + real rdd; + real rconstr; + const char *dddlb_opt; + real dlb_scale; + const char *ddcsx; + const char *ddcsy; + const char *ddcsz; + const char *nbpu_opt; + int nsteps_cmdline; + int nstepout; + int resetstep; + int nmultisim; + int repl_ex_nst; + int repl_ex_nex; + int repl_ex_seed; + real pforce; + real cpt_period; + real max_hours; + const char *deviceOptions; + unsigned long Flags; + int ret; /* return value */ +}; + + +/* The function used for spawning threads. Extracts the mdrunner() + arguments from its one argument and calls mdrunner(), after making + a commrec. */ +static void mdrunner_start_fn(void *arg) +{ + struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg; + struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure + that it's thread-local. This doesn't + copy pointed-to items, of course, + but those are all const. */ + t_commrec *cr; /* we need a local version of this */ + FILE *fplog=NULL; + t_filenm *fnm; + + fnm = dup_tfn(mc.nfile, mc.fnm); + + cr = init_par_threads(mc.cr); + + if (MASTER(cr)) + { + fplog=mc.fplog; + } + + mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv, + mc.bVerbose, mc.bCompact, mc.nstglobalcomm, + mc.ddxyz, mc.dd_node_order, mc.rdd, + mc.rconstr, mc.dddlb_opt, mc.dlb_scale, + mc.ddcsx, mc.ddcsy, mc.ddcsz, + mc.nbpu_opt, + mc.nsteps_cmdline, mc.nstepout, mc.resetstep, + mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce, + mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags); +} + +/* called by mdrunner() to start a specific number of threads (including + the main thread) for thread-parallel runs. This in turn calls mdrunner() + for each thread. + All options besides nthreads are the same as for mdrunner(). */ +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt, + FILE *fplog,t_commrec *cr,int nfile, + const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, + gmx_bool bCompact, int nstglobalcomm, + ivec ddxyz,int dd_node_order,real rdd,real rconstr, + const char *dddlb_opt,real dlb_scale, + const char *ddcsx,const char *ddcsy,const char *ddcsz, + const char *nbpu_opt, + int nsteps_cmdline, int nstepout,int resetstep, + int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed, + real pforce,real cpt_period, real max_hours, + const char *deviceOptions, unsigned long Flags) +{ + int ret; + struct mdrunner_arglist *mda; + t_commrec *crn; /* the new commrec */ + t_filenm *fnmn; + + /* first check whether we even need to start tMPI */ + if (hw_opt->nthreads_tmpi < 2) + { + return cr; + } + + /* a few small, one-time, almost unavoidable memory leaks: */ + snew(mda,1); + fnmn=dup_tfn(nfile, fnm); + + /* fill the data structure to pass as void pointer to thread start fn */ + mda->hw_opt=hw_opt; + mda->fplog=fplog; + mda->cr=cr; + mda->nfile=nfile; + mda->fnm=fnmn; + mda->oenv=oenv; + mda->bVerbose=bVerbose; + mda->bCompact=bCompact; + mda->nstglobalcomm=nstglobalcomm; + mda->ddxyz[XX]=ddxyz[XX]; + mda->ddxyz[YY]=ddxyz[YY]; + mda->ddxyz[ZZ]=ddxyz[ZZ]; + mda->dd_node_order=dd_node_order; + mda->rdd=rdd; + mda->rconstr=rconstr; + mda->dddlb_opt=dddlb_opt; + mda->dlb_scale=dlb_scale; + mda->ddcsx=ddcsx; + mda->ddcsy=ddcsy; + mda->ddcsz=ddcsz; + mda->nbpu_opt=nbpu_opt; + mda->nsteps_cmdline=nsteps_cmdline; + mda->nstepout=nstepout; + mda->resetstep=resetstep; + mda->nmultisim=nmultisim; + mda->repl_ex_nst=repl_ex_nst; + mda->repl_ex_nex=repl_ex_nex; + mda->repl_ex_seed=repl_ex_seed; + mda->pforce=pforce; + mda->cpt_period=cpt_period; + mda->max_hours=max_hours; + mda->deviceOptions=deviceOptions; + mda->Flags=Flags; + + /* now spawn new threads that start mdrunner_start_fn(), while - the main thread returns */ - ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, - (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE), ++ the main thread returns, we set thread affinity later */ ++ ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE, + mdrunner_start_fn, (void*)(mda) ); + if (ret!=TMPI_SUCCESS) + return NULL; + + /* make a new comm_rec to reflect the new situation */ + crn=init_par_threads(cr); + return crn; +} + + +static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo, + const gmx_hw_opt_t *hw_opt, + int nthreads_tot, + int ngpu) +{ + int nthreads_tmpi; + + /* There are no separate PME nodes here, as we ensured in + * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes + * and a conditional ensures we would not have ended up here. + * Note that separate PME nodes might be switched on later. + */ + if (ngpu > 0) + { + nthreads_tmpi = ngpu; + if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi) + { + nthreads_tmpi = nthreads_tot; + } + } + else if (hw_opt->nthreads_omp > 0) + { + /* Here we could oversubscribe, when we do, we issue a warning later */ + nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp); + } + else + { + /* TODO choose nthreads_omp based on hardware topology + when we have a hardware topology detection library */ + /* In general, when running up to 4 threads, OpenMP should be faster. + * Note: on AMD Bulldozer we should avoid running OpenMP over two dies. + * On Intel>=Nehalem running OpenMP on a single CPU is always faster, + * even on two CPUs it's usually faster (but with many OpenMP threads + * it could be faster not to use HT, currently we always use HT). + * On Nehalem/Westmere we want to avoid running 16 threads over + * two CPUs with HT, so we need a limit<16; thus we use 12. + * A reasonable limit for Intel Sandy and Ivy bridge, + * not knowing the topology, is 16 threads. + */ + const int nthreads_omp_always_faster = 4; + const int nthreads_omp_always_faster_Nehalem = 12; + const int nthreads_omp_always_faster_SandyBridge = 16; + const int first_model_Nehalem = 0x1A; + const int first_model_SandyBridge = 0x2A; + gmx_bool bIntel_Family6; + + bIntel_Family6 = + (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL && + gmx_cpuid_family(hwinfo->cpuid_info) == 6); + + if (nthreads_tot <= nthreads_omp_always_faster || + (bIntel_Family6 && + ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) || + (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge)))) + { + /* Use pure OpenMP parallelization */ + nthreads_tmpi = 1; + } + else + { + /* Don't use OpenMP parallelization */ + nthreads_tmpi = nthreads_tot; + } + } + + return nthreads_tmpi; +} + + +/* Get the number of threads to use for thread-MPI based on how many + * were requested, which algorithms we're using, + * and how many particles there are. + * At the point we have already called check_and_update_hw_opt. + * Thus all options should be internally consistent and consistent + * with the hardware, except that ntmpi could be larger than #GPU. + */ +static int get_nthreads_mpi(gmx_hw_info_t *hwinfo, + gmx_hw_opt_t *hw_opt, + t_inputrec *inputrec, gmx_mtop_t *mtop, + const t_commrec *cr, + FILE *fplog) +{ + int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu; + int min_atoms_per_mpi_thread; + char *env; + char sbuf[STRLEN]; + gmx_bool bCanUseGPU; + + if (hw_opt->nthreads_tmpi > 0) + { + /* Trivial, return right away */ + return hw_opt->nthreads_tmpi; + } + + nthreads_hw = hwinfo->nthreads_hw_avail; + + /* How many total (#tMPI*#OpenMP) threads can we start? */ + if (hw_opt->nthreads_tot > 0) + { + nthreads_tot_max = hw_opt->nthreads_tot; + } + else + { + nthreads_tot_max = nthreads_hw; + } + + bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU); + if (bCanUseGPU) + { + ngpu = hwinfo->gpu_info.ncuda_dev_use; + } + else + { + ngpu = 0; + } + + nthreads_tmpi = + get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu); + + if (inputrec->eI == eiNM || EI_TPI(inputrec->eI)) + { + /* Steps are divided over the nodes iso splitting the atoms */ + min_atoms_per_mpi_thread = 0; + } + else + { + if (bCanUseGPU) + { + min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU; + } + else + { + min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD; + } + } + + /* Check if an algorithm does not support parallel simulation. */ + if (nthreads_tmpi != 1 && + ( inputrec->eI == eiLBFGS || + inputrec->coulombtype == eelEWALD ) ) + { + nthreads_tmpi = 1; + + md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n"); + if (hw_opt->nthreads_tmpi > nthreads_tmpi) + { + gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that"); + } + } + else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread) + { + /* the thread number was chosen automatically, but there are too many + threads (too few atoms per thread) */ + nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread); + + /* Avoid partial use of Hyper-Threading */ + if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED && + nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw) + { + nthreads_new = nthreads_hw/2; + } + + /* Avoid large prime numbers in the thread count */ + if (nthreads_new >= 6) + { + /* Use only 6,8,10 with additional factors of 2 */ + int fac; + + fac = 2; + while (3*fac*2 <= nthreads_new) + { + fac *= 2; + } + + nthreads_new = (nthreads_new/fac)*fac; + } + else + { + /* Avoid 5 */ + if (nthreads_new == 5) + { + nthreads_new = 4; + } + } + + nthreads_tmpi = nthreads_new; + + fprintf(stderr,"\n"); + fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n"); + fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi); + fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n"); + } + + return nthreads_tmpi; +} +#endif /* GMX_THREAD_MPI */ + + +/* Environment variable for setting nstlist */ +static const char* NSTLIST_ENVVAR = "GMX_NSTLIST"; +/* Try to increase nstlist when using a GPU with nstlist less than this */ +static const int NSTLIST_GPU_ENOUGH = 20; +/* Increase nstlist until the non-bonded cost increases more than this factor */ +static const float NBNXN_GPU_LIST_OK_FAC = 1.25; +/* Don't increase nstlist beyond a non-bonded cost increases of this factor */ +static const float NBNXN_GPU_LIST_MAX_FAC = 1.40; + +/* Try to increase nstlist when running on a GPU */ +static void increase_nstlist(FILE *fp,t_commrec *cr, + t_inputrec *ir,const gmx_mtop_t *mtop,matrix box) +{ + char *env; + int nstlist_orig,nstlist_prev; + verletbuf_list_setup_t ls; + real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev; + int i; + t_state state_tmp; + gmx_bool bBox,bDD,bCont; + const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n"; + const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used"; + const char *box_err="Can not increase nstlist for GPU run because the box is too small"; + const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations"; + char buf[STRLEN]; + + /* Number of + nstlist alternative values to try when switching */ + const int nstl[]={ 20, 25, 40, 50 }; +#define NNSTL sizeof(nstl)/sizeof(nstl[0]) + + env = getenv(NSTLIST_ENVVAR); + if (env == NULL) + { + if (fp != NULL) + { + fprintf(fp,nstl_fmt,ir->nstlist); + } + } + + if (ir->verletbuf_drift == 0) + { + gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp"); + } + + if (ir->verletbuf_drift < 0) + { + if (MASTER(cr)) + { + fprintf(stderr,"%s\n",vbd_err); + } + if (fp != NULL) + { + fprintf(fp,"%s\n",vbd_err); + } + + return; + } + + nstlist_orig = ir->nstlist; + if (env != NULL) + { + sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env); + if (MASTER(cr)) + { + fprintf(stderr,"%s\n",buf); + } + if (fp != NULL) + { + fprintf(fp,"%s\n",buf); + } + sscanf(env,"%d",&ir->nstlist); + } + + verletbuf_get_list_setup(TRUE,&ls); + + /* Allow rlist to make the list double the size of the cut-off sphere */ + rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box)); + rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc; + rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc; + if (debug) + { + fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n", + rlist_inc,rlist_max); + } + + i = 0; + nstlist_prev = nstlist_orig; + rlist_prev = ir->rlist; + do + { + if (env == NULL) + { + ir->nstlist = nstl[i]; + } + + /* Set the pair-list buffer size in ir */ + calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls, + NULL,&rlist_new); + + /* Does rlist fit in the box? */ + bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box)); + bDD = TRUE; + if (bBox && DOMAINDECOMP(cr)) + { + /* Check if rlist fits in the domain decomposition */ + if (inputrec2nboundeddim(ir) < DIM) + { + gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet"); + } + copy_mat(box,state_tmp.box); + bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new); + } + + bCont = FALSE; + + if (env == NULL) + { + if (bBox && bDD && rlist_new <= rlist_max) + { + /* Increase nstlist */ + nstlist_prev = ir->nstlist; + rlist_prev = rlist_new; + bCont = (i+1 < NNSTL && rlist_new < rlist_ok); + } + else + { + /* Stick with the previous nstlist */ + ir->nstlist = nstlist_prev; + rlist_new = rlist_prev; + bBox = TRUE; + bDD = TRUE; + } + } + + i++; + } + while (bCont); + + if (!bBox || !bDD) + { + gmx_warning(!bBox ? box_err : dd_err); + if (fp != NULL) + { + fprintf(fp,"\n%s\n",bBox ? box_err : dd_err); + } + ir->nstlist = nstlist_orig; + } + else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist) + { + sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g", + nstlist_orig,ir->nstlist, + ir->rlist,rlist_new); + if (MASTER(cr)) + { + fprintf(stderr,"%s\n\n",buf); + } + if (fp != NULL) + { + fprintf(fp,"%s\n\n",buf); + } + ir->rlist = rlist_new; + ir->rlistlong = rlist_new; + } +} + +static void prepare_verlet_scheme(FILE *fplog, + gmx_hw_info_t *hwinfo, + t_commrec *cr, + gmx_hw_opt_t *hw_opt, + const char *nbpu_opt, + t_inputrec *ir, + const gmx_mtop_t *mtop, + matrix box, + gmx_bool *bUseGPU) +{ + /* Here we only check for GPU usage on the MPI master process, + * as here we don't know how many GPUs we will use yet. + * We check for a GPU on all processes later. + */ + *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL); + + if (ir->verletbuf_drift > 0) + { + /* Update the Verlet buffer size for the current run setup */ + verletbuf_list_setup_t ls; + real rlist_new; + + /* Here we assume CPU acceleration is on. But as currently + * calc_verlet_buffer_size gives the same results for 4x8 and 4x4 + * and 4x2 gives a larger buffer than 4x4, this is ok. + */ + verletbuf_get_list_setup(*bUseGPU,&ls); + + calc_verlet_buffer_size(mtop,det(box),ir, + ir->verletbuf_drift,&ls, + NULL,&rlist_new); + if (rlist_new != ir->rlist) + { + if (fplog != NULL) + { + fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n", + ir->rlist,rlist_new, + ls.cluster_size_i,ls.cluster_size_j); + } + ir->rlist = rlist_new; + ir->rlistlong = rlist_new; + } + } + + /* With GPU or emulation we should check nstlist for performance */ + if ((EI_DYNAMICS(ir->eI) && + *bUseGPU && + ir->nstlist < NSTLIST_GPU_ENOUGH) || + getenv(NSTLIST_ENVVAR) != NULL) + { + /* Choose a better nstlist */ + increase_nstlist(fplog,cr,ir,mtop,box); + } +} + +static void convert_to_verlet_scheme(FILE *fplog, + t_inputrec *ir, + gmx_mtop_t *mtop,real box_vol) +{ + char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme"; + + md_print_warn(NULL,fplog,"%s\n",conv_mesg); + + ir->cutoff_scheme = ecutsVERLET; + ir->verletbuf_drift = 0.005; + + if (ir->rcoulomb != ir->rvdw) + { + gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs"); + } + + if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype)) + { + gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme"); + } + else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype)) + { + md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials"); + + if (EVDW_SWITCHED(ir->vdwtype)) + { + ir->vdwtype = evdwCUT; + } + if (EEL_SWITCHED(ir->coulombtype)) + { + if (EEL_FULL(ir->coulombtype)) + { + /* With full electrostatic only PME can be switched */ + ir->coulombtype = eelPME; + } + else + { + md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]); + ir->coulombtype = eelRF; + ir->epsilon_rf = 0.0; + } + } + + /* We set the target energy drift to a small number. + * Note that this is only for testing. For production the user + * should think about this and set the mdp options. + */ + ir->verletbuf_drift = 1e-4; + } + + if (inputrec2nboundeddim(ir) != 3) + { + gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc"); + } + + if (ir->efep != efepNO || ir->implicit_solvent != eisNO) + { + gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent"); + } + + if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO)) + { + verletbuf_list_setup_t ls; + + verletbuf_get_list_setup(FALSE,&ls); + calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls, + NULL,&ir->rlist); + } + else + { + ir->verletbuf_drift = -1; + ir->rlist = 1.05*max(ir->rvdw,ir->rcoulomb); + } + + gmx_mtop_remove_chargegroups(mtop); +} + - /* Check the process affinity mask. If it is non-zero, something - * else has set the affinity, and mdrun should honor that and - * not attempt to do its own thread pinning. - * - * This function should be called twice. Once before the OpenMP - * library gets initialized with bAfterOpenMPInit=FALSE (which will - * detect affinity set by external tools like taskset), and again - * later, after the OpenMP initialization, with bAfterOpenMPInit=TRUE - * (which will detect affinity changes made by the OpenMP library). - * - * Note that this will only work on Linux, because we use a GNU - * feature. */ - static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr, - gmx_hw_opt_t *hw_opt, int ncpus, - gmx_bool bAfterOpenmpInit) - { - #ifdef HAVE_SCHED_GETAFFINITY - cpu_set_t mask_current; - int i, ret, cpu_count, cpu_set; - gmx_bool bAllSet; - - assert(hw_opt); - if (!hw_opt->bThreadPinning) - { - /* internal affinity setting is off, don't bother checking process affinity */ - return; - } - - CPU_ZERO(&mask_current); - if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0) - { - /* failed to query affinity mask, will just return */ - if (debug) - { - fprintf(debug, "Failed to query affinity mask (error %d)", ret); - } - return; - } - - /* Before proceeding with the actual check, make sure that the number of - * detected CPUs is >= the CPUs in the current set. - * We need to check for CPU_COUNT as it was added only in glibc 2.6. */ - #ifdef CPU_COUNT - if (ncpus < CPU_COUNT(&mask_current)) - { - if (debug) - { - fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT", - ncpus, CPU_COUNT(&mask_current)); - } - return; - } - #endif /* CPU_COUNT */ - - bAllSet = TRUE; - for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++) - { - bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0); - } - - if (!bAllSet) - { - if (!bAfterOpenmpInit) - { - md_print_warn(cr, fplog, - "%s detected a non-default process affinity, " - "so it will not attempt to pin its threads", ShortProgram()); - } - else - { - md_print_warn(cr, fplog, - "%s detected a non-default process affinity, " - "probably set by the OpenMP library, " - "so it will not attempt to pin its threads", ShortProgram()); - } - hw_opt->bThreadPinning = FALSE; - - if (debug) - { - fprintf(debug, "Non-default affinity mask found, mdrun will not pin threads\n"); - } - } - else - { - if (debug) - { - fprintf(debug, "Default affinity mask found\n"); - } - } - #endif /* HAVE_SCHED_GETAFFINITY */ - } - - /* Set CPU affinity. Can be important for performance. - On some systems (e.g. Cray) CPU Affinity is set by default. - But default assigning doesn't work (well) with only some ranks - having threads. This causes very low performance. - External tools have cumbersome syntax for setting affinity - in the case that only some ranks have threads. - Thus it is important that GROMACS sets the affinity internally - if only PME is using threads. - */ - static void set_cpu_affinity(FILE *fplog, - const t_commrec *cr, - gmx_hw_opt_t *hw_opt, - int nthreads_pme, - const gmx_hw_info_t *hwinfo, - const t_inputrec *inputrec) - { - #if defined GMX_THREAD_MPI - /* With the number of TMPI threads equal to the number of cores - * we already pinned in thread-MPI, so don't pin again here. - */ - if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number()) - { - return; - } - #endif - - #ifndef __APPLE__ - /* If the tMPI thread affinity setting is not supported encourage the user - * to report it as it's either a bug or an exotic platform which we might - * want to support. */ - if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES) - { - md_print_warn(NULL, fplog, - "Can not set thread affinities on the current plarform. On NUMA systems this\n" - "can cause performance degradation. If you think your platform should support\n" - "setting affinities, contact the GROMACS developers."); - return; - } - #endif /* __APPLE__ */ - - if (hw_opt->bThreadPinning) - { - int nth_affinity_set, thread_id_node, thread_id, - nthread_local, nthread_node, nthread_hw_max, nphyscore; - int offset; - char *env; - - /* threads on this MPI process or TMPI thread */ - if (cr->duty & DUTY_PP) - { - nthread_local = gmx_omp_nthreads_get(emntNonbonded); - } - else - { - nthread_local = gmx_omp_nthreads_get(emntPME); - } - - /* map the current process to cores */ - thread_id_node = 0; - nthread_node = nthread_local; - #ifdef GMX_MPI - if (PAR(cr) || MULTISIM(cr)) - { - /* We need to determine a scan of the thread counts in this - * compute node. - */ - MPI_Comm comm_intra; - - MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode, - &comm_intra); - MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra); - /* MPI_Scan is inclusive, but here we need exclusive */ - thread_id_node -= nthread_local; - /* Get the total number of threads on this physical node */ - MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra); - MPI_Comm_free(&comm_intra); - } - #endif - - offset = 0; - if (hw_opt->core_pinning_offset > 0) - { - offset = hw_opt->core_pinning_offset; - if (SIMMASTER(cr)) - { - fprintf(stderr, "Applying core pinning offset %d\n", offset); - } - if (fplog) - { - fprintf(fplog, "Applying core pinning offset %d\n", offset); - } - } - - /* With Intel Hyper-Threading enabled, we want to pin consecutive - * threads to physical cores when using more threads than physical - * cores or when the user requests so. - */ - nthread_hw_max = hwinfo->nthreads_hw_avail; - nphyscore = -1; - if (hw_opt->bPinHyperthreading || - (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED && - nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL)) - { - if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED) - { - /* We print to stderr on all processes, as we might have - * different settings on different physical nodes. - */ - if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL) - { - md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, " - "but non-Intel CPU detected (vendor: %s)\n", - gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]); - } - else - { - md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, " - "but the CPU detected does not have Intel Hyper-Threading support " - "(or it is turned off)\n"); - } - } - nphyscore = nthread_hw_max/2; - - if (SIMMASTER(cr)) - { - fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n", - nphyscore); - } - if (fplog) - { - fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n", - nphyscore); - } - } - - /* Set the per-thread affinity. In order to be able to check the success - * of affinity settings, we will set nth_affinity_set to 1 on threads - * where the affinity setting succeded and to 0 where it failed. - * Reducing these 0/1 values over the threads will give the total number - * of threads on which we succeeded. - */ - nth_affinity_set = 0; - #pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \ - reduction(+:nth_affinity_set) - { - int core; - gmx_bool setaffinity_ret; - - thread_id = gmx_omp_get_thread_num(); - thread_id_node += thread_id; - if (nphyscore <= 0) - { - core = offset + thread_id_node; - } - else - { - /* Lock pairs of threads to the same hyperthreaded core */ - core = offset + thread_id_node/2 + (thread_id_node % 2)*nphyscore; - } - - setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core); - - /* store the per-thread success-values of the setaffinity */ - nth_affinity_set = (setaffinity_ret == 0); - - if (debug) - { - fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n", - cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret); - } - } - - if (nth_affinity_set > nthread_local) - { - char msg[STRLEN]; - - sprintf(msg, "Looks like we have set affinity for more threads than " - "we have (%d > %d)!\n", nth_affinity_set, nthread_local); - gmx_incons(msg); - } - else - { - /* check & warn if some threads failed to set their affinities */ - if (nth_affinity_set != nthread_local) - { - char sbuf1[STRLEN], sbuf2[STRLEN]; - - /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */ - sbuf1[0] = sbuf2[0] = '\0'; - #ifdef GMX_MPI - #ifdef GMX_THREAD_MPI - sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid); - #else /* GMX_LIB_MPI */ - sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid); - #endif - #endif /* GMX_MPI */ - - if (nthread_local > 1) - { - sprintf(sbuf2, "of %d/%d thread%s ", - nthread_local - nth_affinity_set, nthread_local, - (nthread_local - nth_affinity_set) > 1 ? "s" : ""); - } - - md_print_warn(NULL, fplog, - "NOTE: %sAffinity setting %sfailed.\n" - " This can cause performance degradation!", - sbuf1, sbuf2); - } - } - } - } - - +static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt, + int cutoff_scheme, + gmx_bool bIsSimMaster) +{ + gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster); + +#ifndef GMX_THREAD_MPI + if (hw_opt->nthreads_tot > 0) + { + gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI"); + } + if (hw_opt->nthreads_tmpi > 0) + { + gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI"); + } +#endif + + if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0) + { + /* We have the same number of OpenMP threads for PP and PME processes, + * thus we can perform several consistency checks. + */ + if (hw_opt->nthreads_tmpi > 0 && + hw_opt->nthreads_omp > 0 && + hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp) + { + gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested", + hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp); + } + + if (hw_opt->nthreads_tmpi > 0 && + hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0) + { + gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)", + hw_opt->nthreads_tot,hw_opt->nthreads_tmpi); + } + + if (hw_opt->nthreads_omp > 0 && + hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0) + { + gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)", + hw_opt->nthreads_tot,hw_opt->nthreads_omp); + } + + if (hw_opt->nthreads_tmpi > 0 && + hw_opt->nthreads_omp <= 0) + { + hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi; + } + } + +#ifndef GMX_OPENMP + if (hw_opt->nthreads_omp > 1) + { + gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support"); + } +#endif + + if (cutoff_scheme == ecutsGROUP) + { + /* We only have OpenMP support for PME only nodes */ + if (hw_opt->nthreads_omp > 1) + { + gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s", + ecutscheme_names[cutoff_scheme], + ecutscheme_names[ecutsVERLET]); + } + hw_opt->nthreads_omp = 1; + } + + if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0) + { + gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme"); + } + + if (hw_opt->nthreads_tot == 1) + { + hw_opt->nthreads_tmpi = 1; + + if (hw_opt->nthreads_omp > 1) + { + gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads", + hw_opt->nthreads_tmpi,hw_opt->nthreads_tot); + } + hw_opt->nthreads_omp = 1; + } + + if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0) + { + hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp; + } + + if (debug) + { + fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n", + hw_opt->nthreads_tot, + hw_opt->nthreads_tmpi, + hw_opt->nthreads_omp, + hw_opt->nthreads_omp_pme, + hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : ""); + + } +} + + +/* Override the value in inputrec with value passed on the command line (if any) */ +static void override_nsteps_cmdline(FILE *fplog, + int nsteps_cmdline, + t_inputrec *ir, + const t_commrec *cr) +{ + assert(ir); + assert(cr); + + /* override with anything else than the default -2 */ + if (nsteps_cmdline > -2) + { + char stmp[STRLEN]; + + ir->nsteps = nsteps_cmdline; + if (EI_DYNAMICS(ir->eI)) + { + sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps", + nsteps_cmdline, nsteps_cmdline*ir->delta_t); + } + else + { + sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps", + nsteps_cmdline); + } + + md_print_warn(cr, fplog, "%s\n", stmp); + } +} + +/* Data structure set by SIMMASTER which needs to be passed to all nodes + * before the other nodes have read the tpx file and called gmx_detect_hardware. + */ +typedef struct { + int cutoff_scheme; /* The cutoff scheme from inputrec_t */ + gmx_bool bUseGPU; /* Use GPU or GPU emulation */ +} master_inf_t; + +int mdrunner(gmx_hw_opt_t *hw_opt, + FILE *fplog,t_commrec *cr,int nfile, + const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, + gmx_bool bCompact, int nstglobalcomm, + ivec ddxyz,int dd_node_order,real rdd,real rconstr, + const char *dddlb_opt,real dlb_scale, + const char *ddcsx,const char *ddcsy,const char *ddcsz, + const char *nbpu_opt, + int nsteps_cmdline, int nstepout,int resetstep, + int nmultisim,int repl_ex_nst,int repl_ex_nex, + int repl_ex_seed, real pforce,real cpt_period,real max_hours, + const char *deviceOptions, unsigned long Flags) +{ + gmx_bool bForceUseGPU,bTryUseGPU; + double nodetime=0,realtime; + t_inputrec *inputrec; + t_state *state=NULL; + matrix box; + gmx_ddbox_t ddbox={0}; + int npme_major,npme_minor; + real tmpr1,tmpr2; + t_nrnb *nrnb; + gmx_mtop_t *mtop=NULL; + t_mdatoms *mdatoms=NULL; + t_forcerec *fr=NULL; + t_fcdata *fcd=NULL; + real ewaldcoeff=0; + gmx_pme_t *pmedata=NULL; + gmx_vsite_t *vsite=NULL; + gmx_constr_t constr; + int i,m,nChargePerturbed=-1,status,nalloc; + char *gro; + gmx_wallcycle_t wcycle; + gmx_bool bReadRNG,bReadEkin; + int list; + gmx_runtime_t runtime; + int rc; + gmx_large_int_t reset_counters; + gmx_edsam_t ed=NULL; + t_commrec *cr_old=cr; + int nthreads_pme=1; + int nthreads_pp=1; + gmx_membed_t membed=NULL; + gmx_hw_info_t *hwinfo=NULL; + master_inf_t minf={-1,FALSE}; + + /* CAUTION: threads may be started later on in this function, so + cr doesn't reflect the final parallel state right now */ + snew(inputrec,1); + snew(mtop,1); + + if (Flags & MD_APPENDFILES) + { + fplog = NULL; + } + + bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0); + bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU; + + snew(state,1); + if (SIMMASTER(cr)) + { + /* Read (nearly) all data required for the simulation */ + read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop); + + if (inputrec->cutoff_scheme != ecutsVERLET && + ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL)) + { + convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box)); + } + + /* Detect hardware, gather information. With tMPI only thread 0 does it + * and after threads are started broadcasts hwinfo around. */ + snew(hwinfo, 1); + gmx_detect_hardware(fplog, hwinfo, cr, + bForceUseGPU, bTryUseGPU, hw_opt->gpu_id); + + minf.cutoff_scheme = inputrec->cutoff_scheme; + minf.bUseGPU = FALSE; + + if (inputrec->cutoff_scheme == ecutsVERLET) + { + prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt, + inputrec,mtop,state->box, + &minf.bUseGPU); + } + else if (hwinfo->bCanUseGPU) + { + md_print_warn(cr,fplog, + "NOTE: GPU(s) found, but the current simulation can not use GPUs\n" + " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n" + " (for quick performance testing you can use the -testverlet option)\n"); + + if (bForceUseGPU) + { + gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet"); + } + } + } +#ifndef GMX_THREAD_MPI + if (PAR(cr)) + { + gmx_bcast_sim(sizeof(minf),&minf,cr); + } +#endif + if (minf.bUseGPU && cr->npmenodes == -1) + { + /* Don't automatically use PME-only nodes with GPUs */ + cr->npmenodes = 0; + } + + /* Check for externally set OpenMP affinity and turn off internal + * pinning if any is found. We need to do this check early to tell + * thread-MPI whether it should do pinning when spawning threads. ++ * TODO: the above no longer holds, we should move these checks down + */ + gmx_omp_check_thread_affinity(fplog, cr, hw_opt); + +#ifdef GMX_THREAD_MPI + /* With thread-MPI inputrec is only set here on the master thread */ + if (SIMMASTER(cr)) +#endif + { + check_and_update_hw_opt(hw_opt,minf.cutoff_scheme,SIMMASTER(cr)); + +#ifdef GMX_THREAD_MPI + /* Early check for externally set process affinity. Can't do over all + * MPI processes because hwinfo is not available everywhere, but with + * thread-MPI it's needed as pinning might get turned off which needs + * to be known before starting thread-MPI. */ - check_cpu_affinity_set(fplog, - NULL, - hw_opt, hwinfo->nthreads_hw_avail, FALSE); ++ gmx_check_thread_affinity_set(fplog, ++ NULL, ++ hw_opt, hwinfo->nthreads_hw_avail, FALSE); +#endif + +#ifdef GMX_THREAD_MPI + if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0) + { + gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes"); + } +#endif + + if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp && + cr->npmenodes <= 0) + { + gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes"); + } + } + +#ifdef GMX_THREAD_MPI + if (SIMMASTER(cr)) + { + /* NOW the threads will be started: */ + hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo, + hw_opt, + inputrec, mtop, + cr, fplog); + if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0) + { + hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi; + } + + if (hw_opt->nthreads_tmpi > 1) + { + /* now start the threads. */ + cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm, + oenv, bVerbose, bCompact, nstglobalcomm, + ddxyz, dd_node_order, rdd, rconstr, + dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz, + nbpu_opt, + nsteps_cmdline, nstepout, resetstep, nmultisim, + repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce, + cpt_period, max_hours, deviceOptions, + Flags); + /* the main thread continues here with a new cr. We don't deallocate + the old cr because other threads may still be reading it. */ + if (cr == NULL) + { + gmx_comm("Failed to spawn threads"); + } + } + } +#endif + /* END OF CAUTION: cr is now reliable */ + + /* g_membed initialisation * + * Because we change the mtop, init_membed is called before the init_parallel * + * (in case we ever want to make it run in parallel) */ + if (opt2bSet("-membed",nfile,fnm)) + { + if (MASTER(cr)) + { + fprintf(stderr,"Initializing membed"); + } + membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period); + } + + if (PAR(cr)) + { + /* now broadcast everything to the non-master nodes/threads: */ + init_parallel(fplog, cr, inputrec, mtop); + + /* This check needs to happen after get_nthreads_mpi() */ + if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC)) + { + gmx_fatal_collective(FARGS,cr,NULL, + "The Verlet cut-off scheme is not supported with particle decomposition.\n" + "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads."); + } + } + if (fplog != NULL) + { + pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE); + } + +#if defined GMX_THREAD_MPI + /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer + * to the other threads -- slightly uncool, but works fine, just need to + * make sure that the data doesn't get freed twice. */ + if (cr->nnodes > 1) + { + if (!SIMMASTER(cr)) + { + snew(hwinfo, 1); + } + gmx_bcast(sizeof(&hwinfo), &hwinfo, cr); + } +#else + if (PAR(cr) && !SIMMASTER(cr)) + { + /* now we have inputrec on all nodes, can run the detection */ + /* TODO: perhaps it's better to propagate within a node instead? */ + snew(hwinfo, 1); + gmx_detect_hardware(fplog, hwinfo, cr, + bForceUseGPU, bTryUseGPU, hw_opt->gpu_id); + } + + /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */ - check_cpu_affinity_set(fplog, cr, - hw_opt, hwinfo->nthreads_hw_avail, FALSE); ++ gmx_check_thread_affinity_set(fplog, cr, ++ hw_opt, hwinfo->nthreads_hw_avail, FALSE); +#endif + + /* now make sure the state is initialized and propagated */ + set_state_entries(state,inputrec,cr->nnodes); + + /* A parallel command line option consistency check that we can + only do after any threads have started. */ + if (!PAR(cr) && + (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0)) + { + gmx_fatal(FARGS, + "The -dd or -npme option request a parallel simulation, " +#ifndef GMX_MPI + "but %s was compiled without threads or MPI enabled" +#else +#ifdef GMX_THREAD_MPI + "but the number of threads (option -nt) is 1" +#else + "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec" +#endif +#endif + , ShortProgram() + ); + } + + if ((Flags & MD_RERUN) && + (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI)) + { + gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun"); + } + + if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr)) + { + /* All-vs-all loops do not work with domain decomposition */ + Flags |= MD_PARTDEC; + } + + if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC)) + { + if (cr->npmenodes > 0) + { + if (!EEL_PME(inputrec->coulombtype)) + { + gmx_fatal_collective(FARGS,cr,NULL, + "PME nodes are requested, but the system does not use PME electrostatics"); + } + if (Flags & MD_PARTDEC) + { + gmx_fatal_collective(FARGS,cr,NULL, + "PME nodes are requested, but particle decomposition does not support separate PME nodes"); + } + } + + cr->npmenodes = 0; + } + +#ifdef GMX_FAHCORE + fcRegisterSteps(inputrec->nsteps,inputrec->init_step); +#endif + + /* NMR restraints must be initialized before load_checkpoint, + * since with time averaging the history is added to t_state. + * For proper consistency check we therefore need to extend + * t_state here. + * So the PME-only nodes (if present) will also initialize + * the distance restraints. + */ + snew(fcd,1); + + /* This needs to be called before read_checkpoint to extend the state */ + init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state, repl_ex_nst > 0); + + if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0) + { + if (PAR(cr) && !(Flags & MD_PARTDEC)) + { + gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)"); + } + /* Orientation restraints */ + if (MASTER(cr)) + { + init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires), + state); + } + } + + if (DEFORM(*inputrec)) + { + /* Store the deform reference box before reading the checkpoint */ + if (SIMMASTER(cr)) + { + copy_mat(state->box,box); + } + if (PAR(cr)) + { + gmx_bcast(sizeof(box),box,cr); + } + /* Because we do not have the update struct available yet + * in which the reference values should be stored, + * we store them temporarily in static variables. + * This should be thread safe, since they are only written once + * and with identical values. + */ +#ifdef GMX_THREAD_MPI + tMPI_Thread_mutex_lock(&deform_init_box_mutex); +#endif + deform_init_init_step_tpx = inputrec->init_step; + copy_mat(box,deform_init_box_tpx); +#ifdef GMX_THREAD_MPI + tMPI_Thread_mutex_unlock(&deform_init_box_mutex); +#endif + } + + if (opt2bSet("-cpi",nfile,fnm)) + { + /* Check if checkpoint file exists before doing continuation. + * This way we can use identical input options for the first and subsequent runs... + */ + if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) ) + { + load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog, + cr,Flags & MD_PARTDEC,ddxyz, + inputrec,state,&bReadRNG,&bReadEkin, + (Flags & MD_APPENDFILES), + (Flags & MD_APPENDFILESSET)); + + if (bReadRNG) + { + Flags |= MD_READ_RNG; + } + if (bReadEkin) + { + Flags |= MD_READ_EKIN; + } + } + } + + if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES)) +#ifdef GMX_THREAD_MPI + /* With thread MPI only the master node/thread exists in mdrun.c, + * therefore non-master nodes need to open the "seppot" log file here. + */ + || (!MASTER(cr) && (Flags & MD_SEPPOT)) +#endif + ) + { + gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT), + Flags,&fplog); + } + + /* override nsteps with value from cmdline */ + override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr); + + if (SIMMASTER(cr)) + { + copy_mat(state->box,box); + } + + if (PAR(cr)) + { + gmx_bcast(sizeof(box),box,cr); + } + + /* Essential dynamics */ + if (opt2bSet("-ei",nfile,fnm)) + { + /* Open input and output files, allocate space for ED data structure */ + ed = ed_open(mtop->natoms,&state->edsamstate,nfile,fnm,Flags,oenv,cr); + } + + if (PAR(cr) && !((Flags & MD_PARTDEC) || + EI_TPI(inputrec->eI) || + inputrec->eI == eiNM)) + { + cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr, + dddlb_opt,dlb_scale, + ddcsx,ddcsy,ddcsz, + mtop,inputrec, + box,state->x, + &ddbox,&npme_major,&npme_minor); + + make_dd_communicators(fplog,cr,dd_node_order); + + /* Set overallocation to avoid frequent reallocation of arrays */ + set_over_alloc_dd(TRUE); + } + else + { + /* PME, if used, is done on all nodes with 1D decomposition */ + cr->npmenodes = 0; + cr->duty = (DUTY_PP | DUTY_PME); + npme_major = 1; + npme_minor = 1; + if (!EI_TPI(inputrec->eI)) + { + npme_major = cr->nnodes; + } + + if (inputrec->ePBC == epbcSCREW) + { + gmx_fatal(FARGS, + "pbc=%s is only implemented with domain decomposition", + epbc_names[inputrec->ePBC]); + } + } + + if (PAR(cr)) + { + /* After possible communicator splitting in make_dd_communicators. + * we can set up the intra/inter node communication. + */ + gmx_setup_nodecomm(fplog,cr); + } + + /* Initialize per-physical-node MPI process/thread ID and counters. */ + gmx_init_intranode_counters(cr); + +#ifdef GMX_MPI + md_print_info(cr,fplog,"Using %d MPI %s\n", + cr->nnodes, +#ifdef GMX_THREAD_MPI + cr->nnodes==1 ? "thread" : "threads" +#else + cr->nnodes==1 ? "process" : "processes" +#endif + ); + fflush(stderr); +#endif + + gmx_omp_nthreads_init(fplog, cr, + hwinfo->nthreads_hw_avail, + hw_opt->nthreads_omp, + hw_opt->nthreads_omp_pme, + (cr->duty & DUTY_PP) == 0, + inputrec->cutoff_scheme == ecutsVERLET); + + gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU); + + /* getting number of PP/PME threads + PME: env variable should be read only on one node to make sure it is + identical everywhere; + */ + /* TODO nthreads_pp is only used for pinning threads. + * This is a temporary solution until we have a hw topology library. + */ + nthreads_pp = gmx_omp_nthreads_get(emntNonbonded); + nthreads_pme = gmx_omp_nthreads_get(emntPME); + + wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme); + + if (PAR(cr)) + { + /* Master synchronizes its value of reset_counters with all nodes + * including PME only nodes */ + reset_counters = wcycle_get_reset_counters(wcycle); + gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr); + wcycle_set_reset_counters(wcycle, reset_counters); + } + + snew(nrnb,1); + if (cr->duty & DUTY_PP) + { + /* For domain decomposition we allocate dynamically + * in dd_partition_system. + */ + if (DOMAINDECOMP(cr)) + { + bcast_state_setup(cr,state); + } + else + { + if (PAR(cr)) + { + bcast_state(cr,state,TRUE); + } + } + + /* Initiate forcerecord */ + fr = mk_forcerec(); + fr->hwinfo = hwinfo; + init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE, + opt2fn("-table",nfile,fnm), + opt2fn("-tabletf",nfile,fnm), + opt2fn("-tablep",nfile,fnm), + opt2fn("-tableb",nfile,fnm), + nbpu_opt, + FALSE,pforce); + + /* version for PCA_NOT_READ_NODE (see md.c) */ + /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE, + "nofile","nofile","nofile","nofile",FALSE,pforce); + */ + fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT); + + /* Initialize QM-MM */ + if(fr->bQMMM) + { + init_QMMMrec(cr,box,mtop,inputrec,fr); + } + + /* Initialize the mdatoms structure. + * mdatoms is not filled with atom data, + * as this can not be done now with domain decomposition. + */ + mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO); + + /* Initialize the virtual site communication */ + vsite = init_vsite(mtop,cr,FALSE); + + calc_shifts(box,fr->shift_vec); + + /* With periodic molecules the charge groups should be whole at start up + * and the virtual sites should not be far from their proper positions. + */ + if (!inputrec->bContinuation && MASTER(cr) && + !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols)) + { + /* Make molecules whole at start of run */ + if (fr->ePBC != epbcNONE) + { + do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x); + } + if (vsite) + { + /* Correct initial vsite positions are required + * for the initial distribution in the domain decomposition + * and for the initial shell prediction. + */ + construct_vsites_mtop(fplog,vsite,mtop,state->x); + } + } + + if (EEL_PME(fr->eeltype)) + { + ewaldcoeff = fr->ewaldcoeff; + pmedata = &fr->pmedata; + } + else + { + pmedata = NULL; + } + } + else + { + /* This is a PME only node */ + + /* We don't need the state */ + done_state(state); + + ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol); + snew(pmedata,1); + } + - /* Before setting affinity, check whether the affinity has changed - * - which indicates that probably the OpenMP library has changed it since - * we first checked). */ - check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE); ++ if (hw_opt->thread_affinity != threadaffOFF) ++ { ++ /* Before setting affinity, check whether the affinity has changed ++ * - which indicates that probably the OpenMP library has changed it ++ * since we first checked). ++ */ ++ gmx_check_thread_affinity_set(fplog, cr, ++ hw_opt, hwinfo->nthreads_hw_avail, TRUE); + - /* Set the CPU affinity */ - set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec); ++ /* Set the CPU affinity */ ++ gmx_set_thread_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec); ++ } + + /* Initiate PME if necessary, + * either on all nodes or on dedicated PME nodes only. */ + if (EEL_PME(inputrec->coulombtype)) + { + if (mdatoms) + { + nChargePerturbed = mdatoms->nChargePerturbed; + } + if (cr->npmenodes > 0) + { + /* The PME only nodes need to know nChargePerturbed */ + gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr); + } + + if (cr->duty & DUTY_PME) + { + status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec, + mtop ? mtop->natoms : 0,nChargePerturbed, + (Flags & MD_REPRODUCIBLE),nthreads_pme); + if (status != 0) + { + gmx_fatal(FARGS,"Error %d initializing PME",status); + } + } + } + + + if (integrator[inputrec->eI].func == do_md) + { + /* Turn on signal handling on all nodes */ + /* + * (A user signal from the PME nodes (if any) + * is communicated to the PP nodes. + */ + signal_handler_install(); + } + + if (cr->duty & DUTY_PP) + { + if (inputrec->ePull != epullNO) + { + /* Initialize pull code */ + init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda, + EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags); + } + + if (inputrec->bRot) + { + /* Initialize enforced rotation code */ + init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv, + bVerbose,Flags); + } + + constr = init_constraints(fplog,mtop,inputrec,ed,state,cr); + + if (DOMAINDECOMP(cr)) + { + dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec, + Flags & MD_DDBONDCHECK,fr->cginfo_mb); + + set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox); + + setup_dd_grid(fplog,cr->dd); + } + + /* Now do whatever the user wants us to do (how flexible...) */ + integrator[inputrec->eI].func(fplog,cr,nfile,fnm, + oenv,bVerbose,bCompact, + nstglobalcomm, + vsite,constr, + nstepout,inputrec,mtop, + fcd,state, + mdatoms,nrnb,wcycle,ed,fr, + repl_ex_nst,repl_ex_nex,repl_ex_seed, + membed, + cpt_period,max_hours, + deviceOptions, + Flags, + &runtime); + + if (inputrec->ePull != epullNO) + { + finish_pull(fplog,inputrec->pull); + } + + if (inputrec->bRot) + { + finish_rot(fplog,inputrec->rot); + } + + } + else + { + /* do PME only */ + gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec); + } + + if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI)) + { + /* Some timing stats */ + if (SIMMASTER(cr)) + { + if (runtime.proc == 0) + { + runtime.proc = runtime.real; + } + } + else + { + runtime.real = 0; + } + } + + wallcycle_stop(wcycle,ewcRUN); + + /* Finish up, write some stuff + * if rerunMD, don't write last frame again + */ + finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm), + inputrec,nrnb,wcycle,&runtime, + fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ? + nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL, + nthreads_pp, + EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr)); + + if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU) + { + char gpu_err_str[STRLEN]; + + /* free GPU memory and uninitialize GPU (by destroying the context) */ + nbnxn_cuda_free(fplog, fr->nbv->cu_nbv); + + if (!free_gpu(gpu_err_str)) + { + gmx_warning("On node %d failed to free GPU #%d: %s", + cr->nodeid, get_current_gpu_device_id(), gpu_err_str); + } + } + + if (opt2bSet("-membed",nfile,fnm)) + { + sfree(membed); + } + +#ifdef GMX_THREAD_MPI + if (PAR(cr) && SIMMASTER(cr)) +#endif + { + gmx_hardware_info_free(hwinfo); + } + + /* Does what it says */ + print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime); + + /* Close logfile already here if we were appending to it */ + if (MASTER(cr) && (Flags & MD_APPENDFILES)) + { + gmx_log_close(fplog); + } + + rc=(int)gmx_get_stop_condition(); + +#ifdef GMX_THREAD_MPI + /* we need to join all threads. The sub-threads join when they + exit this function, but the master thread needs to be told to + wait for that. */ + if (PAR(cr) && MASTER(cr)) + { + tMPI_Finalize(); + } +#endif + + return rc; +} diff --cc src/tools/levenmar.c index 63ff58fac7,1be1cc5262..0d96233b89 --- a/src/tools/levenmar.c +++ b/src/tools/levenmar.c @@@ -206,65 -254,106 +206,65 @@@ static void dump_mat(int n,real **a gmx_bool gaussj(real **a, int n, real **b, int m) { - int *indxc, *indxr, *ipiv; - int i, icol = 0, irow = 0, j, k, l, ll; - real big, dum, pivinv; - - indxc = ivector(1, n); - indxr = ivector(1, n); - ipiv = ivector(1, n); - for (j = 1; j <= n; j++) - { - ipiv[j] = 0; - } - for (i = 1; i <= n; i++) - { - big = 0.0; - for (j = 1; j <= n; j++) - { - if (ipiv[j] != 1) - { - for (k = 1; k <= n; k++) - { - if (ipiv[k] == 0) - { - if (fabs(a[j][k]) >= big) - { - big = fabs(a[j][k]); - irow = j; - icol = k; - } - } - else if (ipiv[k] > 1) - { - nrerror("GAUSSJ: Singular Matrix-1", FALSE); - return FALSE; - } - } - } - } - ++(ipiv[icol]); - if (irow != icol) - { - for (l = 1; l <= n; l++) - { - SWAP(a[irow][l], a[icol][l]); - } - for (l = 1; l <= m; l++) - { - SWAP(b[irow][l], b[icol][l]); - } - } - indxr[i] = irow; - indxc[i] = icol; - if (a[icol][icol] == 0.0) - { - fprintf(stderr, "irow = %d, icol = %d\n", irow, icol); - dump_mat(n, a); - nrerror("GAUSSJ: Singular Matrix-2", FALSE); - return FALSE; - } - pivinv = 1.0/a[icol][icol]; - a[icol][icol] = 1.0; - for (l = 1; l <= n; l++) - { - a[icol][l] *= pivinv; - } - for (l = 1; l <= m; l++) - { - b[icol][l] *= pivinv; - } - for (ll = 1; ll <= n; ll++) - { - if (ll != icol) - { - dum = a[ll][icol]; - a[ll][icol] = 0.0; - for (l = 1; l <= n; l++) - { - a[ll][l] -= a[icol][l]*dum; - } - for (l = 1; l <= m; l++) - { - b[ll][l] -= b[icol][l]*dum; - } - } - } - } - for (l = n; l >= 1; l--) - { - if (indxr[l] != indxc[l]) - { - for (k = 1; k <= n; k++) - { - SWAP(a[k][indxr[l]], a[k][indxc[l]]); - } - } - } - free_ivector(ipiv, 1); - free_ivector(indxr, 1); - free_ivector(indxc, 1); - - return TRUE; + int *indxc,*indxr,*ipiv; + int i,icol=0,irow=0,j,k,l,ll; + real big,dum,pivinv; + + indxc=ivector(1,n); + indxr=ivector(1,n); + ipiv=ivector(1,n); + for (j=1;j<=n;j++) ipiv[j]=0; + for (i=1;i<=n;i++) { + big=0.0; + for (j=1;j<=n;j++) + if (ipiv[j] != 1) + for (k=1;k<=n;k++) { + if (ipiv[k] == 0) { + if (fabs(a[j][k]) >= big) { + big=fabs(a[j][k]); + irow=j; + icol=k; + } + } else if (ipiv[k] > 1) { + nrerror("GAUSSJ: Singular Matrix-1", FALSE); + return FALSE; + } + } + ++(ipiv[icol]); + if (irow != icol) { - for (l=1;l<=n;l++) SWAP(a[irow][l],a[icol][l]) - for (l=1;l<=m;l++) SWAP(b[irow][l],b[icol][l]) ++ for (l=1;l<=n;l++) SWAP(a[irow][l],a[icol][l]); ++ for (l=1;l<=m;l++) SWAP(b[irow][l],b[icol][l]); + } + indxr[i]=irow; + indxc[i]=icol; + if (a[icol][icol] == 0.0) { + fprintf(stderr,"irow = %d, icol = %d\n",irow,icol); + dump_mat(n,a); + nrerror("GAUSSJ: Singular Matrix-2", FALSE); + return FALSE; + } + pivinv=1.0/a[icol][icol]; + a[icol][icol]=1.0; + for (l=1;l<=n;l++) a[icol][l] *= pivinv; + for (l=1;l<=m;l++) b[icol][l] *= pivinv; + for (ll=1;ll<=n;ll++) + if (ll != icol) { + dum=a[ll][icol]; + a[ll][icol]=0.0; + for (l=1;l<=n;l++) a[ll][l] -= a[icol][l]*dum; + for (l=1;l<=m;l++) b[ll][l] -= b[icol][l]*dum; + } + } + for (l=n;l>=1;l--) { + if (indxr[l] != indxc[l]) + for (k=1;k<=n;k++) + SWAP(a[k][indxr[l]],a[k][indxc[l]]); + } + free_ivector(ipiv,1); + free_ivector(indxr,1); + free_ivector(indxc,1); + + return TRUE; } #undef SWAP @@@ -272,152 -361,231 +272,152 @@@ static void covsrt(real **covar, int ma, int lista[], int mfit) { - int i, j; - real swap; - - for (j = 1; j < ma; j++) - { - for (i = j+1; i <= ma; i++) - { - covar[i][j] = 0.0; - } - } - for (i = 1; i < mfit; i++) - { - for (j = i+1; j <= mfit; j++) - { - if (lista[j] > lista[i]) - { - covar[lista[j]][lista[i]] = covar[i][j]; - } - else - { - covar[lista[i]][lista[j]] = covar[i][j]; - } - } - } - swap = covar[1][1]; - for (j = 1; j <= ma; j++) - { - covar[1][j] = covar[j][j]; - covar[j][j] = 0.0; - } - covar[lista[1]][lista[1]] = swap; - for (j = 2; j <= mfit; j++) - { - covar[lista[j]][lista[j]] = covar[1][j]; - } - for (j = 2; j <= ma; j++) - { - for (i = 1; i <= j-1; i++) - { - covar[i][j] = covar[j][i]; - } - } + int i,j; + real swap; + + for (j=1;j lista[i]) + covar[lista[j]][lista[i]]=covar[i][j]; + else + covar[lista[i]][lista[j]]=covar[i][j]; + } + swap=covar[1][1]; + for (j=1;j<=ma;j++) { + covar[1][j]=covar[j][j]; + covar[j][j]=0.0; + } + covar[lista[1]][lista[1]]=swap; + for (j=2;j<=mfit;j++) covar[lista[j]][lista[j]]=covar[1][j]; + for (j=2;j<=ma;j++) + for (i=1;i<=j-1;i++) covar[i][j]=covar[j][i]; } -#define SWAP(a, b) {swap = (a); (a) = (b); (b) = swap; } +#define SWAP(a,b) {swap=(a);(a)=(b);(b)=swap;} -static void covsrt_new(real **covar, int ma, int ia[], int mfit) -/* Expand in storage the covariance matrix covar, so as to take - * into account parameters that are being held fixed. (For the - * latter, return zero covariances.) - */ +static void covsrt_new(real **covar,int ma, int ia[], int mfit) + /* Expand in storage the covariance matrix covar, so as to take + * into account parameters that are being held fixed. (For the + * latter, return zero covariances.) + */ { - int i, j, k; - real swap; - for (i = mfit+1; i <= ma; i++) - { - for (j = 1; j <= i; j++) - { - covar[i][j] = covar[j][i] = 0.0; - } - } - k = mfit; - for (j = ma; j >= 1; j--) - { - if (ia[j]) - { - for (i = 1; i <= ma; i++) - { - SWAP(covar[i][k], covar[i][j]); - } - for (i = 1; i <= ma; i++) - { - SWAP(covar[k][i], covar[j][i]); - } - k--; - } - } + int i,j,k; + real swap; + for (i=mfit+1;i<=ma;i++) + for (j=1;j<=i;j++) covar[i][j]=covar[j][i]=0.0; + k=mfit; + for (j=ma;j>=1;j--) { + if (ia[j]) { - for (i=1;i<=ma;i++) SWAP(covar[i][k],covar[i][j]) - for (i=1;i<=ma;i++) SWAP(covar[k][i],covar[j][i]) ++ for (i=1;i<=ma;i++) SWAP(covar[i][k],covar[i][j]); ++ for (i=1;i<=ma;i++) SWAP(covar[k][i],covar[j][i]); + k--; + } + } } #undef SWAP - -static void mrqcof(real x[], real y[], real sig[], int ndata, real a[], - int ma, int lista[], int mfit, - real **alpha, real beta[], real *chisq, - void (*funcs)(real, real *, real *, real *)) + +static void mrqcof(real x[], real y[], real sig[], int ndata, real a[], + int ma, int lista[], int mfit, + real **alpha, real beta[], real *chisq, + void (*funcs)(real,real *,real *,real *)) { - int k, j, i; - real ymod, wt, sig2i, dy, *dyda; - - dyda = rvector(1, ma); - for (j = 1; j <= mfit; j++) - { - for (k = 1; k <= j; k++) - { - alpha[j][k] = 0.0; - } - beta[j] = 0.0; - } - *chisq = 0.0; - for (i = 1; i <= ndata; i++) - { - (*funcs)(x[i], a, &ymod, dyda); - sig2i = 1.0/(sig[i]*sig[i]); - dy = y[i]-ymod; - for (j = 1; j <= mfit; j++) - { - wt = dyda[lista[j]]*sig2i; - for (k = 1; k <= j; k++) - { - alpha[j][k] += wt*dyda[lista[k]]; - } - beta[j] += dy*wt; - } - (*chisq) += dy*dy*sig2i; - } - for (j = 2; j <= mfit; j++) - { - for (k = 1; k <= j-1; k++) - { - alpha[k][j] = alpha[j][k]; - } - } - free_vector(dyda, 1); + int k,j,i; + real ymod,wt,sig2i,dy,*dyda; + + dyda=rvector(1,ma); + for (j=1;j<=mfit;j++) { + for (k=1;k<=j;k++) alpha[j][k]=0.0; + beta[j]=0.0; + } + *chisq=0.0; + for (i=1;i<=ndata;i++) { + (*funcs)(x[i],a,&ymod,dyda); + sig2i=1.0/(sig[i]*sig[i]); + dy=y[i]-ymod; + for (j=1;j<=mfit;j++) { + wt=dyda[lista[j]]*sig2i; + for (k=1;k<=j;k++) + alpha[j][k] += wt*dyda[lista[k]]; + beta[j] += dy*wt; + } + (*chisq) += dy*dy*sig2i; + } + for (j=2;j<=mfit;j++) + for (k=1;k<=j-1;k++) alpha[k][j]=alpha[j][k]; + free_vector(dyda,1); } - -gmx_bool mrqmin(real x[], real y[], real sig[], int ndata, real a[], - int ma, int lista[], int mfit, - real **covar, real **alpha, real *chisq, - void (*funcs)(real, real *, real *, real *), - real *alamda) + +gmx_bool mrqmin(real x[], real y[], real sig[], int ndata, real a[], + int ma, int lista[], int mfit, + real **covar, real **alpha, real *chisq, + void (*funcs)(real,real *,real *,real *), + real *alamda) { - int k, kk, j, ihit; - static real *da, *atry, **oneda, *beta, ochisq; - - if (*alamda < 0.0) - { - oneda = matrix1(1, mfit, 1, 1); - atry = rvector(1, ma); - da = rvector(1, ma); - beta = rvector(1, ma); - kk = mfit+1; - for (j = 1; j <= ma; j++) - { - ihit = 0; - for (k = 1; k <= mfit; k++) - { - if (lista[k] == j) - { - ihit++; - } - } - if (ihit == 0) - { - lista[kk++] = j; - } - else if (ihit > 1) - { - nrerror("Bad LISTA permutation in MRQMIN-1", FALSE); - return FALSE; - } - } - if (kk != ma+1) - { - nrerror("Bad LISTA permutation in MRQMIN-2", FALSE); - return FALSE; - } - *alamda = 0.001; - mrqcof(x, y, sig, ndata, a, ma, lista, mfit, alpha, beta, chisq, funcs); - ochisq = (*chisq); - } - for (j = 1; j <= mfit; j++) - { - for (k = 1; k <= mfit; k++) - { - covar[j][k] = alpha[j][k]; - } - covar[j][j] = alpha[j][j]*(1.0+(*alamda)); - oneda[j][1] = beta[j]; - } - if (!gaussj(covar, mfit, oneda, 1)) - { - return FALSE; - } - for (j = 1; j <= mfit; j++) - { - da[j] = oneda[j][1]; - } - if (*alamda == 0.0) - { - covsrt(covar, ma, lista, mfit); - free_vector(beta, 1); - free_vector(da, 1); - free_vector(atry, 1); - free_matrix(oneda, 1, mfit, 1); - return TRUE; - } - for (j = 1; j <= ma; j++) - { - atry[j] = a[j]; - } - for (j = 1; j <= mfit; j++) - { - atry[lista[j]] = a[lista[j]]+da[j]; - } - mrqcof(x, y, sig, ndata, atry, ma, lista, mfit, covar, da, chisq, funcs); - if (*chisq < ochisq) - { - *alamda *= 0.1; - ochisq = (*chisq); - for (j = 1; j <= mfit; j++) - { - for (k = 1; k <= mfit; k++) - { - alpha[j][k] = covar[j][k]; - } - beta[j] = da[j]; - a[lista[j]] = atry[lista[j]]; - } - } - else - { - *alamda *= 10.0; - *chisq = ochisq; - } + int k,kk,j,ihit; + static real *da,*atry,**oneda,*beta,ochisq; + + if (*alamda < 0.0) { + oneda=matrix1(1,mfit,1,1); + atry=rvector(1,ma); + da=rvector(1,ma); + beta=rvector(1,ma); + kk=mfit+1; + for (j=1;j<=ma;j++) { + ihit=0; + for (k=1;k<=mfit;k++) + if (lista[k] == j) ihit++; + if (ihit == 0) + lista[kk++]=j; + else if (ihit > 1) { + nrerror("Bad LISTA permutation in MRQMIN-1", FALSE); + return FALSE; + } + } + if (kk != ma+1) { + nrerror("Bad LISTA permutation in MRQMIN-2", FALSE); + return FALSE; + } + *alamda=0.001; + mrqcof(x,y,sig,ndata,a,ma,lista,mfit,alpha,beta,chisq,funcs); + ochisq=(*chisq); + } + for (j=1;j<=mfit;j++) { + for (k=1;k<=mfit;k++) covar[j][k]=alpha[j][k]; + covar[j][j]=alpha[j][j]*(1.0+(*alamda)); + oneda[j][1]=beta[j]; + } + if (!gaussj(covar,mfit,oneda,1)) + return FALSE; + for (j=1;j<=mfit;j++) + da[j]=oneda[j][1]; + if (*alamda == 0.0) { + covsrt(covar,ma,lista,mfit); + free_vector(beta,1); + free_vector(da,1); + free_vector(atry,1); + free_matrix(oneda,1,mfit,1); return TRUE; + } + for (j=1;j<=ma;j++) atry[j]=a[j]; + for (j=1;j<=mfit;j++) + atry[lista[j]] = a[lista[j]]+da[j]; + mrqcof(x,y,sig,ndata,atry,ma,lista,mfit,covar,da,chisq,funcs); + if (*chisq < ochisq) { + *alamda *= 0.1; + ochisq=(*chisq); + for (j=1;j<=mfit;j++) { + for (k=1;k<=mfit;k++) alpha[j][k]=covar[j][k]; + beta[j]=da[j]; + a[lista[j]]=atry[lista[j]]; + } + } else { + *alamda *= 10.0; + *chisq=ochisq; + } + return TRUE; }