2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
60 #include "eigensolver.h"
67 /* print to two file pointers at once (i.e. stderr and log) */
69 void lo_ffprintf(FILE *fp1, FILE *fp2, const char *buf)
71 fprintf(fp1, "%s", buf);
72 fprintf(fp2, "%s", buf);
75 /* just print a prepared buffer to fp1 and fp2 */
77 void ffprintf(FILE *fp1, FILE *fp2, const char *buf)
79 lo_ffprintf(fp1, fp2, buf);
82 /* prepare buffer with one argument, then print to fp1 and fp2 */
84 void ffprintf_d(FILE *fp1, FILE *fp2, char *buf, const char *fmt, int arg)
86 sprintf(buf, fmt, arg);
87 lo_ffprintf(fp1, fp2, buf);
90 /* prepare buffer with one argument, then print to fp1 and fp2 */
92 void ffprintf_g(FILE *fp1, FILE *fp2, char *buf, const char *fmt, real arg)
94 sprintf(buf, fmt, arg);
95 lo_ffprintf(fp1, fp2, buf);
98 /* prepare buffer with one argument, then print to fp1 and fp2 */
100 void ffprintf_s(FILE *fp1, FILE *fp2, char *buf, const char *fmt, const char *arg)
102 sprintf(buf, fmt, arg);
103 lo_ffprintf(fp1, fp2, buf);
106 /* prepare buffer with two arguments, then print to fp1 and fp2 */
108 void ffprintf_dd(FILE *fp1, FILE *fp2, char *buf, const char *fmt, int arg1, int arg2)
110 sprintf(buf, fmt, arg1, arg2);
111 lo_ffprintf(fp1, fp2, buf);
114 /* prepare buffer with two arguments, then print to fp1 and fp2 */
116 void ffprintf_gg(FILE *fp1, FILE *fp2, char *buf, const char *fmt, real arg1, real arg2)
118 sprintf(buf, fmt, arg1, arg2);
119 lo_ffprintf(fp1, fp2, buf);
122 /* prepare buffer with two arguments, then print to fp1 and fp2 */
124 void ffprintf_ss(FILE *fp1, FILE *fp2, char *buf, const char *fmt, const char *arg1, const char *arg2)
126 sprintf(buf, fmt, arg1, arg2);
127 lo_ffprintf(fp1, fp2, buf);
140 void pr_energy(FILE *fp,real e)
142 fprintf(fp,"Energy: %8.4f\n",e);
145 void cp_index(int nn,int from[],int to[])
149 for(i=0; (i<nn); i++)
153 void mc_optimize(FILE *log,t_mat *m,int maxiter,int *seed,real kT)
155 real e[2],ei,ej,efac;
159 int i,isw,jsw,iisw,jjsw,nn;
161 fprintf(stderr,"\nDoing Monte Carlo clustering\n");
164 cp_index(nn,m->m_ind,low_index);
165 if (getenv("TESTMC")) {
166 e[cur] = mat_energy(m);
167 pr_energy(log,e[cur]);
168 fprintf(log,"Doing 1000 random swaps\n");
169 for(i=0; (i<1000); i++) {
171 isw = nn*rando(seed);
172 jsw = nn*rando(seed);
173 } while ((isw == jsw) || (isw >= nn) || (jsw >= nn));
174 iisw = m->m_ind[isw];
175 jjsw = m->m_ind[jsw];
176 m->m_ind[isw] = jjsw;
177 m->m_ind[jsw] = iisw;
180 e[cur] = mat_energy(m);
181 pr_energy(log,e[cur]);
182 for(i=0; (i<maxiter); i++) {
184 isw = nn*rando(seed);
185 jsw = nn*rando(seed);
186 } while ((isw == jsw) || (isw >= nn) || (jsw >= nn));
188 iisw = m->m_ind[isw];
189 jjsw = m->m_ind[jsw];
190 ei = row_energy(nn,iisw,m->mat[jsw]);
191 ej = row_energy(nn,jjsw,m->mat[isw]);
193 e[next] = e[cur] + (ei+ej-EROW(m,isw)-EROW(m,jsw))/nn;
195 efac = kT ? exp((e[next]-e[cur])/kT) : -1;
196 if ((e[next] > e[cur]) || (efac > rando(seed))) {
198 if (e[next] > e[cur])
199 cp_index(nn,m->m_ind,low_index);
201 fprintf(log,"Taking uphill step\n");
203 /* Now swapping rows */
204 m->m_ind[isw] = jjsw;
205 m->m_ind[jsw] = iisw;
209 fprintf(log,"Iter: %d Swapped %4d and %4d (now %g)",
210 i,isw,jsw,mat_energy(m));
211 pr_energy(log,e[cur]);
214 /* Now restore the highest energy index */
215 cp_index(nn,low_index,m->m_ind);
218 static void calc_dist(int nind,rvec x[],real **d)
224 for(i=0; (i<nind-1); i++) {
226 for(j=i+1; (j<nind); j++) {
227 /* Should use pbc_dx when analysing multiple molecueles,
228 * but the box is not stored for every frame.
230 rvec_sub(xi,x[j],dx);
236 static real rms_dist(int isize,real **d,real **d_r)
242 for(i=0; (i<isize-1); i++)
243 for(j=i+1; (j<isize); j++) {
247 r2/=(isize*(isize-1))/2;
252 static int rms_dist_comp(const void *a,const void *b)
259 if (da->dist - db->dist < 0)
261 else if (da->dist - db->dist > 0)
266 static int clust_id_comp(const void *a,const void *b)
273 return da->clust - db->clust;
276 static int nrnb_comp(const void *a, const void *b)
283 /* return the b-a, we want highest first */
284 return db->nr - da->nr;
287 void gather(t_mat *m,real cutoff,t_clusters *clust)
291 int i,j,k,nn,cid,n1,diff;
294 /* First we sort the entries in the RMSD matrix */
298 for(i=k=0; (i<n1); i++)
299 for(j=i+1; (j<n1); j++,k++) {
302 d[k].dist = m->mat[i][j];
305 gmx_incons("gather algortihm");
306 qsort(d,nn,sizeof(d[0]),rms_dist_comp);
308 /* Now we make a cluster index for all of the conformations */
311 /* Now we check the closest structures, and equalize their cluster numbers */
312 fprintf(stderr,"Linking structures ");
316 for(k=0; (k<nn) && (d[k].dist < cutoff); k++) {
317 diff = c[d[k].j].clust - c[d[k].i].clust;
321 c[d[k].j].clust = c[d[k].i].clust;
323 c[d[k].i].clust = c[d[k].j].clust;
327 fprintf(stderr,"\nSorting and renumbering clusters\n");
328 /* Sort on cluster number */
329 qsort(c,n1,sizeof(c[0]),clust_id_comp);
331 /* Renumber clusters */
333 for(k=1; k<n1; k++) {
334 if (c[k].clust != c[k-1].clust) {
342 for(k=0; (k<n1); k++)
343 fprintf(debug,"Cluster index for conformation %d: %d\n",
344 c[k].conf,c[k].clust);
347 clust->cl[c[k].conf] = c[k].clust;
353 gmx_bool jp_same(int **nnb,int i,int j,int P)
359 for(k=0; nnb[i][k]>=0; k++)
360 bIn = bIn || (nnb[i][k] == j);
365 for(k=0; nnb[j][k]>=0; k++)
366 bIn = bIn || (nnb[j][k] == i);
371 for(ii=0; nnb[i][ii]>=0; ii++)
372 for(jj=0; nnb[j][jj]>=0; jj++)
373 if ((nnb[i][ii] == nnb[j][jj]) && (nnb[i][ii] != -1))
379 static void jarvis_patrick(int n1,real **mat,int M,int P,
380 real rmsdcut,t_clusters *clust)
385 int i,j,k,cid,diff,max;
392 /* First we sort the entries in the RMSD matrix row by row.
393 * This gives us the nearest neighbor list.
397 for(i=0; (i<n1); i++) {
398 for(j=0; (j<n1); j++) {
400 row[j].dist = mat[i][j];
402 qsort(row,n1,sizeof(row[0]),rms_dist_comp);
404 /* Put the M nearest neighbors in the list */
406 for(j=k=0; (k<M) && (j<n1) && (mat[i][row[j].j] < rmsdcut); j++)
408 nnb[i][k] = row[j].j;
413 /* Put all neighbors nearer than rmsdcut in the list */
416 for(j=0; (j<n1) && (mat[i][row[j].j] < rmsdcut); j++)
422 nnb[i][k] = row[j].j;
426 srenew(nnb[i],max+1);
432 fprintf(debug,"Nearest neighborlist. M = %d, P = %d\n",M,P);
433 for(i=0; (i<n1); i++) {
434 fprintf(debug,"i:%5d nbs:",i);
435 for(j=0; nnb[i][j]>=0; j++)
436 fprintf(debug,"%5d[%5.3f]",nnb[i][j],mat[i][nnb[i][j]]);
442 fprintf(stderr,"Linking structures ");
443 /* Use mcpy for temporary storage of booleans */
444 mcpy = mk_matrix(n1,n1,FALSE);
446 for(j=i+1; j<n1; j++)
447 mcpy[i][j] = jp_same(nnb,i,j,P);
451 for(i=0; i<n1; i++) {
452 for(j=i+1; j<n1; j++)
454 diff = c[j].clust - c[i].clust;
458 c[j].clust = c[i].clust;
460 c[i].clust = c[j].clust;
466 fprintf(stderr,"\nSorting and renumbering clusters\n");
467 /* Sort on cluster number */
468 qsort(c,n1,sizeof(c[0]),clust_id_comp);
470 /* Renumber clusters */
472 for(k=1; k<n1; k++) {
473 if (c[k].clust != c[k-1].clust) {
482 clust->cl[c[k].conf] = c[k].clust;
484 for(k=0; (k<n1); k++)
485 fprintf(debug,"Cluster index for conformation %d: %d\n",
486 c[k].conf,c[k].clust);
488 /* Again, I don't see the point in this... (AF) */
489 /* for(i=0; (i<n1); i++) { */
490 /* for(j=0; (j<n1); j++) */
491 /* mcpy[c[i].conf][c[j].conf] = mat[i][j]; */
493 /* for(i=0; (i<n1); i++) { */
494 /* for(j=0; (j<n1); j++) */
495 /* mat[i][j] = mcpy[i][j]; */
497 done_matrix(n1,&mcpy);
500 for(i=0; (i<n1); i++)
505 static void dump_nnb (FILE *fp, const char *title, int n1, t_nnb *nnb)
509 /* dump neighbor list */
510 fprintf(fp,"%s",title);
511 for(i=0; (i<n1); i++) {
512 fprintf(fp,"i:%5d #:%5d nbs:",i,nnb[i].nr);
513 for(j=0; j<nnb[i].nr; j++)
514 fprintf(fp,"%5d",nnb[i].nb[j]);
519 static void gromos(int n1, real **mat, real rmsdcut, t_clusters *clust)
525 /* Put all neighbors nearer than rmsdcut in the list */
526 fprintf(stderr,"Making list of neighbors within cutoff ");
529 for(i=0; (i<n1); i++) {
532 /* put all neighbors within cut-off in list */
534 if (mat[i][j] < rmsdcut) {
537 srenew(nnb[i].nb,max);
542 /* store nr of neighbors, we'll need that */
544 if (i%(1+n1/100)==0) fprintf(stderr,"%3d%%\b\b\b\b",(i*100+1)/n1);
546 fprintf(stderr,"%3d%%\n",100);
549 /* sort neighbor list on number of neighbors, largest first */
550 qsort(nnb,n1,sizeof(nnb[0]),nrnb_comp);
552 if (debug) dump_nnb(debug, "Nearest neighborlist after sort.\n", n1, nnb);
554 /* turn first structure with all its neighbors (largest) into cluster
555 remove them from pool of structures and repeat for all remaining */
556 fprintf(stderr,"Finding clusters %4d", 0);
557 /* cluster id's start at 1: */
560 /* set cluster id (k) for first item in neighborlist */
561 for (j=0; j<nnb[0].nr; j++)
562 clust->cl[nnb[0].nb[j]] = k;
567 /* adjust number of neighbors for others, taking removals into account: */
568 for(i=1; i<n1 && nnb[i].nr; i++) {
570 for(j=0; j<nnb[i].nr; j++)
571 /* if this neighbor wasn't removed */
572 if ( clust->cl[nnb[i].nb[j]] == 0 ) {
573 /* shift the rest (j1<=j) */
574 nnb[i].nb[j1]=nnb[i].nb[j];
578 /* now j1 is the new number of neighbors */
581 /* sort again on nnb[].nr, because we have new # neighbors: */
582 /* but we only need to sort upto i, i.e. when nnb[].nr>0 */
583 qsort(nnb,i,sizeof(nnb[0]),nrnb_comp);
585 fprintf(stderr,"\b\b\b\b%4d",k);
589 fprintf(stderr,"\n");
592 fprintf(debug,"Clusters (%d):\n", k);
594 fprintf(debug," %3d", clust->cl[i]);
601 rvec **read_whole_trj(const char *fn,int isize,atom_id index[],int skip,
602 int *nframe, real **time,const output_env_t oenv,gmx_bool bPBC, gmx_rmpbc_t gpbc)
615 natom = read_first_x(oenv,&status,fn,&t,&x,box);
620 gmx_rmpbc(gpbc,natom,box,x);
625 srenew(*time,max_nf);
627 if ((i % skip) == 0) {
629 /* Store only the interesting atoms */
630 for(j=0; (j<isize); j++)
631 copy_rvec(x[index[j]],xx[i0][j]);
636 } while (read_next_x(oenv,status,&t,natom,x,box));
637 fprintf(stderr,"Allocated %lu bytes for frames\n",
638 (unsigned long) (max_nf*isize*sizeof(**xx)));
639 fprintf(stderr,"Read %d frames from trajectory %s\n",i0,fn);
646 static int plot_clusters(int nf, real **mat, t_clusters *clust,
647 int nlevels, int minstruct)
650 int *cl_id,*nstruct,*strind;
655 for(i=0; i<nf; i++) {
657 cl_id[i] = clust->cl[i];
661 for(i=0; i<nf; i++) {
662 if (nstruct[i] >= minstruct) {
664 for(j=0; (j<nf); j++)
666 strind[j] = ncluster;
670 fprintf(stderr,"There are %d clusters with at least %d conformations\n",
673 for(i=0; (i<nf); i++) {
676 if ((ci == cl_id[j]) && (nstruct[ci] >= minstruct)) {
677 /* color different clusters with different colors, as long as
678 we don't run out of colors */
679 mat[i][j] = strind[i];
691 static void mark_clusters(int nf, real **mat, real val, t_clusters *clust)
697 if (clust->cl[i] == clust->cl[j])
703 static char *parse_filename(const char *fn, int maxnr)
711 gmx_fatal(FARGS,"will not number filename %s containing '%c'",fn,'%');
712 /* number of digits needed in numbering */
713 i = (int)(log(maxnr)/log(10)) + 1;
714 /* split fn and ext */
715 ext = strrchr(fn, '.');
717 gmx_fatal(FARGS,"cannot separate extension in filename %s",fn);
719 /* insert e.g. '%03d' between fn and ext */
720 sprintf(buf,"%s%%0%dd.%s",fn,i,ext);
721 snew(fnout,strlen(buf)+1);
727 static void ana_trans(t_clusters *clust, int nf,
728 const char *transfn, const char *ntransfn, FILE *log,
729 t_rgb rlo,t_rgb rhi,const output_env_t oenv)
734 int i,ntranst,maxtrans;
737 snew(ntrans,clust->ncl);
738 snew(trans,clust->ncl);
739 snew(axis,clust->ncl);
740 for(i=0; i<clust->ncl; i++) {
742 snew(trans[i],clust->ncl);
747 if(clust->cl[i] != clust->cl[i-1]) {
749 ntrans[clust->cl[i-1]-1]++;
750 ntrans[clust->cl[i]-1]++;
751 trans[clust->cl[i-1]-1][clust->cl[i]-1]++;
752 maxtrans = max(maxtrans, trans[clust->cl[i]-1][clust->cl[i-1]-1]);
754 ffprintf_dd(stderr,log,buf,"Counted %d transitions in total, "
755 "max %d between two specific clusters\n",ntranst,maxtrans);
757 fp=ffopen(transfn,"w");
758 i = min(maxtrans+1, 80);
759 write_xpm(fp,0,"Cluster Transitions","# transitions",
760 "from cluster","to cluster",
761 clust->ncl, clust->ncl, axis, axis, trans,
762 0, maxtrans, rlo, rhi, &i);
766 fp=xvgropen(ntransfn,"Cluster Transitions","Cluster #","# transitions",
768 for(i=0; i<clust->ncl; i++)
769 fprintf(fp,"%5d %5d\n",i+1,ntrans[i]);
773 for(i=0; i<clust->ncl; i++)
779 static void analyze_clusters(int nf, t_clusters *clust, real **rmsd,
780 int natom, t_atoms *atoms, rvec *xtps,
781 real *mass, rvec **xx, real *time,
782 int ifsize, atom_id *fitidx,
783 int iosize, atom_id *outidx,
784 const char *trxfn, const char *sizefn,
785 const char *transfn, const char *ntransfn,
786 const char *clustidfn, gmx_bool bAverage,
787 int write_ncl, int write_nst, real rmsmin,
788 gmx_bool bFit, FILE *log,t_rgb rlo,t_rgb rhi,
789 const output_env_t oenv)
792 char buf[STRLEN],buf1[40],buf2[40],buf3[40],*trxsfn;
793 t_trxstatus *trxout=NULL;
794 t_trxstatus *trxsout=NULL;
795 int i,i1,cl,nstr,*structure,first=0,midstr;
796 gmx_bool *bWrite=NULL;
797 real r,clrmsd,midrmsd;
803 ffprintf_d(stderr,log,buf,"\nFound %d clusters\n\n",clust->ncl);
806 /* do we write all structures? */
808 trxsfn = parse_filename(trxfn, max(write_ncl,clust->ncl));
811 ffprintf_ss(stderr,log,buf,"Writing %s structure for each cluster to %s\n",
812 bAverage ? "average" : "middle", trxfn);
814 /* find out what we want to tell the user:
815 Writing [all structures|structures with rmsd > %g] for
816 {all|first %d} clusters {with more than %d structures} to %s */
818 sprintf(buf1,"structures with rmsd > %g",rmsmin);
820 sprintf(buf1,"all structures");
821 buf2[0]=buf3[0]='\0';
822 if (write_ncl>=clust->ncl) {
824 sprintf(buf2,"all ");
826 sprintf(buf2,"the first %d ",write_ncl);
828 sprintf(buf3," with more than %d structures",write_nst);
829 sprintf(buf,"Writing %s for %sclusters%s to %s\n",buf1,buf2,buf3,trxsfn);
830 ffprintf(stderr,log,buf);
833 /* Prepare a reference structure for the orientation of the clusters */
835 reset_x(ifsize,fitidx,natom,NULL,xtps,mass);
836 trxout = open_trx(trxfn,"w");
837 /* Calculate the average structure in each cluster, *
838 * all structures are fitted to the first struture of the cluster */
842 if (transfn || ntransfn)
843 ana_trans(clust, nf, transfn, ntransfn, log,rlo,rhi,oenv);
846 fp=xvgropen(clustidfn,"Clusters",output_env_get_xvgr_tlabel(oenv),"Cluster #",oenv);
847 fprintf(fp,"@ s0 symbol 2\n");
848 fprintf(fp,"@ s0 symbol size 0.2\n");
849 fprintf(fp,"@ s0 linestyle 0\n");
851 fprintf(fp,"%8g %8d\n",time[i],clust->cl[i]);
855 fp=xvgropen(sizefn,"Cluster Sizes","Cluster #","# Structures",oenv);
856 fprintf(fp,"@g%d type %s\n",0,"bar");
859 fprintf(log,"\n%3s | %3s %4s | %6s %4s | cluster members\n",
860 "cl.","#st","rmsd","middle","rmsd");
861 for(cl=1; cl<=clust->ncl; cl++) {
862 /* prepare structures (fit, middle, average) */
864 for(i=0; i<natom;i++)
867 for(i1=0; i1<nf; i1++)
868 if (clust->cl[i1] == cl) {
869 structure[nstr] = i1;
871 if (trxfn && (bAverage || write_ncl) ) {
873 reset_x(ifsize,fitidx,natom,NULL,xx[i1],mass);
877 do_fit(natom,mass,xx[first],xx[i1]);
879 for(i=0; i<natom; i++)
880 rvec_inc(xav[i],xx[i1][i]);
884 fprintf(fp,"%8d %8d\n",cl,nstr);
888 for(i1=0; i1<nstr; i1++) {
891 for(i=0; i<nstr; i++)
893 r += rmsd[structure[i]][structure[i1]];
895 r += rmsd[structure[i1]][structure[i]];
899 midstr = structure[i1];
906 /* dump cluster info to logfile */
908 sprintf(buf1,"%6.3f",clrmsd);
911 sprintf(buf2,"%5.3f",midrmsd);
915 sprintf(buf1,"%5s","");
916 sprintf(buf2,"%5s","");
918 fprintf(log,"%3d | %3d %s | %6g%s |",cl,nstr,buf1,time[midstr],buf2);
919 for(i=0; i<nstr; i++) {
920 if ((i % 7 == 0) && i)
921 sprintf(buf,"\n%3s | %3s %4s | %6s %4s |","","","","","");
925 fprintf(log,"%s %6g",buf,time[i1]);
929 /* write structures to trajectory file(s) */
932 for(i=0; i<nstr; i++)
934 if ( cl < write_ncl+1 && nstr > write_nst ) {
935 /* Dump all structures for this cluster */
936 /* generate numbered filename (there is a %d in trxfn!) */
937 sprintf(buf,trxsfn,cl);
938 trxsout = open_trx(buf,"w");
939 for(i=0; i<nstr; i++) {
942 for(i1=0; i1<i && bWrite[i]; i1++)
944 bWrite[i] = rmsd[structure[i1]][structure[i]] > rmsmin;
946 write_trx(trxsout,iosize,outidx,atoms,i,time[structure[i]],zerobox,
947 xx[structure[i]],NULL,NULL);
951 /* Dump the average structure for this cluster */
953 for(i=0; i<natom; i++)
954 svmul(1.0/nstr,xav[i],xav[i]);
956 for(i=0; i<natom; i++)
957 copy_rvec(xx[midstr][i],xav[i]);
959 reset_x(ifsize,fitidx,natom,NULL,xav,mass);
962 do_fit(natom,mass,xtps,xav);
964 write_trx(trxout,iosize,outidx,atoms,cl,time[midstr],zerobox,xav,NULL,NULL);
979 static void convert_mat(t_matrix *mat,t_mat *rms)
984 matrix2real(mat,rms->mat);
985 /* free input xpm matrix data */
986 for(i=0; i<mat->nx; i++)
987 sfree(mat->matrix[i]);
990 for(i=0; i<mat->nx; i++)
991 for(j=i; j<mat->nx; j++) {
992 rms->sumrms += rms->mat[i][j];
993 rms->maxrms = max(rms->maxrms, rms->mat[i][j]);
995 rms->minrms = min(rms->minrms, rms->mat[i][j]);
1000 int gmx_cluster(int argc,char *argv[])
1002 const char *desc[] = {
1003 "[TT]g_cluster[tt] can cluster structures using several different methods.",
1004 "Distances between structures can be determined from a trajectory",
1005 "or read from an [TT].xpm[tt] matrix file with the [TT]-dm[tt] option.",
1006 "RMS deviation after fitting or RMS deviation of atom-pair distances",
1007 "can be used to define the distance between structures.[PAR]",
1009 "single linkage: add a structure to a cluster when its distance to any",
1010 "element of the cluster is less than [TT]cutoff[tt].[PAR]",
1012 "Jarvis Patrick: add a structure to a cluster when this structure",
1013 "and a structure in the cluster have each other as neighbors and",
1014 "they have a least [TT]P[tt] neighbors in common. The neighbors",
1015 "of a structure are the M closest structures or all structures within",
1016 "[TT]cutoff[tt].[PAR]",
1018 "Monte Carlo: reorder the RMSD matrix using Monte Carlo.[PAR]",
1020 "diagonalization: diagonalize the RMSD matrix.[PAR]",
1022 "gromos: use algorithm as described in Daura [IT]et al.[it]",
1023 "([IT]Angew. Chem. Int. Ed.[it] [BB]1999[bb], [IT]38[it], pp 236-240).",
1024 "Count number of neighbors using cut-off, take structure with",
1025 "largest number of neighbors with all its neighbors as cluster",
1026 "and eliminate it from the pool of clusters. Repeat for remaining",
1027 "structures in pool.[PAR]",
1029 "When the clustering algorithm assigns each structure to exactly one",
1030 "cluster (single linkage, Jarvis Patrick and gromos) and a trajectory",
1031 "file is supplied, the structure with",
1032 "the smallest average distance to the others or the average structure",
1033 "or all structures for each cluster will be written to a trajectory",
1034 "file. When writing all structures, separate numbered files are made",
1035 "for each cluster.[PAR]",
1037 "Two output files are always written:[BR]",
1038 "[TT]-o[tt] writes the RMSD values in the upper left half of the matrix",
1039 "and a graphical depiction of the clusters in the lower right half",
1040 "When [TT]-minstruct[tt] = 1 the graphical depiction is black",
1041 "when two structures are in the same cluster.",
1042 "When [TT]-minstruct[tt] > 1 different colors will be used for each",
1044 "[TT]-g[tt] writes information on the options used and a detailed list",
1045 "of all clusters and their members.[PAR]",
1047 "Additionally, a number of optional output files can be written:[BR]",
1048 "[TT]-dist[tt] writes the RMSD distribution.[BR]",
1049 "[TT]-ev[tt] writes the eigenvectors of the RMSD matrix",
1050 "diagonalization.[BR]",
1051 "[TT]-sz[tt] writes the cluster sizes.[BR]",
1052 "[TT]-tr[tt] writes a matrix of the number transitions between",
1053 "cluster pairs.[BR]",
1054 "[TT]-ntr[tt] writes the total number of transitions to or from",
1055 "each cluster.[BR]",
1056 "[TT]-clid[tt] writes the cluster number as a function of time.[BR]",
1057 "[TT]-cl[tt] writes average (with option [TT]-av[tt]) or central",
1058 "structure of each cluster or writes numbered files with cluster members",
1059 "for a selected set of clusters (with option [TT]-wcl[tt], depends on",
1060 "[TT]-nst[tt] and [TT]-rmsmin[tt]). The center of a cluster is the",
1061 "structure with the smallest average RMSD from all other structures",
1062 "of the cluster.[BR]",
1066 int i,i1,i2,j,nf,nrms;
1069 rvec *xtps,*usextps,*x1,**xx=NULL;
1070 const char *fn,*trx_out_fn;
1077 t_matrix *readmat=NULL;
1080 int isize=0,ifsize=0,iosize=0;
1081 atom_id *index=NULL, *fitidx, *outidx;
1083 real rmsd,**d1,**d2,*time=NULL,time_invfac,*mass=NULL;
1084 char buf[STRLEN],buf1[80],title[STRLEN];
1085 gmx_bool bAnalyze,bUseRmsdCut,bJP_RMSD=FALSE,bReadMat,bReadTraj,bPBC=TRUE;
1087 int method,ncluster=0;
1088 static const char *methodname[] = {
1089 NULL, "linkage", "jarvis-patrick","monte-carlo",
1090 "diagonalization", "gromos", NULL
1092 enum { m_null, m_linkage, m_jarvis_patrick,
1093 m_monte_carlo, m_diagonalize, m_gromos, m_nr };
1094 /* Set colors for plotting: white = zero RMS, black = maximum */
1095 static t_rgb rlo_top = { 1.0, 1.0, 1.0 };
1096 static t_rgb rhi_top = { 0.0, 0.0, 0.0 };
1097 static t_rgb rlo_bot = { 1.0, 1.0, 1.0 };
1098 static t_rgb rhi_bot = { 0.0, 0.0, 1.0 };
1099 static int nlevels=40,skip=1;
1100 static real scalemax=-1.0,rmsdcut=0.1,rmsmin=0.0;
1101 gmx_bool bRMSdist=FALSE,bBinary=FALSE,bAverage=FALSE,bFit=TRUE;
1102 static int niter=10000,seed=1993,write_ncl=0,write_nst=1,minstruct=1;
1103 static real kT=1e-3;
1104 static int M=10,P=3;
1106 gmx_rmpbc_t gpbc=NULL;
1109 { "-dista", FALSE, etBOOL, {&bRMSdist},
1110 "Use RMSD of distances instead of RMS deviation" },
1111 { "-nlevels",FALSE,etINT, {&nlevels},
1112 "Discretize RMSD matrix in this number of levels" },
1113 { "-cutoff",FALSE, etREAL, {&rmsdcut},
1114 "RMSD cut-off (nm) for two structures to be neighbor" },
1115 { "-fit", FALSE, etBOOL, {&bFit},
1116 "Use least squares fitting before RMSD calculation" },
1117 { "-max", FALSE, etREAL, {&scalemax},
1118 "Maximum level in RMSD matrix" },
1119 { "-skip", FALSE, etINT, {&skip},
1120 "Only analyze every nr-th frame" },
1121 { "-av", FALSE, etBOOL, {&bAverage},
1122 "Write average iso middle structure for each cluster" },
1123 { "-wcl", FALSE, etINT, {&write_ncl},
1124 "Write the structures for this number of clusters to numbered files" },
1125 { "-nst", FALSE, etINT, {&write_nst},
1126 "Only write all structures if more than this number of structures per cluster" },
1127 { "-rmsmin",FALSE, etREAL, {&rmsmin},
1128 "minimum rms difference with rest of cluster for writing structures" },
1129 { "-method",FALSE, etENUM, {methodname},
1130 "Method for cluster determination" },
1131 { "-minstruct", FALSE, etINT, {&minstruct},
1132 "Minimum number of structures in cluster for coloring in the [TT].xpm[tt] file" },
1133 { "-binary",FALSE, etBOOL, {&bBinary},
1134 "Treat the RMSD matrix as consisting of 0 and 1, where the cut-off "
1135 "is given by [TT]-cutoff[tt]" },
1136 { "-M", FALSE, etINT, {&M},
1137 "Number of nearest neighbors considered for Jarvis-Patrick algorithm, "
1138 "0 is use cutoff" },
1139 { "-P", FALSE, etINT, {&P},
1140 "Number of identical nearest neighbors required to form a cluster" },
1141 { "-seed", FALSE, etINT, {&seed},
1142 "Random number seed for Monte Carlo clustering algorithm" },
1143 { "-niter", FALSE, etINT, {&niter},
1144 "Number of iterations for MC" },
1145 { "-kT", FALSE, etREAL, {&kT},
1146 "Boltzmann weighting factor for Monte Carlo optimization "
1147 "(zero turns off uphill steps)" },
1148 { "-pbc", FALSE, etBOOL,
1149 { &bPBC }, "PBC check" }
1152 { efTRX, "-f", NULL, ffOPTRD },
1153 { efTPS, "-s", NULL, ffOPTRD },
1154 { efNDX, NULL, NULL, ffOPTRD },
1155 { efXPM, "-dm", "rmsd", ffOPTRD },
1156 { efXPM, "-o", "rmsd-clust", ffWRITE },
1157 { efLOG, "-g", "cluster", ffWRITE },
1158 { efXVG, "-dist", "rmsd-dist", ffOPTWR },
1159 { efXVG, "-ev", "rmsd-eig", ffOPTWR },
1160 { efXVG, "-sz", "clust-size", ffOPTWR},
1161 { efXPM, "-tr", "clust-trans",ffOPTWR},
1162 { efXVG, "-ntr", "clust-trans",ffOPTWR},
1163 { efXVG, "-clid", "clust-id.xvg",ffOPTWR},
1164 { efTRX, "-cl", "clusters.pdb", ffOPTWR }
1166 #define NFILE asize(fnm)
1168 CopyRight(stderr,argv[0]);
1169 parse_common_args(&argc,argv,
1170 PCA_CAN_VIEW | PCA_CAN_TIME | PCA_TIME_UNIT | PCA_BE_NICE,
1171 NFILE,fnm,asize(pa),pa,asize(desc),desc,0,NULL,
1175 bReadMat = opt2bSet("-dm",NFILE,fnm);
1176 bReadTraj = opt2bSet("-f",NFILE,fnm) || !bReadMat;
1177 if ( opt2parg_bSet("-av",asize(pa),pa) ||
1178 opt2parg_bSet("-wcl",asize(pa),pa) ||
1179 opt2parg_bSet("-nst",asize(pa),pa) ||
1180 opt2parg_bSet("-rmsmin",asize(pa),pa) ||
1181 opt2bSet("-cl",NFILE,fnm) )
1182 trx_out_fn = opt2fn("-cl",NFILE,fnm);
1185 if (bReadMat && output_env_get_time_factor(oenv)!=1) {
1187 "\nWarning: assuming the time unit in %s is %s\n",
1188 opt2fn("-dm",NFILE,fnm),output_env_get_time_unit(oenv));
1190 if (trx_out_fn && !bReadTraj)
1191 fprintf(stderr,"\nWarning: "
1192 "cannot write cluster structures without reading trajectory\n"
1193 " ignoring option -cl %s\n", trx_out_fn);
1196 while ( method < m_nr && gmx_strcasecmp(methodname[0], methodname[method])!=0 )
1199 gmx_fatal(FARGS,"Invalid method");
1201 bAnalyze = (method == m_linkage || method == m_jarvis_patrick ||
1202 method == m_gromos );
1205 log = ftp2FILE(efLOG,NFILE,fnm,"w");
1207 fprintf(stderr,"Using %s method for clustering\n",methodname[0]);
1208 fprintf(log,"Using %s method for clustering\n",methodname[0]);
1210 /* check input and write parameters to log file */
1211 bUseRmsdCut = FALSE;
1212 if (method == m_jarvis_patrick) {
1213 bJP_RMSD = (M == 0) || opt2parg_bSet("-cutoff",asize(pa),pa);
1214 if ((M<0) || (M == 1))
1215 gmx_fatal(FARGS,"M (%d) must be 0 or larger than 1",M);
1217 sprintf(buf1,"Will use P=%d and RMSD cutoff (%g)",P,rmsdcut);
1221 gmx_fatal(FARGS,"Number of neighbors required (P) must be less than M");
1223 sprintf(buf1,"Will use P=%d, M=%d and RMSD cutoff (%g)",P,M,rmsdcut);
1226 sprintf(buf1,"Will use P=%d, M=%d",P,M);
1228 ffprintf_s(stderr,log,buf,"%s for determining the neighbors\n\n",buf1);
1229 } else /* method != m_jarvis */
1230 bUseRmsdCut = ( bBinary || method == m_linkage || method == m_gromos );
1231 if (bUseRmsdCut && method != m_jarvis_patrick)
1232 fprintf(log,"Using RMSD cutoff %g nm\n",rmsdcut);
1233 if ( method==m_monte_carlo )
1234 fprintf(log,"Using %d iterations\n",niter);
1237 gmx_fatal(FARGS,"skip (%d) should be >= 1",skip);
1241 /* don't read mass-database as masses (and top) are not used */
1242 read_tps_conf(ftp2fn(efTPS,NFILE,fnm),buf,&top,&ePBC,&xtps,NULL,box,
1245 gpbc = gmx_rmpbc_init(&top.idef,ePBC,top.atoms.nr,box);
1248 fprintf(stderr,"\nSelect group for least squares fit%s:\n",
1249 bReadMat?"":" and RMSD calculation");
1250 get_index(&(top.atoms),ftp2fn_null(efNDX,NFILE,fnm),
1251 1,&ifsize,&fitidx,&grpname);
1253 fprintf(stderr,"\nSelect group for output:\n");
1254 get_index(&(top.atoms),ftp2fn_null(efNDX,NFILE,fnm),
1255 1,&iosize,&outidx,&grpname);
1256 /* merge and convert both index groups: */
1257 /* first copy outidx to index. let outidx refer to elements in index */
1260 for(i=0; i<iosize; i++) {
1264 /* now lookup elements from fitidx in index, add them if necessary
1265 and also let fitidx refer to elements in index */
1266 for(i=0; i<ifsize; i++) {
1268 while (j<isize && index[j]!=fitidx[i])
1271 /* slow this way, but doesn't matter much */
1273 srenew(index,isize);
1278 } else { /* !trx_out_fn */
1281 for(i=0; i<ifsize; i++) {
1287 /* Initiate arrays */
1290 for(i=0; (i<isize); i++) {
1296 /* Loop over first coordinate file */
1297 fn = opt2fn("-f",NFILE,fnm);
1299 xx = read_whole_trj(fn,isize,index,skip,&nf,&time,oenv,bPBC,gpbc);
1300 output_env_conv_times(oenv, nf, time);
1301 if (!bRMSdist || bAnalyze) {
1302 /* Center all frames on zero */
1304 for(i=0; i<ifsize; i++)
1305 mass[fitidx[i]] = top.atoms.atom[index[fitidx[i]]].m;
1308 reset_x(ifsize,fitidx,isize,NULL,xx[i],mass);
1312 gmx_rmpbc_done(gpbc);
1317 fprintf(stderr,"Reading rms distance matrix ");
1318 read_xpm_matrix(opt2fn("-dm",NFILE,fnm),&readmat);
1319 fprintf(stderr,"\n");
1320 if (readmat[0].nx != readmat[0].ny)
1321 gmx_fatal(FARGS,"Matrix (%dx%d) is not square",
1322 readmat[0].nx,readmat[0].ny);
1323 if (bReadTraj && bAnalyze && (readmat[0].nx != nf))
1324 gmx_fatal(FARGS,"Matrix size (%dx%d) does not match the number of "
1325 "frames (%d)",readmat[0].nx,readmat[0].ny,nf);
1329 time = readmat[0].axis_x;
1330 time_invfac = output_env_get_time_invfactor(oenv);
1332 time[i] *= time_invfac;
1334 rms = init_mat(readmat[0].nx,method == m_diagonalize);
1335 convert_mat(&(readmat[0]),rms);
1337 nlevels = readmat[0].nmap;
1338 } else { /* !bReadMat */
1339 rms = init_mat(nf,method == m_diagonalize);
1340 nrms = (nf*(nf-1))/2;
1342 fprintf(stderr,"Computing %dx%d RMS deviation matrix\n",nf,nf);
1344 for(i1=0; (i1<nf); i1++) {
1345 for(i2=i1+1; (i2<nf); i2++) {
1346 for(i=0; i<isize; i++)
1347 copy_rvec(xx[i1][i],x1[i]);
1349 do_fit(isize,mass,xx[i2],x1);
1350 rmsd = rmsdev(isize,mass,xx[i2],x1);
1351 set_mat_entry(rms,i1,i2,rmsd);
1354 fprintf(stderr,"\r# RMSD calculations left: %d ",nrms);
1356 } else { /* bRMSdist */
1357 fprintf(stderr,"Computing %dx%d RMS distance deviation matrix\n",nf,nf);
1358 for(i1=0; (i1<nf); i1++) {
1359 calc_dist(isize,xx[i1],d1);
1360 for(i2=i1+1; (i2<nf); i2++) {
1361 calc_dist(isize,xx[i2],d2);
1362 set_mat_entry(rms,i1,i2,rms_dist(isize,d1,d2));
1365 fprintf(stderr,"\r# RMSD calculations left: %d ",nrms);
1368 fprintf(stderr,"\n\n");
1370 ffprintf_gg(stderr,log,buf,"The RMSD ranges from %g to %g nm\n",
1371 rms->minrms,rms->maxrms);
1372 ffprintf_g(stderr,log,buf,"Average RMSD is %g\n",2*rms->sumrms/(nf*(nf-1)));
1373 ffprintf_d(stderr,log,buf,"Number of structures for matrix %d\n",nf);
1374 ffprintf_g(stderr,log,buf,"Energy of the matrix is %g nm\n",mat_energy(rms));
1375 if (bUseRmsdCut && (rmsdcut < rms->minrms || rmsdcut > rms->maxrms) )
1376 fprintf(stderr,"WARNING: rmsd cutoff %g is outside range of rmsd values "
1377 "%g to %g\n",rmsdcut,rms->minrms,rms->maxrms);
1378 if (bAnalyze && (rmsmin < rms->minrms) )
1379 fprintf(stderr,"WARNING: rmsd minimum %g is below lowest rmsd value %g\n",
1380 rmsmin,rms->minrms);
1381 if (bAnalyze && (rmsmin > rmsdcut) )
1382 fprintf(stderr,"WARNING: rmsd minimum %g is above rmsd cutoff %g\n",
1385 /* Plot the rmsd distribution */
1386 rmsd_distribution(opt2fn("-dist",NFILE,fnm),rms,oenv);
1389 for(i1=0; (i1 < nf); i1++)
1390 for(i2=0; (i2 < nf); i2++)
1391 if (rms->mat[i1][i2] < rmsdcut)
1392 rms->mat[i1][i2] = 0;
1394 rms->mat[i1][i2] = 1;
1400 /* Now sort the matrix and write it out again */
1401 gather(rms,rmsdcut,&clust);
1404 /* Do a diagonalization */
1405 snew(eigenvalues,nf);
1406 snew(eigenvectors,nf*nf);
1407 memcpy(eigenvectors,rms->mat[0],nf*nf*sizeof(real));
1408 eigensolver(eigenvectors,nf,0,nf,eigenvalues,rms->mat[0]);
1409 sfree(eigenvectors);
1411 fp = xvgropen(opt2fn("-ev",NFILE,fnm),"RMSD matrix Eigenvalues",
1412 "Eigenvector index","Eigenvalues (nm\\S2\\N)",oenv);
1413 for(i=0; (i<nf); i++)
1414 fprintf(fp,"%10d %10g\n",i,eigenvalues[i]);
1418 mc_optimize(log,rms,niter,&seed,kT);
1422 case m_jarvis_patrick:
1423 jarvis_patrick(rms->nn,rms->mat,M,P,bJP_RMSD ? rmsdcut : -1,&clust);
1426 gromos(rms->nn,rms->mat,rmsdcut,&clust);
1429 gmx_fatal(FARGS,"DEATH HORROR unknown method \"%s\"",methodname[0]);
1432 if (method == m_monte_carlo || method == m_diagonalize)
1433 fprintf(stderr,"Energy of the matrix after clustering is %g nm\n",
1437 if (minstruct > 1) {
1438 ncluster = plot_clusters(nf,rms->mat,&clust,nlevels,minstruct);
1440 mark_clusters(nf,rms->mat,rms->maxrms,&clust);
1442 init_t_atoms(&useatoms,isize,FALSE);
1443 snew(usextps, isize);
1444 useatoms.resinfo = top.atoms.resinfo;
1445 for(i=0; i<isize; i++) {
1446 useatoms.atomname[i]=top.atoms.atomname[index[i]];
1447 useatoms.atom[i].resind = top.atoms.atom[index[i]].resind;
1448 useatoms.nres = max(useatoms.nres,useatoms.atom[i].resind+1);
1449 copy_rvec(xtps[index[i]],usextps[i]);
1452 analyze_clusters(nf,&clust,rms->mat,isize,&useatoms,usextps,mass,xx,time,
1453 ifsize,fitidx,iosize,outidx,
1454 bReadTraj?trx_out_fn:NULL,
1455 opt2fn_null("-sz",NFILE,fnm),
1456 opt2fn_null("-tr",NFILE,fnm),
1457 opt2fn_null("-ntr",NFILE,fnm),
1458 opt2fn_null("-clid",NFILE,fnm),
1459 bAverage, write_ncl, write_nst, rmsmin, bFit, log,
1460 rlo_bot,rhi_bot,oenv);
1464 if (bBinary && !bAnalyze)
1465 /* Make the clustering visible */
1466 for(i2=0; (i2 < nf); i2++)
1467 for(i1=i2+1; (i1 < nf); i1++)
1468 if (rms->mat[i1][i2])
1469 rms->mat[i1][i2] = rms->maxrms;
1471 fp = opt2FILE("-o",NFILE,fnm,"w");
1472 fprintf(stderr,"Writing rms distance/clustering matrix ");
1474 write_xpm(fp,0,readmat[0].title,readmat[0].legend,readmat[0].label_x,
1475 readmat[0].label_y,nf,nf,readmat[0].axis_x,readmat[0].axis_y,
1476 rms->mat,0.0,rms->maxrms,rlo_top,rhi_top,&nlevels);
1479 sprintf(buf,"Time (%s)",output_env_get_time_unit(oenv));
1480 sprintf(title,"RMS%sDeviation / Cluster Index",
1481 bRMSdist ? " Distance " : " ");
1482 if (minstruct > 1) {
1483 write_xpm_split(fp,0,title,"RMSD (nm)",buf,buf,
1484 nf,nf,time,time,rms->mat,0.0,rms->maxrms,&nlevels,
1485 rlo_top,rhi_top,0.0,(real) ncluster,
1486 &ncluster,TRUE,rlo_bot,rhi_bot);
1488 write_xpm(fp,0,title,"RMSD (nm)",buf,buf,
1489 nf,nf,time,time,rms->mat,0.0,rms->maxrms,
1490 rlo_top,rhi_top,&nlevels);
1493 fprintf(stderr,"\n");
1496 /* now show what we've done */
1497 do_view(oenv,opt2fn("-o",NFILE,fnm),"-nxy");
1498 do_view(oenv,opt2fn_null("-sz",NFILE,fnm),"-nxy");
1499 if (method == m_diagonalize)
1500 do_view(oenv,opt2fn_null("-ev",NFILE,fnm),"-nxy");
1501 do_view(oenv,opt2fn("-dist",NFILE,fnm),"-nxy");
1503 do_view(oenv,opt2fn_null("-tr",NFILE,fnm),"-nxy");
1504 do_view(oenv,opt2fn_null("-ntr",NFILE,fnm),"-nxy");
1505 do_view(oenv,opt2fn_null("-clid",NFILE,fnm),"-nxy");
1508 /* Thank the user for her patience */