2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012,2013, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
60 #include "eigensolver.h"
67 /* print to two file pointers at once (i.e. stderr and log) */
69 void lo_ffprintf(FILE *fp1, FILE *fp2, const char *buf)
71 fprintf(fp1, "%s", buf);
72 fprintf(fp2, "%s", buf);
75 /* just print a prepared buffer to fp1 and fp2 */
77 void ffprintf(FILE *fp1, FILE *fp2, const char *buf)
79 lo_ffprintf(fp1, fp2, buf);
82 /* prepare buffer with one argument, then print to fp1 and fp2 */
84 void ffprintf_d(FILE *fp1, FILE *fp2, char *buf, const char *fmt, int arg)
86 sprintf(buf, fmt, arg);
87 lo_ffprintf(fp1, fp2, buf);
90 /* prepare buffer with one argument, then print to fp1 and fp2 */
92 void ffprintf_g(FILE *fp1, FILE *fp2, char *buf, const char *fmt, real arg)
94 sprintf(buf, fmt, arg);
95 lo_ffprintf(fp1, fp2, buf);
98 /* prepare buffer with one argument, then print to fp1 and fp2 */
100 void ffprintf_s(FILE *fp1, FILE *fp2, char *buf, const char *fmt, const char *arg)
102 sprintf(buf, fmt, arg);
103 lo_ffprintf(fp1, fp2, buf);
106 /* prepare buffer with two arguments, then print to fp1 and fp2 */
108 void ffprintf_dd(FILE *fp1, FILE *fp2, char *buf, const char *fmt, int arg1, int arg2)
110 sprintf(buf, fmt, arg1, arg2);
111 lo_ffprintf(fp1, fp2, buf);
114 /* prepare buffer with two arguments, then print to fp1 and fp2 */
116 void ffprintf_gg(FILE *fp1, FILE *fp2, char *buf, const char *fmt, real arg1, real arg2)
118 sprintf(buf, fmt, arg1, arg2);
119 lo_ffprintf(fp1, fp2, buf);
122 /* prepare buffer with two arguments, then print to fp1 and fp2 */
124 void ffprintf_ss(FILE *fp1, FILE *fp2, char *buf, const char *fmt, const char *arg1, const char *arg2)
126 sprintf(buf, fmt, arg1, arg2);
127 lo_ffprintf(fp1, fp2, buf);
140 void pr_energy(FILE *fp, real e)
142 fprintf(fp, "Energy: %8.4f\n", e);
145 void cp_index(int nn, int from[], int to[])
149 for (i = 0; (i < nn); i++)
155 void mc_optimize(FILE *log, t_mat *m, int maxiter, int *seed, real kT)
157 real e[2], ei, ej, efac;
161 int i, isw, jsw, iisw, jjsw, nn;
163 fprintf(stderr, "\nDoing Monte Carlo clustering\n");
166 cp_index(nn, m->m_ind, low_index);
167 if (getenv("TESTMC"))
169 e[cur] = mat_energy(m);
170 pr_energy(log, e[cur]);
171 fprintf(log, "Doing 1000 random swaps\n");
172 for (i = 0; (i < 1000); i++)
176 isw = nn*rando(seed);
177 jsw = nn*rando(seed);
179 while ((isw == jsw) || (isw >= nn) || (jsw >= nn));
180 iisw = m->m_ind[isw];
181 jjsw = m->m_ind[jsw];
182 m->m_ind[isw] = jjsw;
183 m->m_ind[jsw] = iisw;
186 e[cur] = mat_energy(m);
187 pr_energy(log, e[cur]);
188 for (i = 0; (i < maxiter); i++)
192 isw = nn*rando(seed);
193 jsw = nn*rando(seed);
195 while ((isw == jsw) || (isw >= nn) || (jsw >= nn));
197 iisw = m->m_ind[isw];
198 jjsw = m->m_ind[jsw];
199 ei = row_energy(nn, iisw, m->mat[jsw]);
200 ej = row_energy(nn, jjsw, m->mat[isw]);
202 e[next] = e[cur] + (ei+ej-EROW(m, isw)-EROW(m, jsw))/nn;
204 efac = kT ? exp((e[next]-e[cur])/kT) : -1;
205 if ((e[next] > e[cur]) || (efac > rando(seed)))
208 if (e[next] > e[cur])
210 cp_index(nn, m->m_ind, low_index);
214 fprintf(log, "Taking uphill step\n");
217 /* Now swapping rows */
218 m->m_ind[isw] = jjsw;
219 m->m_ind[jsw] = iisw;
223 fprintf(log, "Iter: %d Swapped %4d and %4d (now %g)",
224 i, isw, jsw, mat_energy(m));
225 pr_energy(log, e[cur]);
228 /* Now restore the highest energy index */
229 cp_index(nn, low_index, m->m_ind);
232 static void calc_dist(int nind, rvec x[], real **d)
238 for (i = 0; (i < nind-1); i++)
241 for (j = i+1; (j < nind); j++)
243 /* Should use pbc_dx when analysing multiple molecueles,
244 * but the box is not stored for every frame.
246 rvec_sub(xi, x[j], dx);
252 static real rms_dist(int isize, real **d, real **d_r)
258 for (i = 0; (i < isize-1); i++)
260 for (j = i+1; (j < isize); j++)
262 r = d[i][j]-d_r[i][j];
266 r2 /= (isize*(isize-1))/2;
271 static int rms_dist_comp(const void *a, const void *b)
278 if (da->dist - db->dist < 0)
282 else if (da->dist - db->dist > 0)
289 static int clust_id_comp(const void *a, const void *b)
296 return da->clust - db->clust;
299 static int nrnb_comp(const void *a, const void *b)
306 /* return the b-a, we want highest first */
307 return db->nr - da->nr;
310 void gather(t_mat *m, real cutoff, t_clusters *clust)
314 int i, j, k, nn, cid, n1, diff;
317 /* First we sort the entries in the RMSD matrix */
321 for (i = k = 0; (i < n1); i++)
323 for (j = i+1; (j < n1); j++, k++)
327 d[k].dist = m->mat[i][j];
332 gmx_incons("gather algortihm");
334 qsort(d, nn, sizeof(d[0]), rms_dist_comp);
336 /* Now we make a cluster index for all of the conformations */
339 /* Now we check the closest structures, and equalize their cluster numbers */
340 fprintf(stderr, "Linking structures ");
343 fprintf(stderr, "*");
345 for (k = 0; (k < nn) && (d[k].dist < cutoff); k++)
347 diff = c[d[k].j].clust - c[d[k].i].clust;
353 c[d[k].j].clust = c[d[k].i].clust;
357 c[d[k].i].clust = c[d[k].j].clust;
363 fprintf(stderr, "\nSorting and renumbering clusters\n");
364 /* Sort on cluster number */
365 qsort(c, n1, sizeof(c[0]), clust_id_comp);
367 /* Renumber clusters */
369 for (k = 1; k < n1; k++)
371 if (c[k].clust != c[k-1].clust)
384 for (k = 0; (k < n1); k++)
386 fprintf(debug, "Cluster index for conformation %d: %d\n",
387 c[k].conf, c[k].clust);
391 for (k = 0; k < n1; k++)
393 clust->cl[c[k].conf] = c[k].clust;
400 gmx_bool jp_same(int **nnb, int i, int j, int P)
406 for (k = 0; nnb[i][k] >= 0; k++)
408 bIn = bIn || (nnb[i][k] == j);
416 for (k = 0; nnb[j][k] >= 0; k++)
418 bIn = bIn || (nnb[j][k] == i);
426 for (ii = 0; nnb[i][ii] >= 0; ii++)
428 for (jj = 0; nnb[j][jj] >= 0; jj++)
430 if ((nnb[i][ii] == nnb[j][jj]) && (nnb[i][ii] != -1))
440 static void jarvis_patrick(int n1, real **mat, int M, int P,
441 real rmsdcut, t_clusters *clust)
446 int i, j, k, cid, diff, max;
455 /* First we sort the entries in the RMSD matrix row by row.
456 * This gives us the nearest neighbor list.
460 for (i = 0; (i < n1); i++)
462 for (j = 0; (j < n1); j++)
465 row[j].dist = mat[i][j];
467 qsort(row, n1, sizeof(row[0]), rms_dist_comp);
470 /* Put the M nearest neighbors in the list */
472 for (j = k = 0; (k < M) && (j < n1) && (mat[i][row[j].j] < rmsdcut); j++)
476 nnb[i][k] = row[j].j;
484 /* Put all neighbors nearer than rmsdcut in the list */
487 for (j = 0; (j < n1) && (mat[i][row[j].j] < rmsdcut); j++)
496 nnb[i][k] = row[j].j;
502 srenew(nnb[i], max+1);
510 fprintf(debug, "Nearest neighborlist. M = %d, P = %d\n", M, P);
511 for (i = 0; (i < n1); i++)
513 fprintf(debug, "i:%5d nbs:", i);
514 for (j = 0; nnb[i][j] >= 0; j++)
516 fprintf(debug, "%5d[%5.3f]", nnb[i][j], mat[i][nnb[i][j]]);
518 fprintf(debug, "\n");
523 fprintf(stderr, "Linking structures ");
524 /* Use mcpy for temporary storage of booleans */
525 mcpy = mk_matrix(n1, n1, FALSE);
526 for (i = 0; i < n1; i++)
528 for (j = i+1; j < n1; j++)
530 mcpy[i][j] = jp_same(nnb, i, j, P);
535 fprintf(stderr, "*");
537 for (i = 0; i < n1; i++)
539 for (j = i+1; j < n1; j++)
543 diff = c[j].clust - c[i].clust;
549 c[j].clust = c[i].clust;
553 c[i].clust = c[j].clust;
562 fprintf(stderr, "\nSorting and renumbering clusters\n");
563 /* Sort on cluster number */
564 qsort(c, n1, sizeof(c[0]), clust_id_comp);
566 /* Renumber clusters */
568 for (k = 1; k < n1; k++)
570 if (c[k].clust != c[k-1].clust)
582 for (k = 0; k < n1; k++)
584 clust->cl[c[k].conf] = c[k].clust;
588 for (k = 0; (k < n1); k++)
590 fprintf(debug, "Cluster index for conformation %d: %d\n",
591 c[k].conf, c[k].clust);
595 /* Again, I don't see the point in this... (AF) */
596 /* for(i=0; (i<n1); i++) { */
597 /* for(j=0; (j<n1); j++) */
598 /* mcpy[c[i].conf][c[j].conf] = mat[i][j]; */
600 /* for(i=0; (i<n1); i++) { */
601 /* for(j=0; (j<n1); j++) */
602 /* mat[i][j] = mcpy[i][j]; */
604 done_matrix(n1, &mcpy);
607 for (i = 0; (i < n1); i++)
614 static void dump_nnb (FILE *fp, const char *title, int n1, t_nnb *nnb)
618 /* dump neighbor list */
619 fprintf(fp, "%s", title);
620 for (i = 0; (i < n1); i++)
622 fprintf(fp, "i:%5d #:%5d nbs:", i, nnb[i].nr);
623 for (j = 0; j < nnb[i].nr; j++)
625 fprintf(fp, "%5d", nnb[i].nb[j]);
631 static void gromos(int n1, real **mat, real rmsdcut, t_clusters *clust)
635 int i, j, k, j1, max;
637 /* Put all neighbors nearer than rmsdcut in the list */
638 fprintf(stderr, "Making list of neighbors within cutoff ");
641 for (i = 0; (i < n1); i++)
645 /* put all neighbors within cut-off in list */
646 for (j = 0; j < n1; j++)
648 if (mat[i][j] < rmsdcut)
653 srenew(nnb[i].nb, max);
659 /* store nr of neighbors, we'll need that */
661 if (i%(1+n1/100) == 0)
663 fprintf(stderr, "%3d%%\b\b\b\b", (i*100+1)/n1);
666 fprintf(stderr, "%3d%%\n", 100);
669 /* sort neighbor list on number of neighbors, largest first */
670 qsort(nnb, n1, sizeof(nnb[0]), nrnb_comp);
674 dump_nnb(debug, "Nearest neighborlist after sort.\n", n1, nnb);
677 /* turn first structure with all its neighbors (largest) into cluster
678 remove them from pool of structures and repeat for all remaining */
679 fprintf(stderr, "Finding clusters %4d", 0);
680 /* cluster id's start at 1: */
684 /* set cluster id (k) for first item in neighborlist */
685 for (j = 0; j < nnb[0].nr; j++)
687 clust->cl[nnb[0].nb[j]] = k;
693 /* adjust number of neighbors for others, taking removals into account: */
694 for (i = 1; i < n1 && nnb[i].nr; i++)
697 for (j = 0; j < nnb[i].nr; j++)
699 /* if this neighbor wasn't removed */
700 if (clust->cl[nnb[i].nb[j]] == 0)
702 /* shift the rest (j1<=j) */
703 nnb[i].nb[j1] = nnb[i].nb[j];
708 /* now j1 is the new number of neighbors */
711 /* sort again on nnb[].nr, because we have new # neighbors: */
712 /* but we only need to sort upto i, i.e. when nnb[].nr>0 */
713 qsort(nnb, i, sizeof(nnb[0]), nrnb_comp);
715 fprintf(stderr, "\b\b\b\b%4d", k);
719 fprintf(stderr, "\n");
723 fprintf(debug, "Clusters (%d):\n", k);
724 for (i = 0; i < n1; i++)
726 fprintf(debug, " %3d", clust->cl[i]);
728 fprintf(debug, "\n");
734 rvec **read_whole_trj(const char *fn, int isize, atom_id index[], int skip,
735 int *nframe, real **time, const output_env_t oenv, gmx_bool bPBC, gmx_rmpbc_t gpbc)
740 int i, i0, j, max_nf;
748 natom = read_first_x(oenv, &status, fn, &t, &x, box);
755 gmx_rmpbc(gpbc, natom, box, x);
761 srenew(*time, max_nf);
766 /* Store only the interesting atoms */
767 for (j = 0; (j < isize); j++)
769 copy_rvec(x[index[j]], xx[i0][j]);
776 while (read_next_x(oenv, status, &t, natom, x, box));
777 fprintf(stderr, "Allocated %lu bytes for frames\n",
778 (unsigned long) (max_nf*isize*sizeof(**xx)));
779 fprintf(stderr, "Read %d frames from trajectory %s\n", i0, fn);
786 static int plot_clusters(int nf, real **mat, t_clusters *clust,
787 int nlevels, int minstruct)
789 int i, j, ncluster, ci;
790 int *cl_id, *nstruct, *strind;
795 for (i = 0; i < nf; i++)
798 cl_id[i] = clust->cl[i];
802 for (i = 0; i < nf; i++)
804 if (nstruct[i] >= minstruct)
807 for (j = 0; (j < nf); j++)
811 strind[j] = ncluster;
817 fprintf(stderr, "There are %d clusters with at least %d conformations\n",
818 ncluster, minstruct);
820 for (i = 0; (i < nf); i++)
823 for (j = 0; j < i; j++)
825 if ((ci == cl_id[j]) && (nstruct[ci] >= minstruct))
827 /* color different clusters with different colors, as long as
828 we don't run out of colors */
829 mat[i][j] = strind[i];
844 static void mark_clusters(int nf, real **mat, real val, t_clusters *clust)
848 for (i = 0; i < nf; i++)
850 for (j = 0; j < i; j++)
852 if (clust->cl[i] == clust->cl[j])
864 static char *parse_filename(const char *fn, int maxnr)
873 gmx_fatal(FARGS, "will not number filename %s containing '%c'", fn, '%');
875 /* number of digits needed in numbering */
876 i = (int)(log(maxnr)/log(10)) + 1;
877 /* split fn and ext */
878 ext = strrchr(fn, '.');
881 gmx_fatal(FARGS, "cannot separate extension in filename %s", fn);
884 /* insert e.g. '%03d' between fn and ext */
885 sprintf(buf, "%s%%0%dd.%s", fn, i, ext);
886 snew(fnout, strlen(buf)+1);
892 static void ana_trans(t_clusters *clust, int nf,
893 const char *transfn, const char *ntransfn, FILE *log,
894 t_rgb rlo, t_rgb rhi, const output_env_t oenv)
899 int i, ntranst, maxtrans;
902 snew(ntrans, clust->ncl);
903 snew(trans, clust->ncl);
904 snew(axis, clust->ncl);
905 for (i = 0; i < clust->ncl; i++)
908 snew(trans[i], clust->ncl);
912 for (i = 1; i < nf; i++)
914 if (clust->cl[i] != clust->cl[i-1])
917 ntrans[clust->cl[i-1]-1]++;
918 ntrans[clust->cl[i]-1]++;
919 trans[clust->cl[i-1]-1][clust->cl[i]-1]++;
920 maxtrans = max(maxtrans, trans[clust->cl[i]-1][clust->cl[i-1]-1]);
923 ffprintf_dd(stderr, log, buf, "Counted %d transitions in total, "
924 "max %d between two specific clusters\n", ntranst, maxtrans);
927 fp = ffopen(transfn, "w");
928 i = min(maxtrans+1, 80);
929 write_xpm(fp, 0, "Cluster Transitions", "# transitions",
930 "from cluster", "to cluster",
931 clust->ncl, clust->ncl, axis, axis, trans,
932 0, maxtrans, rlo, rhi, &i);
937 fp = xvgropen(ntransfn, "Cluster Transitions", "Cluster #", "# transitions",
939 for (i = 0; i < clust->ncl; i++)
941 fprintf(fp, "%5d %5d\n", i+1, ntrans[i]);
946 for (i = 0; i < clust->ncl; i++)
954 static void analyze_clusters(int nf, t_clusters *clust, real **rmsd,
955 int natom, t_atoms *atoms, rvec *xtps,
956 real *mass, rvec **xx, real *time,
957 int ifsize, atom_id *fitidx,
958 int iosize, atom_id *outidx,
959 const char *trxfn, const char *sizefn,
960 const char *transfn, const char *ntransfn,
961 const char *clustidfn, gmx_bool bAverage,
962 int write_ncl, int write_nst, real rmsmin,
963 gmx_bool bFit, FILE *log, t_rgb rlo, t_rgb rhi,
964 const output_env_t oenv)
967 char buf[STRLEN], buf1[40], buf2[40], buf3[40], *trxsfn;
968 t_trxstatus *trxout = NULL;
969 t_trxstatus *trxsout = NULL;
970 int i, i1, cl, nstr, *structure, first = 0, midstr;
971 gmx_bool *bWrite = NULL;
972 real r, clrmsd, midrmsd;
978 ffprintf_d(stderr, log, buf, "\nFound %d clusters\n\n", clust->ncl);
982 /* do we write all structures? */
985 trxsfn = parse_filename(trxfn, max(write_ncl, clust->ncl));
988 ffprintf_ss(stderr, log, buf, "Writing %s structure for each cluster to %s\n",
989 bAverage ? "average" : "middle", trxfn);
992 /* find out what we want to tell the user:
993 Writing [all structures|structures with rmsd > %g] for
994 {all|first %d} clusters {with more than %d structures} to %s */
997 sprintf(buf1, "structures with rmsd > %g", rmsmin);
1001 sprintf(buf1, "all structures");
1003 buf2[0] = buf3[0] = '\0';
1004 if (write_ncl >= clust->ncl)
1008 sprintf(buf2, "all ");
1013 sprintf(buf2, "the first %d ", write_ncl);
1017 sprintf(buf3, " with more than %d structures", write_nst);
1019 sprintf(buf, "Writing %s for %sclusters%s to %s\n", buf1, buf2, buf3, trxsfn);
1020 ffprintf(stderr, log, buf);
1023 /* Prepare a reference structure for the orientation of the clusters */
1026 reset_x(ifsize, fitidx, natom, NULL, xtps, mass);
1028 trxout = open_trx(trxfn, "w");
1029 /* Calculate the average structure in each cluster, *
1030 * all structures are fitted to the first struture of the cluster */
1034 if (transfn || ntransfn)
1036 ana_trans(clust, nf, transfn, ntransfn, log, rlo, rhi, oenv);
1041 fp = xvgropen(clustidfn, "Clusters", output_env_get_xvgr_tlabel(oenv), "Cluster #", oenv);
1042 if(output_env_get_print_xvgr_codes(oenv))
1044 fprintf(fp, "@ s0 symbol 2\n");
1045 fprintf(fp, "@ s0 symbol size 0.2\n");
1046 fprintf(fp, "@ s0 linestyle 0\n");
1048 for (i = 0; i < nf; i++)
1050 fprintf(fp, "%8g %8d\n", time[i], clust->cl[i]);
1056 fp = xvgropen(sizefn, "Cluster Sizes", "Cluster #", "# Structures", oenv);
1057 if(output_env_get_print_xvgr_codes(oenv))
1059 fprintf(fp, "@g%d type %s\n", 0, "bar");
1062 snew(structure, nf);
1063 fprintf(log, "\n%3s | %3s %4s | %6s %4s | cluster members\n",
1064 "cl.", "#st", "rmsd", "middle", "rmsd");
1065 for (cl = 1; cl <= clust->ncl; cl++)
1067 /* prepare structures (fit, middle, average) */
1070 for (i = 0; i < natom; i++)
1076 for (i1 = 0; i1 < nf; i1++)
1078 if (clust->cl[i1] == cl)
1080 structure[nstr] = i1;
1082 if (trxfn && (bAverage || write_ncl) )
1086 reset_x(ifsize, fitidx, natom, NULL, xx[i1], mass);
1094 do_fit(natom, mass, xx[first], xx[i1]);
1098 for (i = 0; i < natom; i++)
1100 rvec_inc(xav[i], xx[i1][i]);
1108 fprintf(fp, "%8d %8d\n", cl, nstr);
1113 for (i1 = 0; i1 < nstr; i1++)
1118 for (i = 0; i < nstr; i++)
1122 r += rmsd[structure[i]][structure[i1]];
1126 r += rmsd[structure[i1]][structure[i]];
1133 midstr = structure[i1];
1140 /* dump cluster info to logfile */
1143 sprintf(buf1, "%6.3f", clrmsd);
1148 sprintf(buf2, "%5.3f", midrmsd);
1156 sprintf(buf1, "%5s", "");
1157 sprintf(buf2, "%5s", "");
1159 fprintf(log, "%3d | %3d %s | %6g%s |", cl, nstr, buf1, time[midstr], buf2);
1160 for (i = 0; i < nstr; i++)
1162 if ((i % 7 == 0) && i)
1164 sprintf(buf, "\n%3s | %3s %4s | %6s %4s |", "", "", "", "", "");
1171 fprintf(log, "%s %6g", buf, time[i1]);
1175 /* write structures to trajectory file(s) */
1180 for (i = 0; i < nstr; i++)
1185 if (cl < write_ncl+1 && nstr > write_nst)
1187 /* Dump all structures for this cluster */
1188 /* generate numbered filename (there is a %d in trxfn!) */
1189 sprintf(buf, trxsfn, cl);
1190 trxsout = open_trx(buf, "w");
1191 for (i = 0; i < nstr; i++)
1196 for (i1 = 0; i1 < i && bWrite[i]; i1++)
1200 bWrite[i] = rmsd[structure[i1]][structure[i]] > rmsmin;
1206 write_trx(trxsout, iosize, outidx, atoms, i, time[structure[i]], zerobox,
1207 xx[structure[i]], NULL, NULL);
1212 /* Dump the average structure for this cluster */
1215 for (i = 0; i < natom; i++)
1217 svmul(1.0/nstr, xav[i], xav[i]);
1222 for (i = 0; i < natom; i++)
1224 copy_rvec(xx[midstr][i], xav[i]);
1228 reset_x(ifsize, fitidx, natom, NULL, xav, mass);
1233 do_fit(natom, mass, xtps, xav);
1236 write_trx(trxout, iosize, outidx, atoms, cl, time[midstr], zerobox, xav, NULL, NULL);
1256 static void convert_mat(t_matrix *mat, t_mat *rms)
1261 matrix2real(mat, rms->mat);
1262 /* free input xpm matrix data */
1263 for (i = 0; i < mat->nx; i++)
1265 sfree(mat->matrix[i]);
1269 for (i = 0; i < mat->nx; i++)
1271 for (j = i; j < mat->nx; j++)
1273 rms->sumrms += rms->mat[i][j];
1274 rms->maxrms = max(rms->maxrms, rms->mat[i][j]);
1277 rms->minrms = min(rms->minrms, rms->mat[i][j]);
1284 int gmx_cluster(int argc, char *argv[])
1286 const char *desc[] = {
1287 "[TT]g_cluster[tt] can cluster structures using several different methods.",
1288 "Distances between structures can be determined from a trajectory",
1289 "or read from an [TT].xpm[tt] matrix file with the [TT]-dm[tt] option.",
1290 "RMS deviation after fitting or RMS deviation of atom-pair distances",
1291 "can be used to define the distance between structures.[PAR]",
1293 "single linkage: add a structure to a cluster when its distance to any",
1294 "element of the cluster is less than [TT]cutoff[tt].[PAR]",
1296 "Jarvis Patrick: add a structure to a cluster when this structure",
1297 "and a structure in the cluster have each other as neighbors and",
1298 "they have a least [TT]P[tt] neighbors in common. The neighbors",
1299 "of a structure are the M closest structures or all structures within",
1300 "[TT]cutoff[tt].[PAR]",
1302 "Monte Carlo: reorder the RMSD matrix using Monte Carlo.[PAR]",
1304 "diagonalization: diagonalize the RMSD matrix.[PAR]",
1306 "gromos: use algorithm as described in Daura [IT]et al.[it]",
1307 "([IT]Angew. Chem. Int. Ed.[it] [BB]1999[bb], [IT]38[it], pp 236-240).",
1308 "Count number of neighbors using cut-off, take structure with",
1309 "largest number of neighbors with all its neighbors as cluster",
1310 "and eliminate it from the pool of clusters. Repeat for remaining",
1311 "structures in pool.[PAR]",
1313 "When the clustering algorithm assigns each structure to exactly one",
1314 "cluster (single linkage, Jarvis Patrick and gromos) and a trajectory",
1315 "file is supplied, the structure with",
1316 "the smallest average distance to the others or the average structure",
1317 "or all structures for each cluster will be written to a trajectory",
1318 "file. When writing all structures, separate numbered files are made",
1319 "for each cluster.[PAR]",
1321 "Two output files are always written:[BR]",
1322 "[TT]-o[tt] writes the RMSD values in the upper left half of the matrix",
1323 "and a graphical depiction of the clusters in the lower right half",
1324 "When [TT]-minstruct[tt] = 1 the graphical depiction is black",
1325 "when two structures are in the same cluster.",
1326 "When [TT]-minstruct[tt] > 1 different colors will be used for each",
1328 "[TT]-g[tt] writes information on the options used and a detailed list",
1329 "of all clusters and their members.[PAR]",
1331 "Additionally, a number of optional output files can be written:[BR]",
1332 "[TT]-dist[tt] writes the RMSD distribution.[BR]",
1333 "[TT]-ev[tt] writes the eigenvectors of the RMSD matrix",
1334 "diagonalization.[BR]",
1335 "[TT]-sz[tt] writes the cluster sizes.[BR]",
1336 "[TT]-tr[tt] writes a matrix of the number transitions between",
1337 "cluster pairs.[BR]",
1338 "[TT]-ntr[tt] writes the total number of transitions to or from",
1339 "each cluster.[BR]",
1340 "[TT]-clid[tt] writes the cluster number as a function of time.[BR]",
1341 "[TT]-cl[tt] writes average (with option [TT]-av[tt]) or central",
1342 "structure of each cluster or writes numbered files with cluster members",
1343 "for a selected set of clusters (with option [TT]-wcl[tt], depends on",
1344 "[TT]-nst[tt] and [TT]-rmsmin[tt]). The center of a cluster is the",
1345 "structure with the smallest average RMSD from all other structures",
1346 "of the cluster.[BR]",
1350 int nf, i, i1, i2, j;
1351 gmx_large_int_t nrms = 0;
1354 rvec *xtps, *usextps, *x1, **xx = NULL;
1355 const char *fn, *trx_out_fn;
1362 t_matrix *readmat = NULL;
1365 int isize = 0, ifsize = 0, iosize = 0;
1366 atom_id *index = NULL, *fitidx, *outidx;
1368 real rmsd, **d1, **d2, *time = NULL, time_invfac, *mass = NULL;
1369 char buf[STRLEN], buf1[80], title[STRLEN];
1370 gmx_bool bAnalyze, bUseRmsdCut, bJP_RMSD = FALSE, bReadMat, bReadTraj, bPBC = TRUE;
1372 int method, ncluster = 0;
1373 static const char *methodname[] = {
1374 NULL, "linkage", "jarvis-patrick", "monte-carlo",
1375 "diagonalization", "gromos", NULL
1378 m_null, m_linkage, m_jarvis_patrick,
1379 m_monte_carlo, m_diagonalize, m_gromos, m_nr
1381 /* Set colors for plotting: white = zero RMS, black = maximum */
1382 static t_rgb rlo_top = { 1.0, 1.0, 1.0 };
1383 static t_rgb rhi_top = { 0.0, 0.0, 0.0 };
1384 static t_rgb rlo_bot = { 1.0, 1.0, 1.0 };
1385 static t_rgb rhi_bot = { 0.0, 0.0, 1.0 };
1386 static int nlevels = 40, skip = 1;
1387 static real scalemax = -1.0, rmsdcut = 0.1, rmsmin = 0.0;
1388 gmx_bool bRMSdist = FALSE, bBinary = FALSE, bAverage = FALSE, bFit = TRUE;
1389 static int niter = 10000, seed = 1993, write_ncl = 0, write_nst = 1, minstruct = 1;
1390 static real kT = 1e-3;
1391 static int M = 10, P = 3;
1393 gmx_rmpbc_t gpbc = NULL;
1396 { "-dista", FALSE, etBOOL, {&bRMSdist},
1397 "Use RMSD of distances instead of RMS deviation" },
1398 { "-nlevels", FALSE, etINT, {&nlevels},
1399 "Discretize RMSD matrix in this number of levels" },
1400 { "-cutoff", FALSE, etREAL, {&rmsdcut},
1401 "RMSD cut-off (nm) for two structures to be neighbor" },
1402 { "-fit", FALSE, etBOOL, {&bFit},
1403 "Use least squares fitting before RMSD calculation" },
1404 { "-max", FALSE, etREAL, {&scalemax},
1405 "Maximum level in RMSD matrix" },
1406 { "-skip", FALSE, etINT, {&skip},
1407 "Only analyze every nr-th frame" },
1408 { "-av", FALSE, etBOOL, {&bAverage},
1409 "Write average iso middle structure for each cluster" },
1410 { "-wcl", FALSE, etINT, {&write_ncl},
1411 "Write the structures for this number of clusters to numbered files" },
1412 { "-nst", FALSE, etINT, {&write_nst},
1413 "Only write all structures if more than this number of structures per cluster" },
1414 { "-rmsmin", FALSE, etREAL, {&rmsmin},
1415 "minimum rms difference with rest of cluster for writing structures" },
1416 { "-method", FALSE, etENUM, {methodname},
1417 "Method for cluster determination" },
1418 { "-minstruct", FALSE, etINT, {&minstruct},
1419 "Minimum number of structures in cluster for coloring in the [TT].xpm[tt] file" },
1420 { "-binary", FALSE, etBOOL, {&bBinary},
1421 "Treat the RMSD matrix as consisting of 0 and 1, where the cut-off "
1422 "is given by [TT]-cutoff[tt]" },
1423 { "-M", FALSE, etINT, {&M},
1424 "Number of nearest neighbors considered for Jarvis-Patrick algorithm, "
1425 "0 is use cutoff" },
1426 { "-P", FALSE, etINT, {&P},
1427 "Number of identical nearest neighbors required to form a cluster" },
1428 { "-seed", FALSE, etINT, {&seed},
1429 "Random number seed for Monte Carlo clustering algorithm" },
1430 { "-niter", FALSE, etINT, {&niter},
1431 "Number of iterations for MC" },
1432 { "-kT", FALSE, etREAL, {&kT},
1433 "Boltzmann weighting factor for Monte Carlo optimization "
1434 "(zero turns off uphill steps)" },
1435 { "-pbc", FALSE, etBOOL,
1436 { &bPBC }, "PBC check" }
1439 { efTRX, "-f", NULL, ffOPTRD },
1440 { efTPS, "-s", NULL, ffOPTRD },
1441 { efNDX, NULL, NULL, ffOPTRD },
1442 { efXPM, "-dm", "rmsd", ffOPTRD },
1443 { efXPM, "-o", "rmsd-clust", ffWRITE },
1444 { efLOG, "-g", "cluster", ffWRITE },
1445 { efXVG, "-dist", "rmsd-dist", ffOPTWR },
1446 { efXVG, "-ev", "rmsd-eig", ffOPTWR },
1447 { efXVG, "-sz", "clust-size", ffOPTWR},
1448 { efXPM, "-tr", "clust-trans", ffOPTWR},
1449 { efXVG, "-ntr", "clust-trans", ffOPTWR},
1450 { efXVG, "-clid", "clust-id.xvg", ffOPTWR},
1451 { efTRX, "-cl", "clusters.pdb", ffOPTWR }
1453 #define NFILE asize(fnm)
1455 CopyRight(stderr, argv[0]);
1456 parse_common_args(&argc, argv,
1457 PCA_CAN_VIEW | PCA_CAN_TIME | PCA_TIME_UNIT | PCA_BE_NICE,
1458 NFILE, fnm, asize(pa), pa, asize(desc), desc, 0, NULL,
1462 bReadMat = opt2bSet("-dm", NFILE, fnm);
1463 bReadTraj = opt2bSet("-f", NFILE, fnm) || !bReadMat;
1464 if (opt2parg_bSet("-av", asize(pa), pa) ||
1465 opt2parg_bSet("-wcl", asize(pa), pa) ||
1466 opt2parg_bSet("-nst", asize(pa), pa) ||
1467 opt2parg_bSet("-rmsmin", asize(pa), pa) ||
1468 opt2bSet("-cl", NFILE, fnm) )
1470 trx_out_fn = opt2fn("-cl", NFILE, fnm);
1476 if (bReadMat && output_env_get_time_factor(oenv) != 1)
1479 "\nWarning: assuming the time unit in %s is %s\n",
1480 opt2fn("-dm", NFILE, fnm), output_env_get_time_unit(oenv));
1482 if (trx_out_fn && !bReadTraj)
1484 fprintf(stderr, "\nWarning: "
1485 "cannot write cluster structures without reading trajectory\n"
1486 " ignoring option -cl %s\n", trx_out_fn);
1490 while (method < m_nr && gmx_strcasecmp(methodname[0], methodname[method]) != 0)
1496 gmx_fatal(FARGS, "Invalid method");
1499 bAnalyze = (method == m_linkage || method == m_jarvis_patrick ||
1500 method == m_gromos );
1503 log = ftp2FILE(efLOG, NFILE, fnm, "w");
1505 fprintf(stderr, "Using %s method for clustering\n", methodname[0]);
1506 fprintf(log, "Using %s method for clustering\n", methodname[0]);
1508 /* check input and write parameters to log file */
1509 bUseRmsdCut = FALSE;
1510 if (method == m_jarvis_patrick)
1512 bJP_RMSD = (M == 0) || opt2parg_bSet("-cutoff", asize(pa), pa);
1513 if ((M < 0) || (M == 1))
1515 gmx_fatal(FARGS, "M (%d) must be 0 or larger than 1", M);
1519 sprintf(buf1, "Will use P=%d and RMSD cutoff (%g)", P, rmsdcut);
1526 gmx_fatal(FARGS, "Number of neighbors required (P) must be less than M");
1530 sprintf(buf1, "Will use P=%d, M=%d and RMSD cutoff (%g)", P, M, rmsdcut);
1535 sprintf(buf1, "Will use P=%d, M=%d", P, M);
1538 ffprintf_s(stderr, log, buf, "%s for determining the neighbors\n\n", buf1);
1540 else /* method != m_jarvis */
1542 bUseRmsdCut = ( bBinary || method == m_linkage || method == m_gromos );
1544 if (bUseRmsdCut && method != m_jarvis_patrick)
1546 fprintf(log, "Using RMSD cutoff %g nm\n", rmsdcut);
1548 if (method == m_monte_carlo)
1550 fprintf(log, "Using %d iterations\n", niter);
1555 gmx_fatal(FARGS, "skip (%d) should be >= 1", skip);
1561 /* don't read mass-database as masses (and top) are not used */
1562 read_tps_conf(ftp2fn(efTPS, NFILE, fnm), buf, &top, &ePBC, &xtps, NULL, box,
1566 gpbc = gmx_rmpbc_init(&top.idef, ePBC, top.atoms.nr, box);
1569 fprintf(stderr, "\nSelect group for least squares fit%s:\n",
1570 bReadMat ? "" : " and RMSD calculation");
1571 get_index(&(top.atoms), ftp2fn_null(efNDX, NFILE, fnm),
1572 1, &ifsize, &fitidx, &grpname);
1575 fprintf(stderr, "\nSelect group for output:\n");
1576 get_index(&(top.atoms), ftp2fn_null(efNDX, NFILE, fnm),
1577 1, &iosize, &outidx, &grpname);
1578 /* merge and convert both index groups: */
1579 /* first copy outidx to index. let outidx refer to elements in index */
1580 snew(index, iosize);
1582 for (i = 0; i < iosize; i++)
1584 index[i] = outidx[i];
1587 /* now lookup elements from fitidx in index, add them if necessary
1588 and also let fitidx refer to elements in index */
1589 for (i = 0; i < ifsize; i++)
1592 while (j < isize && index[j] != fitidx[i])
1598 /* slow this way, but doesn't matter much */
1600 srenew(index, isize);
1602 index[j] = fitidx[i];
1606 else /* !trx_out_fn */
1610 for (i = 0; i < ifsize; i++)
1612 index[i] = fitidx[i];
1620 /* Loop over first coordinate file */
1621 fn = opt2fn("-f", NFILE, fnm);
1623 xx = read_whole_trj(fn, isize, index, skip, &nf, &time, oenv, bPBC, gpbc);
1624 output_env_conv_times(oenv, nf, time);
1625 if (!bRMSdist || bAnalyze)
1627 /* Center all frames on zero */
1629 for (i = 0; i < ifsize; i++)
1631 mass[fitidx[i]] = top.atoms.atom[index[fitidx[i]]].m;
1635 for (i = 0; i < nf; i++)
1637 reset_x(ifsize, fitidx, isize, NULL, xx[i], mass);
1643 gmx_rmpbc_done(gpbc);
1649 fprintf(stderr, "Reading rms distance matrix ");
1650 read_xpm_matrix(opt2fn("-dm", NFILE, fnm), &readmat);
1651 fprintf(stderr, "\n");
1652 if (readmat[0].nx != readmat[0].ny)
1654 gmx_fatal(FARGS, "Matrix (%dx%d) is not square",
1655 readmat[0].nx, readmat[0].ny);
1657 if (bReadTraj && bAnalyze && (readmat[0].nx != nf))
1659 gmx_fatal(FARGS, "Matrix size (%dx%d) does not match the number of "
1660 "frames (%d)", readmat[0].nx, readmat[0].ny, nf);
1665 time = readmat[0].axis_x;
1666 time_invfac = output_env_get_time_invfactor(oenv);
1667 for (i = 0; i < nf; i++)
1669 time[i] *= time_invfac;
1672 rms = init_mat(readmat[0].nx, method == m_diagonalize);
1673 convert_mat(&(readmat[0]), rms);
1675 nlevels = readmat[0].nmap;
1677 else /* !bReadMat */
1679 rms = init_mat(nf, method == m_diagonalize);
1680 nrms = ((gmx_large_int_t)nf*((gmx_large_int_t)nf-1))/2;
1683 fprintf(stderr, "Computing %dx%d RMS deviation matrix\n", nf, nf);
1684 /* Initialize work array */
1686 for (i1 = 0; i1 < nf; i1++)
1688 for (i2 = i1+1; i2 < nf; i2++)
1690 for (i = 0; i < isize; i++)
1692 copy_rvec(xx[i1][i], x1[i]);
1696 do_fit(isize, mass, xx[i2], x1);
1698 rmsd = rmsdev(isize, mass, xx[i2], x1);
1699 set_mat_entry(rms, i1, i2, rmsd);
1701 nrms -= (gmx_large_int_t) (nf-i1-1);
1702 fprintf(stderr, "\r# RMSD calculations left: "gmx_large_int_pfmt" ", nrms);
1708 fprintf(stderr, "Computing %dx%d RMS distance deviation matrix\n", nf, nf);
1710 /* Initiate work arrays */
1713 for (i = 0; (i < isize); i++)
1718 for (i1 = 0; i1 < nf ; i1++)
1720 calc_dist(isize, xx[i1], d1);
1721 for (i2 = i1+1; (i2 < nf); i2++)
1723 calc_dist(isize, xx[i2], d2);
1724 set_mat_entry(rms, i1, i2, rms_dist(isize, d1, d2));
1727 fprintf(stderr, "\r# RMSD calculations left: "gmx_large_int_pfmt" ", nrms);
1729 /* Clean up work arrays */
1730 for (i = 0; (i < isize); i++)
1738 fprintf(stderr, "\n\n");
1740 ffprintf_gg(stderr, log, buf, "The RMSD ranges from %g to %g nm\n",
1741 rms->minrms, rms->maxrms);
1742 ffprintf_g(stderr, log, buf, "Average RMSD is %g\n", 2*rms->sumrms/(nf*(nf-1)));
1743 ffprintf_d(stderr, log, buf, "Number of structures for matrix %d\n", nf);
1744 ffprintf_g(stderr, log, buf, "Energy of the matrix is %g nm\n", mat_energy(rms));
1745 if (bUseRmsdCut && (rmsdcut < rms->minrms || rmsdcut > rms->maxrms) )
1747 fprintf(stderr, "WARNING: rmsd cutoff %g is outside range of rmsd values "
1748 "%g to %g\n", rmsdcut, rms->minrms, rms->maxrms);
1750 if (bAnalyze && (rmsmin < rms->minrms) )
1752 fprintf(stderr, "WARNING: rmsd minimum %g is below lowest rmsd value %g\n",
1753 rmsmin, rms->minrms);
1755 if (bAnalyze && (rmsmin > rmsdcut) )
1757 fprintf(stderr, "WARNING: rmsd minimum %g is above rmsd cutoff %g\n",
1761 /* Plot the rmsd distribution */
1762 rmsd_distribution(opt2fn("-dist", NFILE, fnm), rms, oenv);
1766 for (i1 = 0; (i1 < nf); i1++)
1768 for (i2 = 0; (i2 < nf); i2++)
1770 if (rms->mat[i1][i2] < rmsdcut)
1772 rms->mat[i1][i2] = 0;
1776 rms->mat[i1][i2] = 1;
1786 /* Now sort the matrix and write it out again */
1787 gather(rms, rmsdcut, &clust);
1790 /* Do a diagonalization */
1791 snew(eigenvalues, nf);
1792 snew(eigenvectors, nf*nf);
1793 memcpy(eigenvectors, rms->mat[0], nf*nf*sizeof(real));
1794 eigensolver(eigenvectors, nf, 0, nf, eigenvalues, rms->mat[0]);
1795 sfree(eigenvectors);
1797 fp = xvgropen(opt2fn("-ev", NFILE, fnm), "RMSD matrix Eigenvalues",
1798 "Eigenvector index", "Eigenvalues (nm\\S2\\N)", oenv);
1799 for (i = 0; (i < nf); i++)
1801 fprintf(fp, "%10d %10g\n", i, eigenvalues[i]);
1806 mc_optimize(log, rms, niter, &seed, kT);
1810 case m_jarvis_patrick:
1811 jarvis_patrick(rms->nn, rms->mat, M, P, bJP_RMSD ? rmsdcut : -1, &clust);
1814 gromos(rms->nn, rms->mat, rmsdcut, &clust);
1817 gmx_fatal(FARGS, "DEATH HORROR unknown method \"%s\"", methodname[0]);
1820 if (method == m_monte_carlo || method == m_diagonalize)
1822 fprintf(stderr, "Energy of the matrix after clustering is %g nm\n",
1830 ncluster = plot_clusters(nf, rms->mat, &clust, nlevels, minstruct);
1834 mark_clusters(nf, rms->mat, rms->maxrms, &clust);
1836 init_t_atoms(&useatoms, isize, FALSE);
1837 snew(usextps, isize);
1838 useatoms.resinfo = top.atoms.resinfo;
1839 for (i = 0; i < isize; i++)
1841 useatoms.atomname[i] = top.atoms.atomname[index[i]];
1842 useatoms.atom[i].resind = top.atoms.atom[index[i]].resind;
1843 useatoms.nres = max(useatoms.nres, useatoms.atom[i].resind+1);
1844 copy_rvec(xtps[index[i]], usextps[i]);
1846 useatoms.nr = isize;
1847 analyze_clusters(nf, &clust, rms->mat, isize, &useatoms, usextps, mass, xx, time,
1848 ifsize, fitidx, iosize, outidx,
1849 bReadTraj ? trx_out_fn : NULL,
1850 opt2fn_null("-sz", NFILE, fnm),
1851 opt2fn_null("-tr", NFILE, fnm),
1852 opt2fn_null("-ntr", NFILE, fnm),
1853 opt2fn_null("-clid", NFILE, fnm),
1854 bAverage, write_ncl, write_nst, rmsmin, bFit, log,
1855 rlo_bot, rhi_bot, oenv);
1859 if (bBinary && !bAnalyze)
1861 /* Make the clustering visible */
1862 for (i2 = 0; (i2 < nf); i2++)
1864 for (i1 = i2+1; (i1 < nf); i1++)
1866 if (rms->mat[i1][i2])
1868 rms->mat[i1][i2] = rms->maxrms;
1874 fp = opt2FILE("-o", NFILE, fnm, "w");
1875 fprintf(stderr, "Writing rms distance/clustering matrix ");
1878 write_xpm(fp, 0, readmat[0].title, readmat[0].legend, readmat[0].label_x,
1879 readmat[0].label_y, nf, nf, readmat[0].axis_x, readmat[0].axis_y,
1880 rms->mat, 0.0, rms->maxrms, rlo_top, rhi_top, &nlevels);
1884 sprintf(buf, "Time (%s)", output_env_get_time_unit(oenv));
1885 sprintf(title, "RMS%sDeviation / Cluster Index",
1886 bRMSdist ? " Distance " : " ");
1889 write_xpm_split(fp, 0, title, "RMSD (nm)", buf, buf,
1890 nf, nf, time, time, rms->mat, 0.0, rms->maxrms, &nlevels,
1891 rlo_top, rhi_top, 0.0, (real) ncluster,
1892 &ncluster, TRUE, rlo_bot, rhi_bot);
1896 write_xpm(fp, 0, title, "RMSD (nm)", buf, buf,
1897 nf, nf, time, time, rms->mat, 0.0, rms->maxrms,
1898 rlo_top, rhi_top, &nlevels);
1901 fprintf(stderr, "\n");
1904 /* now show what we've done */
1905 do_view(oenv, opt2fn("-o", NFILE, fnm), "-nxy");
1906 do_view(oenv, opt2fn_null("-sz", NFILE, fnm), "-nxy");
1907 if (method == m_diagonalize)
1909 do_view(oenv, opt2fn_null("-ev", NFILE, fnm), "-nxy");
1911 do_view(oenv, opt2fn("-dist", NFILE, fnm), "-nxy");
1914 do_view(oenv, opt2fn_null("-tr", NFILE, fnm), "-nxy");
1915 do_view(oenv, opt2fn_null("-ntr", NFILE, fnm), "-nxy");
1916 do_view(oenv, opt2fn_null("-clid", NFILE, fnm), "-nxy");
1919 /* Thank the user for her patience */