2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012,2013, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
60 #include "eigensolver.h"
67 /* print to two file pointers at once (i.e. stderr and log) */
69 void lo_ffprintf(FILE *fp1, FILE *fp2, const char *buf)
71 fprintf(fp1, "%s", buf);
72 fprintf(fp2, "%s", buf);
75 /* just print a prepared buffer to fp1 and fp2 */
77 void ffprintf(FILE *fp1, FILE *fp2, const char *buf)
79 lo_ffprintf(fp1, fp2, buf);
82 /* prepare buffer with one argument, then print to fp1 and fp2 */
84 void ffprintf_d(FILE *fp1, FILE *fp2, char *buf, const char *fmt, int arg)
86 sprintf(buf, fmt, arg);
87 lo_ffprintf(fp1, fp2, buf);
90 /* prepare buffer with one argument, then print to fp1 and fp2 */
92 void ffprintf_g(FILE *fp1, FILE *fp2, char *buf, const char *fmt, real arg)
94 sprintf(buf, fmt, arg);
95 lo_ffprintf(fp1, fp2, buf);
98 /* prepare buffer with one argument, then print to fp1 and fp2 */
100 void ffprintf_s(FILE *fp1, FILE *fp2, char *buf, const char *fmt, const char *arg)
102 sprintf(buf, fmt, arg);
103 lo_ffprintf(fp1, fp2, buf);
106 /* prepare buffer with two arguments, then print to fp1 and fp2 */
108 void ffprintf_dd(FILE *fp1, FILE *fp2, char *buf, const char *fmt, int arg1, int arg2)
110 sprintf(buf, fmt, arg1, arg2);
111 lo_ffprintf(fp1, fp2, buf);
114 /* prepare buffer with two arguments, then print to fp1 and fp2 */
116 void ffprintf_gg(FILE *fp1, FILE *fp2, char *buf, const char *fmt, real arg1, real arg2)
118 sprintf(buf, fmt, arg1, arg2);
119 lo_ffprintf(fp1, fp2, buf);
122 /* prepare buffer with two arguments, then print to fp1 and fp2 */
124 void ffprintf_ss(FILE *fp1, FILE *fp2, char *buf, const char *fmt, const char *arg1, const char *arg2)
126 sprintf(buf, fmt, arg1, arg2);
127 lo_ffprintf(fp1, fp2, buf);
140 void pr_energy(FILE *fp, real e)
142 fprintf(fp, "Energy: %8.4f\n", e);
145 void cp_index(int nn, int from[], int to[])
149 for (i = 0; (i < nn); i++)
155 void mc_optimize(FILE *log, t_mat *m, int maxiter, int *seed, real kT)
157 real e[2], ei, ej, efac;
161 int i, isw, jsw, iisw, jjsw, nn;
163 fprintf(stderr, "\nDoing Monte Carlo clustering\n");
166 cp_index(nn, m->m_ind, low_index);
167 if (getenv("TESTMC"))
169 e[cur] = mat_energy(m);
170 pr_energy(log, e[cur]);
171 fprintf(log, "Doing 1000 random swaps\n");
172 for (i = 0; (i < 1000); i++)
176 isw = nn*rando(seed);
177 jsw = nn*rando(seed);
179 while ((isw == jsw) || (isw >= nn) || (jsw >= nn));
180 iisw = m->m_ind[isw];
181 jjsw = m->m_ind[jsw];
182 m->m_ind[isw] = jjsw;
183 m->m_ind[jsw] = iisw;
186 e[cur] = mat_energy(m);
187 pr_energy(log, e[cur]);
188 for (i = 0; (i < maxiter); i++)
192 isw = nn*rando(seed);
193 jsw = nn*rando(seed);
195 while ((isw == jsw) || (isw >= nn) || (jsw >= nn));
197 iisw = m->m_ind[isw];
198 jjsw = m->m_ind[jsw];
199 ei = row_energy(nn, iisw, m->mat[jsw]);
200 ej = row_energy(nn, jjsw, m->mat[isw]);
202 e[next] = e[cur] + (ei+ej-EROW(m, isw)-EROW(m, jsw))/nn;
204 efac = kT ? exp((e[next]-e[cur])/kT) : -1;
205 if ((e[next] > e[cur]) || (efac > rando(seed)))
208 if (e[next] > e[cur])
210 cp_index(nn, m->m_ind, low_index);
214 fprintf(log, "Taking uphill step\n");
217 /* Now swapping rows */
218 m->m_ind[isw] = jjsw;
219 m->m_ind[jsw] = iisw;
223 fprintf(log, "Iter: %d Swapped %4d and %4d (now %g)",
224 i, isw, jsw, mat_energy(m));
225 pr_energy(log, e[cur]);
228 /* Now restore the highest energy index */
229 cp_index(nn, low_index, m->m_ind);
232 static void calc_dist(int nind, rvec x[], real **d)
238 for (i = 0; (i < nind-1); i++)
241 for (j = i+1; (j < nind); j++)
243 /* Should use pbc_dx when analysing multiple molecueles,
244 * but the box is not stored for every frame.
246 rvec_sub(xi, x[j], dx);
252 static real rms_dist(int isize, real **d, real **d_r)
258 for (i = 0; (i < isize-1); i++)
260 for (j = i+1; (j < isize); j++)
262 r = d[i][j]-d_r[i][j];
266 r2 /= (isize*(isize-1))/2;
271 static int rms_dist_comp(const void *a, const void *b)
278 if (da->dist - db->dist < 0)
282 else if (da->dist - db->dist > 0)
289 static int clust_id_comp(const void *a, const void *b)
296 return da->clust - db->clust;
299 static int nrnb_comp(const void *a, const void *b)
306 /* return the b-a, we want highest first */
307 return db->nr - da->nr;
310 void gather(t_mat *m, real cutoff, t_clusters *clust)
314 int i, j, k, nn, cid, n1, diff;
317 /* First we sort the entries in the RMSD matrix */
321 for (i = k = 0; (i < n1); i++)
323 for (j = i+1; (j < n1); j++, k++)
327 d[k].dist = m->mat[i][j];
332 gmx_incons("gather algortihm");
334 qsort(d, nn, sizeof(d[0]), rms_dist_comp);
336 /* Now we make a cluster index for all of the conformations */
339 /* Now we check the closest structures, and equalize their cluster numbers */
340 fprintf(stderr, "Linking structures ");
343 fprintf(stderr, "*");
345 for (k = 0; (k < nn) && (d[k].dist < cutoff); k++)
347 diff = c[d[k].j].clust - c[d[k].i].clust;
353 c[d[k].j].clust = c[d[k].i].clust;
357 c[d[k].i].clust = c[d[k].j].clust;
363 fprintf(stderr, "\nSorting and renumbering clusters\n");
364 /* Sort on cluster number */
365 qsort(c, n1, sizeof(c[0]), clust_id_comp);
367 /* Renumber clusters */
369 for (k = 1; k < n1; k++)
371 if (c[k].clust != c[k-1].clust)
384 for (k = 0; (k < n1); k++)
386 fprintf(debug, "Cluster index for conformation %d: %d\n",
387 c[k].conf, c[k].clust);
391 for (k = 0; k < n1; k++)
393 clust->cl[c[k].conf] = c[k].clust;
400 gmx_bool jp_same(int **nnb, int i, int j, int P)
406 for (k = 0; nnb[i][k] >= 0; k++)
408 bIn = bIn || (nnb[i][k] == j);
416 for (k = 0; nnb[j][k] >= 0; k++)
418 bIn = bIn || (nnb[j][k] == i);
426 for (ii = 0; nnb[i][ii] >= 0; ii++)
428 for (jj = 0; nnb[j][jj] >= 0; jj++)
430 if ((nnb[i][ii] == nnb[j][jj]) && (nnb[i][ii] != -1))
440 static void jarvis_patrick(int n1, real **mat, int M, int P,
441 real rmsdcut, t_clusters *clust)
446 int i, j, k, cid, diff, max;
455 /* First we sort the entries in the RMSD matrix row by row.
456 * This gives us the nearest neighbor list.
460 for (i = 0; (i < n1); i++)
462 for (j = 0; (j < n1); j++)
465 row[j].dist = mat[i][j];
467 qsort(row, n1, sizeof(row[0]), rms_dist_comp);
470 /* Put the M nearest neighbors in the list */
472 for (j = k = 0; (k < M) && (j < n1) && (mat[i][row[j].j] < rmsdcut); j++)
476 nnb[i][k] = row[j].j;
484 /* Put all neighbors nearer than rmsdcut in the list */
487 for (j = 0; (j < n1) && (mat[i][row[j].j] < rmsdcut); j++)
496 nnb[i][k] = row[j].j;
502 srenew(nnb[i], max+1);
510 fprintf(debug, "Nearest neighborlist. M = %d, P = %d\n", M, P);
511 for (i = 0; (i < n1); i++)
513 fprintf(debug, "i:%5d nbs:", i);
514 for (j = 0; nnb[i][j] >= 0; j++)
516 fprintf(debug, "%5d[%5.3f]", nnb[i][j], mat[i][nnb[i][j]]);
518 fprintf(debug, "\n");
523 fprintf(stderr, "Linking structures ");
524 /* Use mcpy for temporary storage of booleans */
525 mcpy = mk_matrix(n1, n1, FALSE);
526 for (i = 0; i < n1; i++)
528 for (j = i+1; j < n1; j++)
530 mcpy[i][j] = jp_same(nnb, i, j, P);
535 fprintf(stderr, "*");
537 for (i = 0; i < n1; i++)
539 for (j = i+1; j < n1; j++)
543 diff = c[j].clust - c[i].clust;
549 c[j].clust = c[i].clust;
553 c[i].clust = c[j].clust;
562 fprintf(stderr, "\nSorting and renumbering clusters\n");
563 /* Sort on cluster number */
564 qsort(c, n1, sizeof(c[0]), clust_id_comp);
566 /* Renumber clusters */
568 for (k = 1; k < n1; k++)
570 if (c[k].clust != c[k-1].clust)
582 for (k = 0; k < n1; k++)
584 clust->cl[c[k].conf] = c[k].clust;
588 for (k = 0; (k < n1); k++)
590 fprintf(debug, "Cluster index for conformation %d: %d\n",
591 c[k].conf, c[k].clust);
595 /* Again, I don't see the point in this... (AF) */
596 /* for(i=0; (i<n1); i++) { */
597 /* for(j=0; (j<n1); j++) */
598 /* mcpy[c[i].conf][c[j].conf] = mat[i][j]; */
600 /* for(i=0; (i<n1); i++) { */
601 /* for(j=0; (j<n1); j++) */
602 /* mat[i][j] = mcpy[i][j]; */
604 done_matrix(n1, &mcpy);
607 for (i = 0; (i < n1); i++)
614 static void dump_nnb (FILE *fp, const char *title, int n1, t_nnb *nnb)
618 /* dump neighbor list */
619 fprintf(fp, "%s", title);
620 for (i = 0; (i < n1); i++)
622 fprintf(fp, "i:%5d #:%5d nbs:", i, nnb[i].nr);
623 for (j = 0; j < nnb[i].nr; j++)
625 fprintf(fp, "%5d", nnb[i].nb[j]);
631 static void gromos(int n1, real **mat, real rmsdcut, t_clusters *clust)
635 int i, j, k, j1, max;
637 /* Put all neighbors nearer than rmsdcut in the list */
638 fprintf(stderr, "Making list of neighbors within cutoff ");
641 for (i = 0; (i < n1); i++)
645 /* put all neighbors within cut-off in list */
646 for (j = 0; j < n1; j++)
648 if (mat[i][j] < rmsdcut)
653 srenew(nnb[i].nb, max);
659 /* store nr of neighbors, we'll need that */
661 if (i%(1+n1/100) == 0)
663 fprintf(stderr, "%3d%%\b\b\b\b", (i*100+1)/n1);
666 fprintf(stderr, "%3d%%\n", 100);
669 /* sort neighbor list on number of neighbors, largest first */
670 qsort(nnb, n1, sizeof(nnb[0]), nrnb_comp);
674 dump_nnb(debug, "Nearest neighborlist after sort.\n", n1, nnb);
677 /* turn first structure with all its neighbors (largest) into cluster
678 remove them from pool of structures and repeat for all remaining */
679 fprintf(stderr, "Finding clusters %4d", 0);
680 /* cluster id's start at 1: */
684 /* set cluster id (k) for first item in neighborlist */
685 for (j = 0; j < nnb[0].nr; j++)
687 clust->cl[nnb[0].nb[j]] = k;
693 /* adjust number of neighbors for others, taking removals into account: */
694 for (i = 1; i < n1 && nnb[i].nr; i++)
697 for (j = 0; j < nnb[i].nr; j++)
699 /* if this neighbor wasn't removed */
700 if (clust->cl[nnb[i].nb[j]] == 0)
702 /* shift the rest (j1<=j) */
703 nnb[i].nb[j1] = nnb[i].nb[j];
708 /* now j1 is the new number of neighbors */
711 /* sort again on nnb[].nr, because we have new # neighbors: */
712 /* but we only need to sort upto i, i.e. when nnb[].nr>0 */
713 qsort(nnb, i, sizeof(nnb[0]), nrnb_comp);
715 fprintf(stderr, "\b\b\b\b%4d", k);
719 fprintf(stderr, "\n");
723 fprintf(debug, "Clusters (%d):\n", k);
724 for (i = 0; i < n1; i++)
726 fprintf(debug, " %3d", clust->cl[i]);
728 fprintf(debug, "\n");
734 rvec **read_whole_trj(const char *fn, int isize, atom_id index[], int skip,
735 int *nframe, real **time, const output_env_t oenv, gmx_bool bPBC, gmx_rmpbc_t gpbc)
740 int i, i0, j, max_nf;
748 natom = read_first_x(oenv, &status, fn, &t, &x, box);
755 gmx_rmpbc(gpbc, natom, box, x);
761 srenew(*time, max_nf);
766 /* Store only the interesting atoms */
767 for (j = 0; (j < isize); j++)
769 copy_rvec(x[index[j]], xx[i0][j]);
776 while (read_next_x(oenv, status, &t, natom, x, box));
777 fprintf(stderr, "Allocated %lu bytes for frames\n",
778 (unsigned long) (max_nf*isize*sizeof(**xx)));
779 fprintf(stderr, "Read %d frames from trajectory %s\n", i0, fn);
786 static int plot_clusters(int nf, real **mat, t_clusters *clust,
787 int nlevels, int minstruct)
789 int i, j, ncluster, ci;
790 int *cl_id, *nstruct, *strind;
795 for (i = 0; i < nf; i++)
798 cl_id[i] = clust->cl[i];
802 for (i = 0; i < nf; i++)
804 if (nstruct[i] >= minstruct)
807 for (j = 0; (j < nf); j++)
811 strind[j] = ncluster;
817 fprintf(stderr, "There are %d clusters with at least %d conformations\n",
818 ncluster, minstruct);
820 for (i = 0; (i < nf); i++)
823 for (j = 0; j < i; j++)
825 if ((ci == cl_id[j]) && (nstruct[ci] >= minstruct))
827 /* color different clusters with different colors, as long as
828 we don't run out of colors */
829 mat[i][j] = strind[i];
844 static void mark_clusters(int nf, real **mat, real val, t_clusters *clust)
848 for (i = 0; i < nf; i++)
850 for (j = 0; j < i; j++)
852 if (clust->cl[i] == clust->cl[j])
864 static char *parse_filename(const char *fn, int maxnr)
873 gmx_fatal(FARGS, "will not number filename %s containing '%c'", fn, '%');
875 /* number of digits needed in numbering */
876 i = (int)(log(maxnr)/log(10)) + 1;
877 /* split fn and ext */
878 ext = strrchr(fn, '.');
881 gmx_fatal(FARGS, "cannot separate extension in filename %s", fn);
884 /* insert e.g. '%03d' between fn and ext */
885 sprintf(buf, "%s%%0%dd.%s", fn, i, ext);
886 snew(fnout, strlen(buf)+1);
892 static void ana_trans(t_clusters *clust, int nf,
893 const char *transfn, const char *ntransfn, FILE *log,
894 t_rgb rlo, t_rgb rhi, const output_env_t oenv)
899 int i, ntranst, maxtrans;
902 snew(ntrans, clust->ncl);
903 snew(trans, clust->ncl);
904 snew(axis, clust->ncl);
905 for (i = 0; i < clust->ncl; i++)
908 snew(trans[i], clust->ncl);
912 for (i = 1; i < nf; i++)
914 if (clust->cl[i] != clust->cl[i-1])
917 ntrans[clust->cl[i-1]-1]++;
918 ntrans[clust->cl[i]-1]++;
919 trans[clust->cl[i-1]-1][clust->cl[i]-1]++;
920 maxtrans = max(maxtrans, trans[clust->cl[i]-1][clust->cl[i-1]-1]);
923 ffprintf_dd(stderr, log, buf, "Counted %d transitions in total, "
924 "max %d between two specific clusters\n", ntranst, maxtrans);
927 fp = ffopen(transfn, "w");
928 i = min(maxtrans+1, 80);
929 write_xpm(fp, 0, "Cluster Transitions", "# transitions",
930 "from cluster", "to cluster",
931 clust->ncl, clust->ncl, axis, axis, trans,
932 0, maxtrans, rlo, rhi, &i);
937 fp = xvgropen(ntransfn, "Cluster Transitions", "Cluster #", "# transitions",
939 for (i = 0; i < clust->ncl; i++)
941 fprintf(fp, "%5d %5d\n", i+1, ntrans[i]);
946 for (i = 0; i < clust->ncl; i++)
954 static void analyze_clusters(int nf, t_clusters *clust, real **rmsd,
955 int natom, t_atoms *atoms, rvec *xtps,
956 real *mass, rvec **xx, real *time,
957 int ifsize, atom_id *fitidx,
958 int iosize, atom_id *outidx,
959 const char *trxfn, const char *sizefn,
960 const char *transfn, const char *ntransfn,
961 const char *clustidfn, gmx_bool bAverage,
962 int write_ncl, int write_nst, real rmsmin,
963 gmx_bool bFit, FILE *log, t_rgb rlo, t_rgb rhi,
964 const output_env_t oenv)
967 char buf[STRLEN], buf1[40], buf2[40], buf3[40], *trxsfn;
968 t_trxstatus *trxout = NULL;
969 t_trxstatus *trxsout = NULL;
970 int i, i1, cl, nstr, *structure, first = 0, midstr;
971 gmx_bool *bWrite = NULL;
972 real r, clrmsd, midrmsd;
978 ffprintf_d(stderr, log, buf, "\nFound %d clusters\n\n", clust->ncl);
982 /* do we write all structures? */
985 trxsfn = parse_filename(trxfn, max(write_ncl, clust->ncl));
988 ffprintf_ss(stderr, log, buf, "Writing %s structure for each cluster to %s\n",
989 bAverage ? "average" : "middle", trxfn);
992 /* find out what we want to tell the user:
993 Writing [all structures|structures with rmsd > %g] for
994 {all|first %d} clusters {with more than %d structures} to %s */
997 sprintf(buf1, "structures with rmsd > %g", rmsmin);
1001 sprintf(buf1, "all structures");
1003 buf2[0] = buf3[0] = '\0';
1004 if (write_ncl >= clust->ncl)
1008 sprintf(buf2, "all ");
1013 sprintf(buf2, "the first %d ", write_ncl);
1017 sprintf(buf3, " with more than %d structures", write_nst);
1019 sprintf(buf, "Writing %s for %sclusters%s to %s\n", buf1, buf2, buf3, trxsfn);
1020 ffprintf(stderr, log, buf);
1023 /* Prepare a reference structure for the orientation of the clusters */
1026 reset_x(ifsize, fitidx, natom, NULL, xtps, mass);
1028 trxout = open_trx(trxfn, "w");
1029 /* Calculate the average structure in each cluster, *
1030 * all structures are fitted to the first struture of the cluster */
1034 if (transfn || ntransfn)
1036 ana_trans(clust, nf, transfn, ntransfn, log, rlo, rhi, oenv);
1041 fp = xvgropen(clustidfn, "Clusters", output_env_get_xvgr_tlabel(oenv), "Cluster #", oenv);
1042 fprintf(fp, "@ s0 symbol 2\n");
1043 fprintf(fp, "@ s0 symbol size 0.2\n");
1044 fprintf(fp, "@ s0 linestyle 0\n");
1045 for (i = 0; i < nf; i++)
1047 fprintf(fp, "%8g %8d\n", time[i], clust->cl[i]);
1053 fp = xvgropen(sizefn, "Cluster Sizes", "Cluster #", "# Structures", oenv);
1054 fprintf(fp, "@g%d type %s\n", 0, "bar");
1056 snew(structure, nf);
1057 fprintf(log, "\n%3s | %3s %4s | %6s %4s | cluster members\n",
1058 "cl.", "#st", "rmsd", "middle", "rmsd");
1059 for (cl = 1; cl <= clust->ncl; cl++)
1061 /* prepare structures (fit, middle, average) */
1064 for (i = 0; i < natom; i++)
1070 for (i1 = 0; i1 < nf; i1++)
1072 if (clust->cl[i1] == cl)
1074 structure[nstr] = i1;
1076 if (trxfn && (bAverage || write_ncl) )
1080 reset_x(ifsize, fitidx, natom, NULL, xx[i1], mass);
1088 do_fit(natom, mass, xx[first], xx[i1]);
1092 for (i = 0; i < natom; i++)
1094 rvec_inc(xav[i], xx[i1][i]);
1102 fprintf(fp, "%8d %8d\n", cl, nstr);
1107 for (i1 = 0; i1 < nstr; i1++)
1112 for (i = 0; i < nstr; i++)
1116 r += rmsd[structure[i]][structure[i1]];
1120 r += rmsd[structure[i1]][structure[i]];
1127 midstr = structure[i1];
1134 /* dump cluster info to logfile */
1137 sprintf(buf1, "%6.3f", clrmsd);
1142 sprintf(buf2, "%5.3f", midrmsd);
1150 sprintf(buf1, "%5s", "");
1151 sprintf(buf2, "%5s", "");
1153 fprintf(log, "%3d | %3d %s | %6g%s |", cl, nstr, buf1, time[midstr], buf2);
1154 for (i = 0; i < nstr; i++)
1156 if ((i % 7 == 0) && i)
1158 sprintf(buf, "\n%3s | %3s %4s | %6s %4s |", "", "", "", "", "");
1165 fprintf(log, "%s %6g", buf, time[i1]);
1169 /* write structures to trajectory file(s) */
1174 for (i = 0; i < nstr; i++)
1179 if (cl < write_ncl+1 && nstr > write_nst)
1181 /* Dump all structures for this cluster */
1182 /* generate numbered filename (there is a %d in trxfn!) */
1183 sprintf(buf, trxsfn, cl);
1184 trxsout = open_trx(buf, "w");
1185 for (i = 0; i < nstr; i++)
1190 for (i1 = 0; i1 < i && bWrite[i]; i1++)
1194 bWrite[i] = rmsd[structure[i1]][structure[i]] > rmsmin;
1200 write_trx(trxsout, iosize, outidx, atoms, i, time[structure[i]], zerobox,
1201 xx[structure[i]], NULL, NULL);
1206 /* Dump the average structure for this cluster */
1209 for (i = 0; i < natom; i++)
1211 svmul(1.0/nstr, xav[i], xav[i]);
1216 for (i = 0; i < natom; i++)
1218 copy_rvec(xx[midstr][i], xav[i]);
1222 reset_x(ifsize, fitidx, natom, NULL, xav, mass);
1227 do_fit(natom, mass, xtps, xav);
1230 write_trx(trxout, iosize, outidx, atoms, cl, time[midstr], zerobox, xav, NULL, NULL);
1250 static void convert_mat(t_matrix *mat, t_mat *rms)
1255 matrix2real(mat, rms->mat);
1256 /* free input xpm matrix data */
1257 for (i = 0; i < mat->nx; i++)
1259 sfree(mat->matrix[i]);
1263 for (i = 0; i < mat->nx; i++)
1265 for (j = i; j < mat->nx; j++)
1267 rms->sumrms += rms->mat[i][j];
1268 rms->maxrms = max(rms->maxrms, rms->mat[i][j]);
1271 rms->minrms = min(rms->minrms, rms->mat[i][j]);
1278 int gmx_cluster(int argc, char *argv[])
1280 const char *desc[] = {
1281 "[TT]g_cluster[tt] can cluster structures using several different methods.",
1282 "Distances between structures can be determined from a trajectory",
1283 "or read from an [TT].xpm[tt] matrix file with the [TT]-dm[tt] option.",
1284 "RMS deviation after fitting or RMS deviation of atom-pair distances",
1285 "can be used to define the distance between structures.[PAR]",
1287 "single linkage: add a structure to a cluster when its distance to any",
1288 "element of the cluster is less than [TT]cutoff[tt].[PAR]",
1290 "Jarvis Patrick: add a structure to a cluster when this structure",
1291 "and a structure in the cluster have each other as neighbors and",
1292 "they have a least [TT]P[tt] neighbors in common. The neighbors",
1293 "of a structure are the M closest structures or all structures within",
1294 "[TT]cutoff[tt].[PAR]",
1296 "Monte Carlo: reorder the RMSD matrix using Monte Carlo.[PAR]",
1298 "diagonalization: diagonalize the RMSD matrix.[PAR]",
1300 "gromos: use algorithm as described in Daura [IT]et al.[it]",
1301 "([IT]Angew. Chem. Int. Ed.[it] [BB]1999[bb], [IT]38[it], pp 236-240).",
1302 "Count number of neighbors using cut-off, take structure with",
1303 "largest number of neighbors with all its neighbors as cluster",
1304 "and eliminate it from the pool of clusters. Repeat for remaining",
1305 "structures in pool.[PAR]",
1307 "When the clustering algorithm assigns each structure to exactly one",
1308 "cluster (single linkage, Jarvis Patrick and gromos) and a trajectory",
1309 "file is supplied, the structure with",
1310 "the smallest average distance to the others or the average structure",
1311 "or all structures for each cluster will be written to a trajectory",
1312 "file. When writing all structures, separate numbered files are made",
1313 "for each cluster.[PAR]",
1315 "Two output files are always written:[BR]",
1316 "[TT]-o[tt] writes the RMSD values in the upper left half of the matrix",
1317 "and a graphical depiction of the clusters in the lower right half",
1318 "When [TT]-minstruct[tt] = 1 the graphical depiction is black",
1319 "when two structures are in the same cluster.",
1320 "When [TT]-minstruct[tt] > 1 different colors will be used for each",
1322 "[TT]-g[tt] writes information on the options used and a detailed list",
1323 "of all clusters and their members.[PAR]",
1325 "Additionally, a number of optional output files can be written:[BR]",
1326 "[TT]-dist[tt] writes the RMSD distribution.[BR]",
1327 "[TT]-ev[tt] writes the eigenvectors of the RMSD matrix",
1328 "diagonalization.[BR]",
1329 "[TT]-sz[tt] writes the cluster sizes.[BR]",
1330 "[TT]-tr[tt] writes a matrix of the number transitions between",
1331 "cluster pairs.[BR]",
1332 "[TT]-ntr[tt] writes the total number of transitions to or from",
1333 "each cluster.[BR]",
1334 "[TT]-clid[tt] writes the cluster number as a function of time.[BR]",
1335 "[TT]-cl[tt] writes average (with option [TT]-av[tt]) or central",
1336 "structure of each cluster or writes numbered files with cluster members",
1337 "for a selected set of clusters (with option [TT]-wcl[tt], depends on",
1338 "[TT]-nst[tt] and [TT]-rmsmin[tt]). The center of a cluster is the",
1339 "structure with the smallest average RMSD from all other structures",
1340 "of the cluster.[BR]",
1344 int nf, i, i1, i2, j;
1345 gmx_large_int_t nrms = 0;
1348 rvec *xtps, *usextps, *x1, **xx = NULL;
1349 const char *fn, *trx_out_fn;
1356 t_matrix *readmat = NULL;
1359 int isize = 0, ifsize = 0, iosize = 0;
1360 atom_id *index = NULL, *fitidx, *outidx;
1362 real rmsd, **d1, **d2, *time = NULL, time_invfac, *mass = NULL;
1363 char buf[STRLEN], buf1[80], title[STRLEN];
1364 gmx_bool bAnalyze, bUseRmsdCut, bJP_RMSD = FALSE, bReadMat, bReadTraj, bPBC = TRUE;
1366 int method, ncluster = 0;
1367 static const char *methodname[] = {
1368 NULL, "linkage", "jarvis-patrick", "monte-carlo",
1369 "diagonalization", "gromos", NULL
1372 m_null, m_linkage, m_jarvis_patrick,
1373 m_monte_carlo, m_diagonalize, m_gromos, m_nr
1375 /* Set colors for plotting: white = zero RMS, black = maximum */
1376 static t_rgb rlo_top = { 1.0, 1.0, 1.0 };
1377 static t_rgb rhi_top = { 0.0, 0.0, 0.0 };
1378 static t_rgb rlo_bot = { 1.0, 1.0, 1.0 };
1379 static t_rgb rhi_bot = { 0.0, 0.0, 1.0 };
1380 static int nlevels = 40, skip = 1;
1381 static real scalemax = -1.0, rmsdcut = 0.1, rmsmin = 0.0;
1382 gmx_bool bRMSdist = FALSE, bBinary = FALSE, bAverage = FALSE, bFit = TRUE;
1383 static int niter = 10000, seed = 1993, write_ncl = 0, write_nst = 1, minstruct = 1;
1384 static real kT = 1e-3;
1385 static int M = 10, P = 3;
1387 gmx_rmpbc_t gpbc = NULL;
1390 { "-dista", FALSE, etBOOL, {&bRMSdist},
1391 "Use RMSD of distances instead of RMS deviation" },
1392 { "-nlevels", FALSE, etINT, {&nlevels},
1393 "Discretize RMSD matrix in this number of levels" },
1394 { "-cutoff", FALSE, etREAL, {&rmsdcut},
1395 "RMSD cut-off (nm) for two structures to be neighbor" },
1396 { "-fit", FALSE, etBOOL, {&bFit},
1397 "Use least squares fitting before RMSD calculation" },
1398 { "-max", FALSE, etREAL, {&scalemax},
1399 "Maximum level in RMSD matrix" },
1400 { "-skip", FALSE, etINT, {&skip},
1401 "Only analyze every nr-th frame" },
1402 { "-av", FALSE, etBOOL, {&bAverage},
1403 "Write average iso middle structure for each cluster" },
1404 { "-wcl", FALSE, etINT, {&write_ncl},
1405 "Write the structures for this number of clusters to numbered files" },
1406 { "-nst", FALSE, etINT, {&write_nst},
1407 "Only write all structures if more than this number of structures per cluster" },
1408 { "-rmsmin", FALSE, etREAL, {&rmsmin},
1409 "minimum rms difference with rest of cluster for writing structures" },
1410 { "-method", FALSE, etENUM, {methodname},
1411 "Method for cluster determination" },
1412 { "-minstruct", FALSE, etINT, {&minstruct},
1413 "Minimum number of structures in cluster for coloring in the [TT].xpm[tt] file" },
1414 { "-binary", FALSE, etBOOL, {&bBinary},
1415 "Treat the RMSD matrix as consisting of 0 and 1, where the cut-off "
1416 "is given by [TT]-cutoff[tt]" },
1417 { "-M", FALSE, etINT, {&M},
1418 "Number of nearest neighbors considered for Jarvis-Patrick algorithm, "
1419 "0 is use cutoff" },
1420 { "-P", FALSE, etINT, {&P},
1421 "Number of identical nearest neighbors required to form a cluster" },
1422 { "-seed", FALSE, etINT, {&seed},
1423 "Random number seed for Monte Carlo clustering algorithm" },
1424 { "-niter", FALSE, etINT, {&niter},
1425 "Number of iterations for MC" },
1426 { "-kT", FALSE, etREAL, {&kT},
1427 "Boltzmann weighting factor for Monte Carlo optimization "
1428 "(zero turns off uphill steps)" },
1429 { "-pbc", FALSE, etBOOL,
1430 { &bPBC }, "PBC check" }
1433 { efTRX, "-f", NULL, ffOPTRD },
1434 { efTPS, "-s", NULL, ffOPTRD },
1435 { efNDX, NULL, NULL, ffOPTRD },
1436 { efXPM, "-dm", "rmsd", ffOPTRD },
1437 { efXPM, "-o", "rmsd-clust", ffWRITE },
1438 { efLOG, "-g", "cluster", ffWRITE },
1439 { efXVG, "-dist", "rmsd-dist", ffOPTWR },
1440 { efXVG, "-ev", "rmsd-eig", ffOPTWR },
1441 { efXVG, "-sz", "clust-size", ffOPTWR},
1442 { efXPM, "-tr", "clust-trans", ffOPTWR},
1443 { efXVG, "-ntr", "clust-trans", ffOPTWR},
1444 { efXVG, "-clid", "clust-id.xvg", ffOPTWR},
1445 { efTRX, "-cl", "clusters.pdb", ffOPTWR }
1447 #define NFILE asize(fnm)
1449 CopyRight(stderr, argv[0]);
1450 parse_common_args(&argc, argv,
1451 PCA_CAN_VIEW | PCA_CAN_TIME | PCA_TIME_UNIT | PCA_BE_NICE,
1452 NFILE, fnm, asize(pa), pa, asize(desc), desc, 0, NULL,
1456 bReadMat = opt2bSet("-dm", NFILE, fnm);
1457 bReadTraj = opt2bSet("-f", NFILE, fnm) || !bReadMat;
1458 if (opt2parg_bSet("-av", asize(pa), pa) ||
1459 opt2parg_bSet("-wcl", asize(pa), pa) ||
1460 opt2parg_bSet("-nst", asize(pa), pa) ||
1461 opt2parg_bSet("-rmsmin", asize(pa), pa) ||
1462 opt2bSet("-cl", NFILE, fnm) )
1464 trx_out_fn = opt2fn("-cl", NFILE, fnm);
1470 if (bReadMat && output_env_get_time_factor(oenv) != 1)
1473 "\nWarning: assuming the time unit in %s is %s\n",
1474 opt2fn("-dm", NFILE, fnm), output_env_get_time_unit(oenv));
1476 if (trx_out_fn && !bReadTraj)
1478 fprintf(stderr, "\nWarning: "
1479 "cannot write cluster structures without reading trajectory\n"
1480 " ignoring option -cl %s\n", trx_out_fn);
1484 while (method < m_nr && gmx_strcasecmp(methodname[0], methodname[method]) != 0)
1490 gmx_fatal(FARGS, "Invalid method");
1493 bAnalyze = (method == m_linkage || method == m_jarvis_patrick ||
1494 method == m_gromos );
1497 log = ftp2FILE(efLOG, NFILE, fnm, "w");
1499 fprintf(stderr, "Using %s method for clustering\n", methodname[0]);
1500 fprintf(log, "Using %s method for clustering\n", methodname[0]);
1502 /* check input and write parameters to log file */
1503 bUseRmsdCut = FALSE;
1504 if (method == m_jarvis_patrick)
1506 bJP_RMSD = (M == 0) || opt2parg_bSet("-cutoff", asize(pa), pa);
1507 if ((M < 0) || (M == 1))
1509 gmx_fatal(FARGS, "M (%d) must be 0 or larger than 1", M);
1513 sprintf(buf1, "Will use P=%d and RMSD cutoff (%g)", P, rmsdcut);
1520 gmx_fatal(FARGS, "Number of neighbors required (P) must be less than M");
1524 sprintf(buf1, "Will use P=%d, M=%d and RMSD cutoff (%g)", P, M, rmsdcut);
1529 sprintf(buf1, "Will use P=%d, M=%d", P, M);
1532 ffprintf_s(stderr, log, buf, "%s for determining the neighbors\n\n", buf1);
1534 else /* method != m_jarvis */
1536 bUseRmsdCut = ( bBinary || method == m_linkage || method == m_gromos );
1538 if (bUseRmsdCut && method != m_jarvis_patrick)
1540 fprintf(log, "Using RMSD cutoff %g nm\n", rmsdcut);
1542 if (method == m_monte_carlo)
1544 fprintf(log, "Using %d iterations\n", niter);
1549 gmx_fatal(FARGS, "skip (%d) should be >= 1", skip);
1555 /* don't read mass-database as masses (and top) are not used */
1556 read_tps_conf(ftp2fn(efTPS, NFILE, fnm), buf, &top, &ePBC, &xtps, NULL, box,
1560 gpbc = gmx_rmpbc_init(&top.idef, ePBC, top.atoms.nr, box);
1563 fprintf(stderr, "\nSelect group for least squares fit%s:\n",
1564 bReadMat ? "" : " and RMSD calculation");
1565 get_index(&(top.atoms), ftp2fn_null(efNDX, NFILE, fnm),
1566 1, &ifsize, &fitidx, &grpname);
1569 fprintf(stderr, "\nSelect group for output:\n");
1570 get_index(&(top.atoms), ftp2fn_null(efNDX, NFILE, fnm),
1571 1, &iosize, &outidx, &grpname);
1572 /* merge and convert both index groups: */
1573 /* first copy outidx to index. let outidx refer to elements in index */
1574 snew(index, iosize);
1576 for (i = 0; i < iosize; i++)
1578 index[i] = outidx[i];
1581 /* now lookup elements from fitidx in index, add them if necessary
1582 and also let fitidx refer to elements in index */
1583 for (i = 0; i < ifsize; i++)
1586 while (j < isize && index[j] != fitidx[i])
1592 /* slow this way, but doesn't matter much */
1594 srenew(index, isize);
1596 index[j] = fitidx[i];
1600 else /* !trx_out_fn */
1604 for (i = 0; i < ifsize; i++)
1606 index[i] = fitidx[i];
1614 /* Loop over first coordinate file */
1615 fn = opt2fn("-f", NFILE, fnm);
1617 xx = read_whole_trj(fn, isize, index, skip, &nf, &time, oenv, bPBC, gpbc);
1618 output_env_conv_times(oenv, nf, time);
1619 if (!bRMSdist || bAnalyze)
1621 /* Center all frames on zero */
1623 for (i = 0; i < ifsize; i++)
1625 mass[fitidx[i]] = top.atoms.atom[index[fitidx[i]]].m;
1629 for (i = 0; i < nf; i++)
1631 reset_x(ifsize, fitidx, isize, NULL, xx[i], mass);
1637 gmx_rmpbc_done(gpbc);
1643 fprintf(stderr, "Reading rms distance matrix ");
1644 read_xpm_matrix(opt2fn("-dm", NFILE, fnm), &readmat);
1645 fprintf(stderr, "\n");
1646 if (readmat[0].nx != readmat[0].ny)
1648 gmx_fatal(FARGS, "Matrix (%dx%d) is not square",
1649 readmat[0].nx, readmat[0].ny);
1651 if (bReadTraj && bAnalyze && (readmat[0].nx != nf))
1653 gmx_fatal(FARGS, "Matrix size (%dx%d) does not match the number of "
1654 "frames (%d)", readmat[0].nx, readmat[0].ny, nf);
1659 time = readmat[0].axis_x;
1660 time_invfac = output_env_get_time_invfactor(oenv);
1661 for (i = 0; i < nf; i++)
1663 time[i] *= time_invfac;
1666 rms = init_mat(readmat[0].nx, method == m_diagonalize);
1667 convert_mat(&(readmat[0]), rms);
1669 nlevels = readmat[0].nmap;
1671 else /* !bReadMat */
1673 rms = init_mat(nf, method == m_diagonalize);
1674 nrms = ((gmx_large_int_t)nf*((gmx_large_int_t)nf-1))/2;
1677 fprintf(stderr, "Computing %dx%d RMS deviation matrix\n", nf, nf);
1678 /* Initialize work array */
1680 for (i1 = 0; i1 < nf; i1++)
1682 for (i2 = i1+1; i2 < nf; i2++)
1684 for (i = 0; i < isize; i++)
1686 copy_rvec(xx[i1][i], x1[i]);
1690 do_fit(isize, mass, xx[i2], x1);
1692 rmsd = rmsdev(isize, mass, xx[i2], x1);
1693 set_mat_entry(rms, i1, i2, rmsd);
1695 nrms -= (gmx_large_int_t) (nf-i1-1);
1696 fprintf(stderr, "\r# RMSD calculations left: "gmx_large_int_pfmt" ", nrms);
1702 fprintf(stderr, "Computing %dx%d RMS distance deviation matrix\n", nf, nf);
1704 /* Initiate work arrays */
1707 for (i = 0; (i < isize); i++)
1712 for (i1 = 0; i1 < nf ; i1++)
1714 calc_dist(isize, xx[i1], d1);
1715 for (i2 = i1+1; (i2 < nf); i2++)
1717 calc_dist(isize, xx[i2], d2);
1718 set_mat_entry(rms, i1, i2, rms_dist(isize, d1, d2));
1721 fprintf(stderr, "\r# RMSD calculations left: "gmx_large_int_pfmt" ", nrms);
1723 /* Clean up work arrays */
1724 for (i = 0; (i < isize); i++)
1732 fprintf(stderr, "\n\n");
1734 ffprintf_gg(stderr, log, buf, "The RMSD ranges from %g to %g nm\n",
1735 rms->minrms, rms->maxrms);
1736 ffprintf_g(stderr, log, buf, "Average RMSD is %g\n", 2*rms->sumrms/(nf*(nf-1)));
1737 ffprintf_d(stderr, log, buf, "Number of structures for matrix %d\n", nf);
1738 ffprintf_g(stderr, log, buf, "Energy of the matrix is %g nm\n", mat_energy(rms));
1739 if (bUseRmsdCut && (rmsdcut < rms->minrms || rmsdcut > rms->maxrms) )
1741 fprintf(stderr, "WARNING: rmsd cutoff %g is outside range of rmsd values "
1742 "%g to %g\n", rmsdcut, rms->minrms, rms->maxrms);
1744 if (bAnalyze && (rmsmin < rms->minrms) )
1746 fprintf(stderr, "WARNING: rmsd minimum %g is below lowest rmsd value %g\n",
1747 rmsmin, rms->minrms);
1749 if (bAnalyze && (rmsmin > rmsdcut) )
1751 fprintf(stderr, "WARNING: rmsd minimum %g is above rmsd cutoff %g\n",
1755 /* Plot the rmsd distribution */
1756 rmsd_distribution(opt2fn("-dist", NFILE, fnm), rms, oenv);
1760 for (i1 = 0; (i1 < nf); i1++)
1762 for (i2 = 0; (i2 < nf); i2++)
1764 if (rms->mat[i1][i2] < rmsdcut)
1766 rms->mat[i1][i2] = 0;
1770 rms->mat[i1][i2] = 1;
1780 /* Now sort the matrix and write it out again */
1781 gather(rms, rmsdcut, &clust);
1784 /* Do a diagonalization */
1785 snew(eigenvalues, nf);
1786 snew(eigenvectors, nf*nf);
1787 memcpy(eigenvectors, rms->mat[0], nf*nf*sizeof(real));
1788 eigensolver(eigenvectors, nf, 0, nf, eigenvalues, rms->mat[0]);
1789 sfree(eigenvectors);
1791 fp = xvgropen(opt2fn("-ev", NFILE, fnm), "RMSD matrix Eigenvalues",
1792 "Eigenvector index", "Eigenvalues (nm\\S2\\N)", oenv);
1793 for (i = 0; (i < nf); i++)
1795 fprintf(fp, "%10d %10g\n", i, eigenvalues[i]);
1800 mc_optimize(log, rms, niter, &seed, kT);
1804 case m_jarvis_patrick:
1805 jarvis_patrick(rms->nn, rms->mat, M, P, bJP_RMSD ? rmsdcut : -1, &clust);
1808 gromos(rms->nn, rms->mat, rmsdcut, &clust);
1811 gmx_fatal(FARGS, "DEATH HORROR unknown method \"%s\"", methodname[0]);
1814 if (method == m_monte_carlo || method == m_diagonalize)
1816 fprintf(stderr, "Energy of the matrix after clustering is %g nm\n",
1824 ncluster = plot_clusters(nf, rms->mat, &clust, nlevels, minstruct);
1828 mark_clusters(nf, rms->mat, rms->maxrms, &clust);
1830 init_t_atoms(&useatoms, isize, FALSE);
1831 snew(usextps, isize);
1832 useatoms.resinfo = top.atoms.resinfo;
1833 for (i = 0; i < isize; i++)
1835 useatoms.atomname[i] = top.atoms.atomname[index[i]];
1836 useatoms.atom[i].resind = top.atoms.atom[index[i]].resind;
1837 useatoms.nres = max(useatoms.nres, useatoms.atom[i].resind+1);
1838 copy_rvec(xtps[index[i]], usextps[i]);
1840 useatoms.nr = isize;
1841 analyze_clusters(nf, &clust, rms->mat, isize, &useatoms, usextps, mass, xx, time,
1842 ifsize, fitidx, iosize, outidx,
1843 bReadTraj ? trx_out_fn : NULL,
1844 opt2fn_null("-sz", NFILE, fnm),
1845 opt2fn_null("-tr", NFILE, fnm),
1846 opt2fn_null("-ntr", NFILE, fnm),
1847 opt2fn_null("-clid", NFILE, fnm),
1848 bAverage, write_ncl, write_nst, rmsmin, bFit, log,
1849 rlo_bot, rhi_bot, oenv);
1853 if (bBinary && !bAnalyze)
1855 /* Make the clustering visible */
1856 for (i2 = 0; (i2 < nf); i2++)
1858 for (i1 = i2+1; (i1 < nf); i1++)
1860 if (rms->mat[i1][i2])
1862 rms->mat[i1][i2] = rms->maxrms;
1868 fp = opt2FILE("-o", NFILE, fnm, "w");
1869 fprintf(stderr, "Writing rms distance/clustering matrix ");
1872 write_xpm(fp, 0, readmat[0].title, readmat[0].legend, readmat[0].label_x,
1873 readmat[0].label_y, nf, nf, readmat[0].axis_x, readmat[0].axis_y,
1874 rms->mat, 0.0, rms->maxrms, rlo_top, rhi_top, &nlevels);
1878 sprintf(buf, "Time (%s)", output_env_get_time_unit(oenv));
1879 sprintf(title, "RMS%sDeviation / Cluster Index",
1880 bRMSdist ? " Distance " : " ");
1883 write_xpm_split(fp, 0, title, "RMSD (nm)", buf, buf,
1884 nf, nf, time, time, rms->mat, 0.0, rms->maxrms, &nlevels,
1885 rlo_top, rhi_top, 0.0, (real) ncluster,
1886 &ncluster, TRUE, rlo_bot, rhi_bot);
1890 write_xpm(fp, 0, title, "RMSD (nm)", buf, buf,
1891 nf, nf, time, time, rms->mat, 0.0, rms->maxrms,
1892 rlo_top, rhi_top, &nlevels);
1895 fprintf(stderr, "\n");
1898 /* now show what we've done */
1899 do_view(oenv, opt2fn("-o", NFILE, fnm), "-nxy");
1900 do_view(oenv, opt2fn_null("-sz", NFILE, fnm), "-nxy");
1901 if (method == m_diagonalize)
1903 do_view(oenv, opt2fn_null("-ev", NFILE, fnm), "-nxy");
1905 do_view(oenv, opt2fn("-dist", NFILE, fnm), "-nxy");
1908 do_view(oenv, opt2fn_null("-tr", NFILE, fnm), "-nxy");
1909 do_view(oenv, opt2fn_null("-ntr", NFILE, fnm), "-nxy");
1910 do_view(oenv, opt2fn_null("-clid", NFILE, fnm), "-nxy");
1913 /* Thank the user for her patience */