1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- */
3 * This file is part of the GROMACS molecular simulation package.
5 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
6 * Copyright (c) 2001-2004, The GROMACS development team,
7 * check out http://www.gromacs.org for more information.
8 * Copyright (c) 2012, by the GROMACS development team, led by
9 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
10 * others, as listed in the AUTHORS file in the top-level source
11 * directory and at http://www.gromacs.org.
13 * GROMACS is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU Lesser General Public License
15 * as published by the Free Software Foundation; either version 2.1
16 * of the License, or (at your option) any later version.
18 * GROMACS is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * Lesser General Public License for more details.
23 * You should have received a copy of the GNU Lesser General Public
24 * License along with GROMACS; if not, see
25 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
26 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
28 * If you want to redistribute modifications to GROMACS, please
29 * consider that scientific software is very special. Version
30 * control is crucial - bugs must be traceable. We will be happy to
31 * consider code for inclusion in the official distribution, but
32 * derived work must not be called official GROMACS. Details are found
33 * in the README & COPYING files - if they are missing, get the
34 * official version at http://www.gromacs.org.
36 * To help us fund GROMACS development, we humbly ask that you cite
37 * the research papers on the package. Check out http://www.gromacs.org.
47 #include "gmx_statistics.h"
49 static int gmx_dnint(double x)
54 typedef struct gmx_stats {
55 double aa,a,b,sigma_aa,sigma_a,sigma_b,aver,sigma_aver,error;
56 double rmsd,Rdata,Rfit,Rfitaa,chi2,chi2aa;
62 gmx_stats_t gmx_stats_init()
68 return (gmx_stats_t) stats;
71 int gmx_stats_get_npoints(gmx_stats_t gstats, int *N)
73 gmx_stats *stats = (gmx_stats *) gstats;
80 int gmx_stats_done(gmx_stats_t gstats)
82 gmx_stats *stats = (gmx_stats *) gstats;
96 int gmx_stats_add_point(gmx_stats_t gstats,double x,double y,
99 gmx_stats *stats = (gmx_stats *) gstats;
102 if (stats->np+1 >= stats->nalloc)
104 if (stats->nalloc == 0)
105 stats->nalloc = 1024;
108 srenew(stats->x,stats->nalloc);
109 srenew(stats->y,stats->nalloc);
110 srenew(stats->dx,stats->nalloc);
111 srenew(stats->dy,stats->nalloc);
112 for(i=stats->np; (i<stats->nalloc); i++)
120 stats->x[stats->np] = x;
121 stats->y[stats->np] = y;
122 stats->dx[stats->np] = dx;
123 stats->dy[stats->np] = dy;
130 int gmx_stats_get_point(gmx_stats_t gstats,real *x,real *y,
133 gmx_stats *stats = (gmx_stats *) gstats;
135 if (stats->np_c < stats->np)
137 if (NULL != x) *x = stats->x[stats->np_c];
138 if (NULL != y) *y = stats->y[stats->np_c];
139 if (NULL != dx) *dx = stats->dx[stats->np_c];
140 if (NULL != dy) *dy = stats->dy[stats->np_c];
147 return estatsNO_POINTS;
150 int gmx_stats_add_points(gmx_stats_t gstats,int n,real *x,real *y,
157 if ((ok = gmx_stats_add_point(gstats,x[i],y[i],
158 (NULL != dx) ? dx[i] : 0,
159 (NULL != dy) ? dy[i] : 0)) != estatsOK)
167 static int gmx_stats_compute(gmx_stats *stats,int weight)
169 double yy,yx,xx,sx,sy,dy,chi2,chi2aa,d2;
170 double ssxx,ssyy,ssxy;
171 double w,wtot,yx_nw,sy_nw,sx_nw,yy_nw,xx_nw,dx2,dy2;
175 if (stats->computed == 0)
179 return estatsNO_POINTS;
191 d2 += dsqr(stats->x[i]-stats->y[i]);
192 if ((stats->dy[i]) && (weight == elsqWEIGHT_Y))
194 w = 1/dsqr(stats->dy[i]);
203 xx += w*dsqr(stats->x[i]);
204 xx_nw += dsqr(stats->x[i]);
206 yy += w*dsqr(stats->y[i]);
207 yy_nw += dsqr(stats->y[i]);
209 yx += w*stats->y[i]*stats->x[i];
210 yx_nw += stats->y[i]*stats->x[i];
213 sx_nw += stats->x[i];
216 sy_nw += stats->y[i];
219 /* Compute average, sigma and error */
220 stats->aver = sy_nw/N;
221 stats->sigma_aver = sqrt(yy_nw/N - dsqr(sy_nw/N));
222 stats->error = stats->sigma_aver/sqrt(N);
224 /* Compute RMSD between x and y */
225 stats->rmsd = sqrt(d2/N);
227 /* Correlation coefficient for data */
233 ssxx = N*(xx_nw - dsqr(sx_nw));
234 ssyy = N*(yy_nw - dsqr(sy_nw));
235 ssxy = N*(yx_nw - (sx_nw*sy_nw));
236 stats->Rdata = sqrt(dsqr(ssxy)/(ssxx*ssyy));
238 /* Compute straight line through datapoints, either with intercept
239 zero (result in aa) or with intercept variable (results in a
247 stats->a = (yx-sx*sy)/(xx-sx*sx);
248 stats->b = (sy)-(stats->a)*(sx);
250 /* Compute chi2, deviation from a line y = ax+b. Also compute
251 chi2aa which returns the deviation from a line y = ax. */
256 if (stats->dy[i] > 0)
264 chi2aa += dsqr((stats->y[i]-(stats->aa*stats->x[i]))/dy);
265 chi2 += dsqr((stats->y[i]-(stats->a*stats->x[i]+stats->b))/dy);
269 stats->chi2 = sqrt(chi2/(N-2));
270 stats->chi2aa = sqrt(chi2aa/(N-2));
272 /* Look up equations! */
275 stats->sigma_a = sqrt(stats->chi2/((N-2)*dx2));
276 stats->sigma_b = stats->sigma_a*sqrt(xx);
277 stats->Rfit = fabs(ssxy)/sqrt(ssxx*ssyy);
278 /*stats->a*sqrt(dx2/dy2);*/
279 stats->Rfitaa = stats->aa*sqrt(dx2/dy2);
297 int gmx_stats_get_ab(gmx_stats_t gstats,int weight,
298 real *a,real *b,real *da,real *db,
299 real *chi2,real *Rfit)
301 gmx_stats *stats = (gmx_stats *) gstats;
304 if ((ok = gmx_stats_compute(stats,weight)) != estatsOK)
316 *da = stats->sigma_a;
320 *db = stats->sigma_b;
334 int gmx_stats_get_a(gmx_stats_t gstats,int weight,real *a,real *da,
335 real *chi2,real *Rfit)
337 gmx_stats *stats = (gmx_stats *) gstats;
340 if ((ok = gmx_stats_compute(stats,weight)) != estatsOK)
342 if (NULL != a) *a = stats->aa;
343 if (NULL != da) *da = stats->sigma_aa;
344 if (NULL != chi2) *chi2 = stats->chi2aa;
345 if (NULL != Rfit) *Rfit = stats->Rfitaa;
350 int gmx_stats_get_average(gmx_stats_t gstats,real *aver)
352 gmx_stats *stats = (gmx_stats *) gstats;
355 if ((ok = gmx_stats_compute(stats,elsqWEIGHT_NONE)) != estatsOK)
365 int gmx_stats_get_ase(gmx_stats_t gstats,real *aver,real *sigma,real *error)
367 gmx_stats *stats = (gmx_stats *) gstats;
370 if ((ok = gmx_stats_compute(stats,elsqWEIGHT_NONE)) != estatsOK)
381 *sigma = stats->sigma_aver;
385 *error = stats->error;
391 int gmx_stats_get_sigma(gmx_stats_t gstats,real *sigma)
393 gmx_stats *stats = (gmx_stats *) gstats;
396 if ((ok = gmx_stats_compute(stats,elsqWEIGHT_NONE)) != estatsOK)
399 *sigma = stats->sigma_aver;
404 int gmx_stats_get_error(gmx_stats_t gstats,real *error)
406 gmx_stats *stats = (gmx_stats *) gstats;
409 if ((ok = gmx_stats_compute(stats,elsqWEIGHT_NONE)) != estatsOK)
412 *error = stats->error;
417 int gmx_stats_get_corr_coeff(gmx_stats_t gstats,real *R)
419 gmx_stats *stats = (gmx_stats *) gstats;
422 if ((ok = gmx_stats_compute(stats,elsqWEIGHT_NONE)) != estatsOK)
430 int gmx_stats_get_rmsd(gmx_stats_t gstats,real *rmsd)
432 gmx_stats *stats = (gmx_stats *) gstats;
435 if ((ok = gmx_stats_compute(stats,elsqWEIGHT_NONE)) != estatsOK)
445 int gmx_stats_dump_xy(gmx_stats_t gstats,FILE *fp)
447 gmx_stats *stats = (gmx_stats *) gstats;
450 for(i=0; (i<stats->np); i++)
452 fprintf(fp,"%12g %12g %12g %12g\n",stats->x[i],stats->y[i],
453 stats->dx[i],stats->dy[i]);
459 int gmx_stats_remove_outliers(gmx_stats_t gstats,double level)
461 gmx_stats *stats = (gmx_stats *) gstats;
462 int i,iter=1,done=0,ok;
465 while ((stats->np >= 10) && !done)
467 if ((ok = gmx_stats_get_rmsd(gstats,&rmsd)) != estatsOK)
472 for(i=0; (i<stats->np); )
474 r = fabs(stats->x[i]-stats->y[i]);
477 fprintf(stderr,"Removing outlier, iter = %d, rmsd = %g, x = %g, y = %g\n",
478 iter,rmsd,stats->x[i],stats->y[i]);
481 stats->x[i] = stats->x[stats->np-1];
482 stats->y[i] = stats->y[stats->np-1];
483 stats->dx[i] = stats->dx[stats->np-1];
484 stats->dy[i] = stats->dy[stats->np-1];
500 int gmx_stats_make_histogram(gmx_stats_t gstats,real binwidth,int *nb,
501 int ehisto,int normalized,real **x,real **y)
503 gmx_stats *stats = (gmx_stats *) gstats;
504 int i,ok,index=0,nbins=*nb,*nindex;
505 double minx,maxx,maxy,miny,delta,dd,minh;
507 if (((binwidth <= 0) && (nbins <= 0)) ||
508 ((binwidth > 0) && (nbins > 0)))
510 return estatsINVALID_INPUT;
514 return estatsNO_POINTS;
516 minx = maxx = stats->x[0];
517 miny = maxy = stats->y[0];
518 for(i=1; (i<stats->np); i++)
520 miny = (stats->y[i] < miny) ? stats->y[i] : miny;
521 maxy = (stats->y[i] > maxy) ? stats->y[i] : maxy;
522 minx = (stats->x[i] < minx) ? stats->x[i] : minx;
523 maxx = (stats->x[i] > maxx) ? stats->x[i] : maxx;
525 if (ehisto == ehistoX)
530 else if (ehisto == ehistoY)
536 return estatsINVALID_INPUT;
540 binwidth = (delta)/nbins;
544 nbins = gmx_dnint((delta)/binwidth + 0.5);
548 for(i=0; (i<nbins); i++)
550 (*x)[i] = minh + binwidth*(i+0.5);
558 dd = 1.0/(binwidth*stats->np);
562 for(i=0; (i<stats->np); i++)
564 if (ehisto == ehistoY)
565 index = (stats->y[i]-miny)/binwidth;
566 else if (ehisto == ehistoX)
567 index = (stats->x[i]-minx)/binwidth;
581 for(i=0; (i<nbins); i++)
583 (*y)[i] /= nindex[i];
590 static const char *stats_error[estatsNR] =
592 "All well in STATS land",
595 "Invalid histogram input",
597 "Not implemented yet"
600 const char *gmx_stats_message(int estats)
602 if ((estats >= 0) && (estats < estatsNR))
604 return stats_error[estats];
608 return stats_error[estatsERROR];
612 /* Old convenience functions, should be merged with the core
614 int lsq_y_ax(int n, real x[], real y[], real *a)
616 gmx_stats_t lsq = gmx_stats_init();
620 gmx_stats_add_points(lsq,n,x,y,0,0);
621 if ((ok = gmx_stats_get_a(lsq,elsqWEIGHT_NONE,a,&da,&chi2,&Rfit)) != estatsOK)
630 for (i=0; i<n; i++) {
639 static int low_lsq_y_ax_b(int n, real *xr, double *xd, real yr[],
640 real *a, real *b,real *r,real *chi2)
645 lsq = gmx_stats_init();
648 if ((ok = gmx_stats_add_point(lsq,(NULL != xd) ? xd[i] : xr[i],yr[i],0,0))
654 if ((ok = gmx_stats_get_ab(lsq,elsqWEIGHT_NONE,a,b,NULL,NULL,chi2,r)) != estatsOK)
661 double x,y,yx,xx,yy,sx,sy,chi2;
664 for (i=0; i<n; i++) {
678 *a = (n*yx-sy*sx)/(n*xx-sx*sx);
680 *r = sqrt((xx-sx*sx)/(yy-sy*sy));
685 chi2 += dsqr(yr[i] - ((*a)*xd[i] + (*b)));
688 chi2 += dsqr(yr[i] - ((*a)*xr[i] + (*b)));
692 return sqrt(chi2/(n-2));
698 int lsq_y_ax_b(int n, real x[], real y[], real *a, real *b,real *r,real *chi2)
700 return low_lsq_y_ax_b(n,x,NULL,y,a,b,r,chi2);
703 int lsq_y_ax_b_xdouble(int n, double x[], real y[], real *a, real *b,
706 return low_lsq_y_ax_b(n,NULL,x,y,a,b,r,chi2);
709 int lsq_y_ax_b_error(int n, real x[], real y[], real dy[],
710 real *a, real *b, real *da, real *db,
716 lsq = gmx_stats_init();
719 if ((ok = gmx_stats_add_point(lsq,x[i],y[i],0,dy[i])) != estatsOK)
724 if ((ok = gmx_stats_get_ab(lsq,elsqWEIGHT_Y,a,b,da,db,chi2,r)) != estatsOK)
728 if ((ok = gmx_stats_done(lsq)) != estatsOK)
736 double sxy,sxx,syy,sx,sy,w,s_2,dx2,dy2,mins;
738 sxy=sxx=syy=sx=sy=w=0.0;
741 mins = min(mins,dy[i]);
743 gmx_fatal(FARGS,"Zero or negative weigths in linear regression analysis");
745 for (i=0; i<n; i++) {
746 s_2 = dsqr(1.0/dy[i]);
747 sxx += s_2*dsqr(x[i]);
748 sxy += s_2*y[i]*x[i];
749 syy += s_2*dsqr(y[i]);
766 *chi2+=dsqr((y[i]-((*a)*x[i]+(*b)))/dy[i]);
769 *da = sqrt(*chi2/((n-2)*dx2));
771 *r = *a*sqrt(dx2/dy2);
774 fprintf(debug,"sx = %g, sy = %g, sxy = %g, sxx = %g, w = %g\n"
775 "chi2 = %g, dx2 = %g\n",
776 sx,sy,sxy,sxx,w,*chi2,dx2);
779 *chi2 = sqrt(*chi2/(n-2));