2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2012,2014,2015, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
39 #include "statistics.h"
43 #include "gromacs/math/vec.h"
44 #include "gromacs/utility/real.h"
45 #include "gromacs/utility/smalloc.h"
47 static int gmx_dnint(double x)
49 return static_cast<int>(x+0.5);
52 typedef struct gmx_stats {
53 double aa, a, b, sigma_aa, sigma_a, sigma_b, aver, sigma_aver, error;
54 double rmsd, Rdata, Rfit, Rfitaa, chi2, chi2aa;
55 double *x, *y, *dx, *dy;
60 gmx_stats_t gmx_stats_init()
66 return (gmx_stats_t) stats;
69 int gmx_stats_get_npoints(gmx_stats_t gstats, int *N)
71 gmx_stats *stats = (gmx_stats *) gstats;
78 int gmx_stats_done(gmx_stats_t gstats)
80 gmx_stats *stats = (gmx_stats *) gstats;
94 int gmx_stats_add_point(gmx_stats_t gstats, double x, double y,
97 gmx_stats *stats = gstats;
99 if (stats->np+1 >= stats->nalloc)
101 if (stats->nalloc == 0)
103 stats->nalloc = 1024;
109 srenew(stats->x, stats->nalloc);
110 srenew(stats->y, stats->nalloc);
111 srenew(stats->dx, stats->nalloc);
112 srenew(stats->dy, stats->nalloc);
113 for (int i = stats->np; (i < stats->nalloc); i++)
121 stats->x[stats->np] = x;
122 stats->y[stats->np] = y;
123 stats->dx[stats->np] = dx;
124 stats->dy[stats->np] = dy;
131 int gmx_stats_get_point(gmx_stats_t gstats, real *x, real *y,
132 real *dx, real *dy, real level)
134 gmx_stats *stats = gstats;
138 if ((ok = gmx_stats_get_rmsd(gstats, &rmsd)) != estatsOK)
143 while ((outlier == 0) && (stats->np_c < stats->np))
145 r = std::abs(stats->x[stats->np_c] - stats->y[stats->np_c]);
146 outlier = (r > rmsd*level);
151 *x = stats->x[stats->np_c];
155 *y = stats->y[stats->np_c];
159 *dx = stats->dx[stats->np_c];
163 *dy = stats->dy[stats->np_c];
176 return estatsNO_POINTS;
179 int gmx_stats_add_points(gmx_stats_t gstats, int n, real *x, real *y,
182 for (int i = 0; (i < n); i++)
185 if ((ok = gmx_stats_add_point(gstats, x[i], y[i],
186 (NULL != dx) ? dx[i] : 0,
187 (NULL != dy) ? dy[i] : 0)) != estatsOK)
195 static int gmx_stats_compute(gmx_stats *stats, int weight)
197 double yy, yx, xx, sx, sy, dy, chi2, chi2aa, d2;
198 double ssxx, ssyy, ssxy;
199 double w, wtot, yx_nw, sy_nw, sx_nw, yy_nw, xx_nw, dx2, dy2;
203 if (stats->computed == 0)
207 return estatsNO_POINTS;
217 for (int i = 0; (i < N); i++)
219 d2 += dsqr(stats->x[i]-stats->y[i]);
220 if ((stats->dy[i]) && (weight == elsqWEIGHT_Y))
222 w = 1/dsqr(stats->dy[i]);
231 xx += w*dsqr(stats->x[i]);
232 xx_nw += dsqr(stats->x[i]);
234 yy += w*dsqr(stats->y[i]);
235 yy_nw += dsqr(stats->y[i]);
237 yx += w*stats->y[i]*stats->x[i];
238 yx_nw += stats->y[i]*stats->x[i];
241 sx_nw += stats->x[i];
244 sy_nw += stats->y[i];
247 /* Compute average, sigma and error */
248 stats->aver = sy_nw/N;
249 stats->sigma_aver = std::sqrt(yy_nw/N - dsqr(sy_nw/N));
250 stats->error = stats->sigma_aver/std::sqrt(static_cast<double>(N));
252 /* Compute RMSD between x and y */
253 stats->rmsd = std::sqrt(d2/N);
255 /* Correlation coefficient for data */
261 ssxx = N*(xx_nw - dsqr(sx_nw));
262 ssyy = N*(yy_nw - dsqr(sy_nw));
263 ssxy = N*(yx_nw - (sx_nw*sy_nw));
264 stats->Rdata = std::sqrt(dsqr(ssxy)/(ssxx*ssyy));
266 /* Compute straight line through datapoints, either with intercept
267 zero (result in aa) or with intercept variable (results in a
275 stats->a = (yx-sx*sy)/(xx-sx*sx);
276 stats->b = (sy)-(stats->a)*(sx);
278 /* Compute chi2, deviation from a line y = ax+b. Also compute
279 chi2aa which returns the deviation from a line y = ax. */
282 for (int i = 0; (i < N); i++)
284 if (stats->dy[i] > 0)
292 chi2aa += dsqr((stats->y[i]-(stats->aa*stats->x[i]))/dy);
293 chi2 += dsqr((stats->y[i]-(stats->a*stats->x[i]+stats->b))/dy);
297 stats->chi2 = std::sqrt(chi2/(N-2));
298 stats->chi2aa = std::sqrt(chi2aa/(N-2));
300 /* Look up equations! */
303 stats->sigma_a = std::sqrt(stats->chi2/((N-2)*dx2));
304 stats->sigma_b = stats->sigma_a*std::sqrt(xx);
305 stats->Rfit = std::abs(ssxy)/std::sqrt(ssxx*ssyy);
306 stats->Rfitaa = stats->aa*std::sqrt(dx2/dy2);
324 int gmx_stats_get_ab(gmx_stats_t gstats, int weight,
325 real *a, real *b, real *da, real *db,
326 real *chi2, real *Rfit)
328 gmx_stats *stats = gstats;
331 if ((ok = gmx_stats_compute(stats, weight)) != estatsOK)
345 *da = stats->sigma_a;
349 *db = stats->sigma_b;
363 int gmx_stats_get_a(gmx_stats_t gstats, int weight, real *a, real *da,
364 real *chi2, real *Rfit)
366 gmx_stats *stats = gstats;
369 if ((ok = gmx_stats_compute(stats, weight)) != estatsOK)
379 *da = stats->sigma_aa;
383 *chi2 = stats->chi2aa;
387 *Rfit = stats->Rfitaa;
393 int gmx_stats_get_average(gmx_stats_t gstats, real *aver)
395 gmx_stats *stats = gstats;
398 if ((ok = gmx_stats_compute(stats, elsqWEIGHT_NONE)) != estatsOK)
408 int gmx_stats_get_ase(gmx_stats_t gstats, real *aver, real *sigma, real *error)
410 gmx_stats *stats = gstats;
413 if ((ok = gmx_stats_compute(stats, elsqWEIGHT_NONE)) != estatsOK)
424 *sigma = stats->sigma_aver;
428 *error = stats->error;
434 int gmx_stats_get_sigma(gmx_stats_t gstats, real *sigma)
436 gmx_stats *stats = gstats;
439 if ((ok = gmx_stats_compute(stats, elsqWEIGHT_NONE)) != estatsOK)
444 *sigma = stats->sigma_aver;
449 int gmx_stats_get_error(gmx_stats_t gstats, real *error)
451 gmx_stats *stats = gstats;
454 if ((ok = gmx_stats_compute(stats, elsqWEIGHT_NONE)) != estatsOK)
459 *error = stats->error;
464 int gmx_stats_get_corr_coeff(gmx_stats_t gstats, real *R)
466 gmx_stats *stats = gstats;
469 if ((ok = gmx_stats_compute(stats, elsqWEIGHT_NONE)) != estatsOK)
479 int gmx_stats_get_rmsd(gmx_stats_t gstats, real *rmsd)
481 gmx_stats *stats = gstats;
484 if ((ok = gmx_stats_compute(stats, elsqWEIGHT_NONE)) != estatsOK)
494 int gmx_stats_dump_xy(gmx_stats_t gstats, FILE *fp)
496 gmx_stats *stats = gstats;
498 for (int i = 0; (i < stats->np); i++)
500 fprintf(fp, "%12g %12g %12g %12g\n", stats->x[i], stats->y[i],
501 stats->dx[i], stats->dy[i]);
507 int gmx_stats_remove_outliers(gmx_stats_t gstats, double level)
509 gmx_stats *stats = gstats;
510 int iter = 1, done = 0, ok;
513 while ((stats->np >= 10) && !done)
515 if ((ok = gmx_stats_get_rmsd(gstats, &rmsd)) != estatsOK)
520 for (int i = 0; (i < stats->np); )
522 r = std::abs(stats->x[i]-stats->y[i]);
525 fprintf(stderr, "Removing outlier, iter = %d, rmsd = %g, x = %g, y = %g\n",
526 iter, rmsd, stats->x[i], stats->y[i]);
529 stats->x[i] = stats->x[stats->np-1];
530 stats->y[i] = stats->y[stats->np-1];
531 stats->dx[i] = stats->dx[stats->np-1];
532 stats->dy[i] = stats->dy[stats->np-1];
548 int gmx_stats_make_histogram(gmx_stats_t gstats, real binwidth, int *nb,
549 int ehisto, int normalized, real **x, real **y)
551 gmx_stats *stats = gstats;
552 int index = 0, nbins = *nb, *nindex;
553 double minx, maxx, maxy, miny, delta, dd, minh;
555 if (((binwidth <= 0) && (nbins <= 0)) ||
556 ((binwidth > 0) && (nbins > 0)))
558 return estatsINVALID_INPUT;
562 return estatsNO_POINTS;
564 minx = maxx = stats->x[0];
565 miny = maxy = stats->y[0];
566 for (int i = 1; (i < stats->np); i++)
568 miny = (stats->y[i] < miny) ? stats->y[i] : miny;
569 maxy = (stats->y[i] > maxy) ? stats->y[i] : maxy;
570 minx = (stats->x[i] < minx) ? stats->x[i] : minx;
571 maxx = (stats->x[i] > maxx) ? stats->x[i] : maxx;
573 if (ehisto == ehistoX)
578 else if (ehisto == ehistoY)
585 return estatsINVALID_INPUT;
590 binwidth = (delta)/nbins;
594 nbins = gmx_dnint((delta)/binwidth + 0.5);
598 for (int i = 0; (i < nbins); i++)
600 (*x)[i] = minh + binwidth*(i+0.5);
608 dd = 1.0/(binwidth*stats->np);
612 for (int i = 0; (i < stats->np); i++)
614 if (ehisto == ehistoY)
616 index = static_cast<int>((stats->y[i]-miny)/binwidth);
618 else if (ehisto == ehistoX)
620 index = static_cast<int>((stats->x[i]-minx)/binwidth);
637 for (int i = 0; (i < nbins); i++)
641 (*y)[i] /= nindex[i];
650 static const char *stats_error[estatsNR] =
652 "All well in STATS land",
655 "Invalid histogram input",
657 "Not implemented yet"
660 const char *gmx_stats_message(int estats)
662 if ((estats >= 0) && (estats < estatsNR))
664 return stats_error[estats];
668 return stats_error[estatsERROR];
672 /* Old convenience functions, should be merged with the core
674 int lsq_y_ax(int n, real x[], real y[], real *a)
676 gmx_stats_t lsq = gmx_stats_init();
680 gmx_stats_add_points(lsq, n, x, y, 0, 0);
681 if ((ok = gmx_stats_get_a(lsq, elsqWEIGHT_NONE, a, &da, &chi2, &Rfit)) != estatsOK)
689 static int low_lsq_y_ax_b(int n, real *xr, double *xd, real yr[],
690 real *a, real *b, real *r, real *chi2)
692 gmx_stats_t lsq = gmx_stats_init();
695 for (int i = 0; (i < n); i++)
709 gmx_incons("Either xd or xr has to be non-NULL in low_lsq_y_ax_b()");
712 if ((ok = gmx_stats_add_point(lsq, pt, yr[i], 0, 0)) != estatsOK)
717 if ((ok = gmx_stats_get_ab(lsq, elsqWEIGHT_NONE, a, b, NULL, NULL, chi2, r)) != estatsOK)
725 int lsq_y_ax_b(int n, real x[], real y[], real *a, real *b, real *r, real *chi2)
727 return low_lsq_y_ax_b(n, x, NULL, y, a, b, r, chi2);
730 int lsq_y_ax_b_xdouble(int n, double x[], real y[], real *a, real *b,
733 return low_lsq_y_ax_b(n, NULL, x, y, a, b, r, chi2);
736 int lsq_y_ax_b_error(int n, real x[], real y[], real dy[],
737 real *a, real *b, real *da, real *db,
740 gmx_stats_t lsq = gmx_stats_init();
743 for (int i = 0; (i < n); i++)
745 if ((ok = gmx_stats_add_point(lsq, x[i], y[i], 0, dy[i])) != estatsOK)
750 if ((ok = gmx_stats_get_ab(lsq, elsqWEIGHT_Y, a, b, da, db, chi2, r)) != estatsOK)
754 if ((ok = gmx_stats_done(lsq)) != estatsOK)