2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2012,2014,2015,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
39 #include "statistics.h"
43 #include "gromacs/math/functions.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/utility/fatalerror.h"
46 #include "gromacs/utility/real.h"
47 #include "gromacs/utility/smalloc.h"
49 static int gmx_dnint(double x)
51 return gmx::roundToInt(x);
54 typedef struct gmx_stats {
55 double aa, a, b, sigma_aa, sigma_a, sigma_b, aver, sigma_aver, error;
56 double rmsd, Rdata, Rfit, Rfitaa, chi2, chi2aa;
57 double *x, *y, *dx, *dy;
62 gmx_stats_t gmx_stats_init()
68 return static_cast<gmx_stats_t>(stats);
71 int gmx_stats_get_npoints(gmx_stats_t gstats, int *N)
73 gmx_stats *stats = static_cast<gmx_stats *>(gstats);
80 void gmx_stats_free(gmx_stats_t gstats)
82 gmx_stats *stats = static_cast<gmx_stats *>(gstats);
91 int gmx_stats_add_point(gmx_stats_t gstats, double x, double y,
94 gmx_stats *stats = gstats;
96 if (stats->np+1 >= stats->nalloc)
98 if (stats->nalloc == 0)
100 stats->nalloc = 1024;
106 srenew(stats->x, stats->nalloc);
107 srenew(stats->y, stats->nalloc);
108 srenew(stats->dx, stats->nalloc);
109 srenew(stats->dy, stats->nalloc);
110 for (int i = stats->np; (i < stats->nalloc); i++)
118 stats->x[stats->np] = x;
119 stats->y[stats->np] = y;
120 stats->dx[stats->np] = dx;
121 stats->dy[stats->np] = dy;
128 int gmx_stats_get_point(gmx_stats_t gstats, real *x, real *y,
129 real *dx, real *dy, real level)
131 gmx_stats *stats = gstats;
135 if ((ok = gmx_stats_get_rmsd(gstats, &rmsd)) != estatsOK)
140 while ((outlier == 0) && (stats->np_c < stats->np))
142 r = std::abs(stats->x[stats->np_c] - stats->y[stats->np_c]);
143 outlier = static_cast<int>(r > rmsd*level);
148 *x = stats->x[stats->np_c];
152 *y = stats->y[stats->np_c];
156 *dx = stats->dx[stats->np_c];
160 *dy = stats->dy[stats->np_c];
173 return estatsNO_POINTS;
176 int gmx_stats_add_points(gmx_stats_t gstats, int n, real *x, real *y,
179 for (int i = 0; (i < n); i++)
182 if ((ok = gmx_stats_add_point(gstats, x[i], y[i],
183 (nullptr != dx) ? dx[i] : 0,
184 (nullptr != dy) ? dy[i] : 0)) != estatsOK)
192 static int gmx_stats_compute(gmx_stats *stats, int weight)
194 double yy, yx, xx, sx, sy, dy, chi2, chi2aa, d2;
195 double ssxx, ssyy, ssxy;
196 double w, wtot, yx_nw, sy_nw, sx_nw, yy_nw, xx_nw, dx2, dy2;
200 if (stats->computed == 0)
204 return estatsNO_POINTS;
214 for (int i = 0; (i < N); i++)
216 d2 += gmx::square(stats->x[i]-stats->y[i]);
217 if (((stats->dy[i]) != 0.0) && (weight == elsqWEIGHT_Y))
219 w = 1/gmx::square(stats->dy[i]);
228 xx += w*gmx::square(stats->x[i]);
229 xx_nw += gmx::square(stats->x[i]);
231 yy += w*gmx::square(stats->y[i]);
232 yy_nw += gmx::square(stats->y[i]);
234 yx += w*stats->y[i]*stats->x[i];
235 yx_nw += stats->y[i]*stats->x[i];
238 sx_nw += stats->x[i];
241 sy_nw += stats->y[i];
244 /* Compute average, sigma and error */
245 stats->aver = sy_nw/N;
246 stats->sigma_aver = std::sqrt(yy_nw/N - gmx::square(sy_nw/N));
247 stats->error = stats->sigma_aver/std::sqrt(static_cast<double>(N));
249 /* Compute RMSD between x and y */
250 stats->rmsd = std::sqrt(d2/N);
252 /* Correlation coefficient for data */
258 ssxx = N*(xx_nw - gmx::square(sx_nw));
259 ssyy = N*(yy_nw - gmx::square(sy_nw));
260 ssxy = N*(yx_nw - (sx_nw*sy_nw));
261 stats->Rdata = std::sqrt(gmx::square(ssxy)/(ssxx*ssyy));
263 /* Compute straight line through datapoints, either with intercept
264 zero (result in aa) or with intercept variable (results in a
272 stats->a = (yx-sx*sy)/(xx-sx*sx);
273 stats->b = (sy)-(stats->a)*(sx);
275 /* Compute chi2, deviation from a line y = ax+b. Also compute
276 chi2aa which returns the deviation from a line y = ax. */
279 for (int i = 0; (i < N); i++)
281 if (stats->dy[i] > 0)
289 chi2aa += gmx::square((stats->y[i]-(stats->aa*stats->x[i]))/dy);
290 chi2 += gmx::square((stats->y[i]-(stats->a*stats->x[i]+stats->b))/dy);
294 stats->chi2 = std::sqrt(chi2/(N-2));
295 stats->chi2aa = std::sqrt(chi2aa/(N-2));
297 /* Look up equations! */
300 stats->sigma_a = std::sqrt(stats->chi2/((N-2)*dx2));
301 stats->sigma_b = stats->sigma_a*std::sqrt(xx);
302 stats->Rfit = std::abs(ssxy)/std::sqrt(ssxx*ssyy);
303 stats->Rfitaa = stats->aa*std::sqrt(dx2/dy2);
321 int gmx_stats_get_ab(gmx_stats_t gstats, int weight,
322 real *a, real *b, real *da, real *db,
323 real *chi2, real *Rfit)
325 gmx_stats *stats = gstats;
328 if ((ok = gmx_stats_compute(stats, weight)) != estatsOK)
342 *da = stats->sigma_a;
346 *db = stats->sigma_b;
360 int gmx_stats_get_a(gmx_stats_t gstats, int weight, real *a, real *da,
361 real *chi2, real *Rfit)
363 gmx_stats *stats = gstats;
366 if ((ok = gmx_stats_compute(stats, weight)) != estatsOK)
376 *da = stats->sigma_aa;
380 *chi2 = stats->chi2aa;
384 *Rfit = stats->Rfitaa;
390 int gmx_stats_get_average(gmx_stats_t gstats, real *aver)
392 gmx_stats *stats = gstats;
395 if ((ok = gmx_stats_compute(stats, elsqWEIGHT_NONE)) != estatsOK)
405 int gmx_stats_get_ase(gmx_stats_t gstats, real *aver, real *sigma, real *error)
407 gmx_stats *stats = gstats;
410 if ((ok = gmx_stats_compute(stats, elsqWEIGHT_NONE)) != estatsOK)
419 if (nullptr != sigma)
421 *sigma = stats->sigma_aver;
423 if (nullptr != error)
425 *error = stats->error;
431 int gmx_stats_get_sigma(gmx_stats_t gstats, real *sigma)
433 gmx_stats *stats = gstats;
436 if ((ok = gmx_stats_compute(stats, elsqWEIGHT_NONE)) != estatsOK)
441 *sigma = stats->sigma_aver;
446 int gmx_stats_get_error(gmx_stats_t gstats, real *error)
448 gmx_stats *stats = gstats;
451 if ((ok = gmx_stats_compute(stats, elsqWEIGHT_NONE)) != estatsOK)
456 *error = stats->error;
461 int gmx_stats_get_corr_coeff(gmx_stats_t gstats, real *R)
463 gmx_stats *stats = gstats;
466 if ((ok = gmx_stats_compute(stats, elsqWEIGHT_NONE)) != estatsOK)
476 int gmx_stats_get_rmsd(gmx_stats_t gstats, real *rmsd)
478 gmx_stats *stats = gstats;
481 if ((ok = gmx_stats_compute(stats, elsqWEIGHT_NONE)) != estatsOK)
491 int gmx_stats_dump_xy(gmx_stats_t gstats, FILE *fp)
493 gmx_stats *stats = gstats;
495 for (int i = 0; (i < stats->np); i++)
497 fprintf(fp, "%12g %12g %12g %12g\n", stats->x[i], stats->y[i],
498 stats->dx[i], stats->dy[i]);
504 int gmx_stats_remove_outliers(gmx_stats_t gstats, double level)
506 gmx_stats *stats = gstats;
507 int iter = 1, done = 0, ok;
510 while ((stats->np >= 10) && !done)
512 if ((ok = gmx_stats_get_rmsd(gstats, &rmsd)) != estatsOK)
517 for (int i = 0; (i < stats->np); )
519 r = std::abs(stats->x[i]-stats->y[i]);
522 fprintf(stderr, "Removing outlier, iter = %d, rmsd = %g, x = %g, y = %g\n",
523 iter, rmsd, stats->x[i], stats->y[i]);
526 stats->x[i] = stats->x[stats->np-1];
527 stats->y[i] = stats->y[stats->np-1];
528 stats->dx[i] = stats->dx[stats->np-1];
529 stats->dy[i] = stats->dy[stats->np-1];
545 int gmx_stats_make_histogram(gmx_stats_t gstats, real binwidth, int *nb,
546 int ehisto, int normalized, real **x, real **y)
548 gmx_stats *stats = gstats;
549 int index = 0, nbins = *nb, *nindex;
550 double minx, maxx, maxy, miny, delta, dd, minh;
552 if (((binwidth <= 0) && (nbins <= 0)) ||
553 ((binwidth > 0) && (nbins > 0)))
555 return estatsINVALID_INPUT;
559 return estatsNO_POINTS;
561 minx = maxx = stats->x[0];
562 miny = maxy = stats->y[0];
563 for (int i = 1; (i < stats->np); i++)
565 miny = (stats->y[i] < miny) ? stats->y[i] : miny;
566 maxy = (stats->y[i] > maxy) ? stats->y[i] : maxy;
567 minx = (stats->x[i] < minx) ? stats->x[i] : minx;
568 maxx = (stats->x[i] > maxx) ? stats->x[i] : maxx;
570 if (ehisto == ehistoX)
575 else if (ehisto == ehistoY)
582 return estatsINVALID_INPUT;
587 binwidth = (delta)/nbins;
591 nbins = gmx_dnint((delta)/binwidth + 0.5);
595 for (int i = 0; (i < nbins); i++)
597 (*x)[i] = minh + binwidth*(i+0.5);
605 dd = 1.0/(binwidth*stats->np);
609 for (int i = 0; (i < stats->np); i++)
611 if (ehisto == ehistoY)
613 index = static_cast<int>((stats->y[i]-miny)/binwidth);
615 else if (ehisto == ehistoX)
617 index = static_cast<int>((stats->x[i]-minx)/binwidth);
634 for (int i = 0; (i < nbins); i++)
638 (*y)[i] /= nindex[i];
647 static const char *stats_error[estatsNR] =
649 "All well in STATS land",
652 "Invalid histogram input",
654 "Not implemented yet"
657 const char *gmx_stats_message(int estats)
659 if ((estats >= 0) && (estats < estatsNR))
661 return stats_error[estats];
665 return stats_error[estatsERROR];
669 /* Old convenience functions, should be merged with the core
671 int lsq_y_ax(int n, real x[], real y[], real *a)
673 gmx_stats_t lsq = gmx_stats_init();
677 gmx_stats_add_points(lsq, n, x, y, nullptr, nullptr);
678 ok = gmx_stats_get_a(lsq, elsqWEIGHT_NONE, a, &da, &chi2, &Rfit);
684 static int low_lsq_y_ax_b(int n, const real *xr, const double *xd, real yr[],
685 real *a, real *b, real *r, real *chi2)
687 gmx_stats_t lsq = gmx_stats_init();
690 for (int i = 0; (i < n); i++)
698 else if (xr != nullptr)
704 gmx_incons("Either xd or xr has to be non-NULL in low_lsq_y_ax_b()");
707 if ((ok = gmx_stats_add_point(lsq, pt, yr[i], 0, 0)) != estatsOK)
713 ok = gmx_stats_get_ab(lsq, elsqWEIGHT_NONE, a, b, nullptr, nullptr, chi2, r);
719 int lsq_y_ax_b(int n, real x[], real y[], real *a, real *b, real *r, real *chi2)
721 return low_lsq_y_ax_b(n, x, nullptr, y, a, b, r, chi2);
724 int lsq_y_ax_b_xdouble(int n, double x[], real y[], real *a, real *b,
727 return low_lsq_y_ax_b(n, nullptr, x, y, a, b, r, chi2);
730 int lsq_y_ax_b_error(int n, real x[], real y[], real dy[],
731 real *a, real *b, real *da, real *db,
734 gmx_stats_t lsq = gmx_stats_init();
737 for (int i = 0; (i < n); i++)
739 ok = gmx_stats_add_point(lsq, x[i], y[i], 0, dy[i]);
746 ok = gmx_stats_get_ab(lsq, elsqWEIGHT_Y, a, b, da, db, chi2, r);