ex-1: implement a KDE-based estimation of the FWHM

This commit is contained in:
Michele Guerini Rocco 2020-04-08 12:33:11 +02:00
parent 6ad77013b6
commit 29dda3e151
5 changed files with 179 additions and 26 deletions

View File

@ -1,10 +1,12 @@
#include <stdlib.h> #include <stdlib.h>
#include <math.h> #include <math.h>
#include <gsl/gsl_roots.h>
#include <gsl/gsl_rstat.h> #include <gsl/gsl_rstat.h>
#include <gsl/gsl_vector.h> #include <gsl/gsl_vector.h>
#include <gsl/gsl_statistics_double.h> #include <gsl/gsl_statistics_double.h>
#include "bootstrap.h" #include "bootstrap.h"
#include "tests.h"
/* Function that compares doubles for sorting: /* Function that compares doubles for sorting:
@ -101,6 +103,42 @@ double hsm(double *x, size_t n) {
} }
/* Parameters needed to compute the
* KDE function of a sample.
*/
struct kde_params {
double n; // sample size
double var; // sample variance
double *sample; // sample
};
/* A KDE (kernel density estimation) is an
* approximation of a sample PDF obtained by
* convolving the sample points with a smooth
* kernel, in this case a standard gaussian.
*
* `gauss_kde(x, p)` gives the value of the PDF
* at `x`, where `p` is a `kde_params` struct.
*/
double gauss_kde(double x, void * params) {
struct kde_params p = *((struct kde_params*) params);
/* Apply the Silverman's rule of thumb to the
* bandwidth estimation. The badwidth is given
* by the sample variance times a factor which
* depends on the number of points and dimension.
*/
double bw = p.var * pow((double)p.n*3.0/4, -2.0/5);
double sum = 0;
for (size_t i = 0; i < p.n; i++)
sum += exp(-pow(x - p.sample[i], 2) / (2*bw));
return sum / sqrt(2*M_PI*bw) / p.n;
}
/* Computes an approximation to the asymptotic median /* Computes an approximation to the asymptotic median
* and its standard deviation by bootstrapping (ie * and its standard deviation by bootstrapping (ie
* repeated resampling) the original `sample`, `boots` * repeated resampling) the original `sample`, `boots`
@ -188,3 +226,78 @@ uncert bootstrap_mode(
return mode; return mode;
} }
/* Computes an approximation to the asymptotic fwhm
* and its standard deviation by bootstrapping (ie
* repeated resampling) the original `sample`, `boots`
* times.
*
* `min,max` are the bounds of interval that is
* expected to contain the mode of the sample.
*
* The functions returns an `uncert` pair of mean and
* stddev of the fwhms computed on each sample.
*/
uncert bootstrap_fwhm(
const gsl_rng *r,
double min, double max,
double *sample, size_t n,
int boots) {
/* We use a running statistics to compute
* a trimmed mean/variance of the sample.
*/
gsl_rstat_workspace* w = gsl_rstat_alloc();
// fwhm values and resample
double *values = calloc(boots, sizeof(double));
double *boot = calloc(n, sizeof(double));
// set the size here because it's fixed
struct kde_params p;
p.n = n;
// struct for numeric_fwhm
gsl_function pdf;
pdf.function = &gauss_kde;
for (size_t i = 0; i < boots; i++) {
gsl_rstat_reset(w);
/* The sampling is simply done by generating
* an array index uniformely in [0, n-1].
*/
for (size_t j = 0; j < n; j++) {
double x = sample[gsl_rng_uniform_int(r, n)];
boot[j] = x;
/* Remove too extreme values from the
* the trimmed statistics workspace.
*/
if (fabs(x) < 10) gsl_rstat_add(x, w);
}
/* Set the trimmed variance of the sample as
* the (uncorrected) variance of the gaussian
* kernel estimator.
*/
p.sample = boot;
p.var = gsl_rstat_variance(w);
pdf.params = &p;
values[i] = numeric_fwhm(min, max, &pdf, 0);
}
/* Compute mean and stddev of the fwhms
* of each newly bootstrapped sample.
*/
uncert fwhm;
fwhm.n = gsl_stats_mean(values, 1, boots);
fwhm.s = gsl_stats_sd(values, 1, boots);
// free memory
free(values);
free(boot);
gsl_rstat_free(w);
return fwhm;
}

View File

@ -45,3 +45,21 @@ uncert bootstrap_mode(
const gsl_rng *r, const gsl_rng *r,
double *sample, size_t n, double *sample, size_t n,
int boots); int boots);
/* Computes an approximation to the asymptotic fwhm
* and its standard deviation by bootstrapping (ie
* repeated resampling) the original `sample`, `boots`
* times.
*
* `min,max` are the bounds of interval that is
* expected to contain the mode of the sample.
*
* The functions returns an `uncert` pair of mean and
* stddev of the fwhms computed on each sample.
*/
uncert bootstrap_fwhm(
const gsl_rng *r,
double min, double max,
double *sample, size_t n,
int boots);

View File

@ -24,8 +24,8 @@ int main(int argc, char** argv) {
size_t samples = 100000; size_t samples = 100000;
double* sample = calloc(samples, sizeof(double)); double* sample = calloc(samples, sizeof(double));
size_t bins = 40; size_t bins = 40;
double min = -20; double min = -10;
double max = 20; double max = 10;
gsl_histogram* hist = gsl_histogram_alloc(bins); gsl_histogram* hist = gsl_histogram_alloc(bins);
gsl_histogram_set_ranges_uniform(hist, min, max); gsl_histogram_set_ranges_uniform(hist, min, max);
@ -86,7 +86,7 @@ int main(int argc, char** argv) {
pdf.function = &landau_pdf; pdf.function = &landau_pdf;
pdf.params = NULL; pdf.params = NULL;
double mode_e = numeric_mode(min, max, &pdf); double mode_e = numeric_mode(min, max, &pdf, 1);
uncert mode_o = bootstrap_mode(r, sample, samples, 100); uncert mode_o = bootstrap_mode(r, sample, samples, 100);
// print the results // print the results
@ -94,6 +94,7 @@ int main(int argc, char** argv) {
fprintf(stderr, "expected mode: %.7f\n", mode_e); fprintf(stderr, "expected mode: %.7f\n", mode_e);
fprintf(stderr, "observed mode: %.4f±%.4f\n", mode_o.n, mode_o.s); fprintf(stderr, "observed mode: %.4f±%.4f\n", mode_o.n, mode_o.s);
// t-test
double t = fabs(mode_e - mode_o.n)/mode_o.s; double t = fabs(mode_e - mode_o.n)/mode_o.s;
double p = 1 - erf(t/sqrt(2)); double p = 1 - erf(t/sqrt(2));
fprintf(stderr, "\n## t-test\n"); fprintf(stderr, "\n## t-test\n");
@ -112,16 +113,17 @@ int main(int argc, char** argv) {
*/ */
fprintf(stderr, "\n\n# FWHM comparison\n"); fprintf(stderr, "\n\n# FWHM comparison\n");
double fwhm_e = numeric_fwhm(min, max, &pdf); double fwhm_e = numeric_fwhm(min, max, &pdf, 1);
//uncert fwhm_o = bootstrap_fwhm(r, sample, samples, 100); uncert fwhm_o = bootstrap_fwhm(r, min, max, sample, samples, 100);
// print the results // print the results
fprintf(stderr, "\n## Results\n"); fprintf(stderr, "\n## Results\n");
fprintf(stderr, "expected fwhm: %.7f\n", fwhm_e); fprintf(stderr, "expected fwhm: %.7f\n", fwhm_e);
//fprintf(stderr, "observed fwhm: %.4f±%.4f\n", fwhm_o.n, fwhm_o.s); fprintf(stderr, "observed fwhm: %.4f±%.4f\n", fwhm_o.n, fwhm_o.s);
//t = fabs(fwhm_e - fwhm_o.n)/fwhm_o.s; // t-test
//p = 1 - erf(t/sqrt(2)); t = fabs(fwhm_e - fwhm_o.n)/fwhm_o.s;
p = 1 - erf(t/sqrt(2));
fprintf(stderr, "\n## t-test\n"); fprintf(stderr, "\n## t-test\n");
fprintf(stderr, "t=%.3f\n", t); fprintf(stderr, "t=%.3f\n", t);
fprintf(stderr, "p=%.3f\n", p); fprintf(stderr, "p=%.3f\n", p);
@ -141,6 +143,7 @@ int main(int argc, char** argv) {
fprintf(stderr, "expected median: %.7f\n", med_e); fprintf(stderr, "expected median: %.7f\n", med_e);
fprintf(stderr, "observed median: %.4f±%.4f\n", med_o.n, med_o.s); fprintf(stderr, "observed median: %.4f±%.4f\n", med_o.n, med_o.s);
// t-test
t = fabs(med_e - med_o.n)/med_o.s; t = fabs(med_e - med_o.n)/med_o.s;
p = 1 - erf(t/sqrt(2)); p = 1 - erf(t/sqrt(2));
fprintf(stderr, "\n## t-test\n"); fprintf(stderr, "\n## t-test\n");

View File

@ -74,11 +74,14 @@ double negate_func(double x, void * fp) {
/* Numerically computes the mode of a Landau /* Numerically computes the mode of a Landau
* distribution by maximising the derivative. * distribution by maximising the derivative.
* The min,max parameters are the initial search * The `min,max` parameters are the initial search
* interval for the optimisation. * interval for the optimisation.
*
* If `err` is true print the estimate error.
*/ */
double numeric_mode(double min, double max, double numeric_mode(double min, double max,
gsl_function *pdf) { gsl_function *pdf,
int err) {
/* Negate the PDF to maximise it by /* Negate the PDF to maximise it by
* using a GSL minimisation method. * using a GSL minimisation method.
@ -89,22 +92,22 @@ double numeric_mode(double min, double max,
npdf.params = pdf; npdf.params = pdf;
// initialize minimization // initialize minimization
double guess = 0; double x = 0;
int iter = 0; int iter = 0;
int max_iter = 100; int max_iter = 100;
double prec = 1e-7; double prec = 1e-7;
int status; int status;
const gsl_min_fminimizer_type * T = gsl_min_fminimizer_brent; const gsl_min_fminimizer_type *T = gsl_min_fminimizer_brent;
gsl_min_fminimizer * s = gsl_min_fminimizer_alloc(T); gsl_min_fminimizer *s = gsl_min_fminimizer_alloc(T);
gsl_min_fminimizer_set(s, &npdf, guess, min, max); gsl_min_fminimizer_set(s, &npdf, x, min, max);
// minimisation // minimisation
do { do {
iter++; iter++;
status = gsl_min_fminimizer_iterate(s); status = gsl_min_fminimizer_iterate(s);
guess = gsl_min_fminimizer_x_minimum(s); x = gsl_min_fminimizer_x_minimum(s);
min = gsl_min_fminimizer_x_lower(s); min = gsl_min_fminimizer_x_lower(s);
max = gsl_min_fminimizer_x_upper(s); max = gsl_min_fminimizer_x_upper(s);
status = gsl_min_test_interval(min, max, prec, prec); status = gsl_min_test_interval(min, max, prec, prec);
} while (status == GSL_CONTINUE && iter < max_iter); } while (status == GSL_CONTINUE && iter < max_iter);
@ -112,11 +115,12 @@ double numeric_mode(double min, double max,
/* The error is simply given by the width of /* The error is simply given by the width of
* the final interval containing the solution * the final interval containing the solution
*/ */
fprintf(stderr, "mode error: %.3g\n", max - min); if (err)
fprintf(stderr, "mode error: %.3g\n", max - min);
// free memory // free memory
gsl_min_fminimizer_free(s); gsl_min_fminimizer_free(s);
return guess; return x;
} }
@ -154,13 +158,16 @@ double fwhm_equation(double x, void* params) {
* `fwhm_equation`. Two searches are performed in * `fwhm_equation`. Two searches are performed in
* [min, mode] and [mode, max] that will give the * [min, mode] and [mode, max] that will give the
* two solutions x, x. The FWHM is then x-x. * two solutions x, x. The FWHM is then x-x.
*
* If `err` is true print the estimate error.
*/ */
double numeric_fwhm(double min, double max, double numeric_fwhm(double min, double max,
gsl_function *pdf) { gsl_function *pdf,
int err) {
/* Create the gls_function structure /* Create the gls_function structure
* for the equation to be solved. * for the equation to be solved.
*/ */
double mode = numeric_mode(min, max, pdf); double mode = numeric_mode(min, max, pdf, 0);
struct fwhm_params p = { struct fwhm_params p = {
pdf->function(mode, pdf->params)/2, pdf->function(mode, pdf->params)/2,
pdf pdf
@ -169,8 +176,8 @@ double numeric_fwhm(double min, double max,
equation.function = &fwhm_equation; equation.function = &fwhm_equation;
equation.params = &p; equation.params = &p;
const gsl_root_fsolver_type * T = gsl_root_fsolver_brent; const gsl_root_fsolver_type *T = gsl_root_fsolver_brent;
gsl_root_fsolver * s = gsl_root_fsolver_alloc(T); gsl_root_fsolver *s = gsl_root_fsolver_alloc(T);
// initialize minimization for x₋ // initialize minimization for x₋
double x, fmin, fmax; double x, fmin, fmax;
@ -189,6 +196,7 @@ double numeric_fwhm(double min, double max,
status = gsl_min_test_interval(fmin, fmax, 0, prec); status = gsl_min_test_interval(fmin, fmax, 0, prec);
} }
double x_low = x; double x_low = x;
double err_low = fmax - fmin;
// initialize minimization for x₊ // initialize minimization for x₊
gsl_root_fsolver_set(s, &equation, mode, max); gsl_root_fsolver_set(s, &equation, mode, max);
@ -203,6 +211,11 @@ double numeric_fwhm(double min, double max,
status = gsl_min_test_interval(fmin, fmax, 0, prec); status = gsl_min_test_interval(fmin, fmax, 0, prec);
} }
double x_upp = x; double x_upp = x;
double err_upp = fmax - fmin;
if (err)
fprintf(stderr, "fhwm error: %g\n",
sqrt(err_low*err_low + err_upp*err_upp));
// free memory // free memory
gsl_root_fsolver_free(s); gsl_root_fsolver_free(s);

View File

@ -15,11 +15,14 @@ double kolmogorov_cdf(double D, int n);
/* Numerically computes the mode of a Landau /* Numerically computes the mode of a Landau
* distribution by maximising the derivative. * distribution by maximising the derivative.
* The min,max parameters are the initial search * The `min,max` parameters are the initial search
* interval for the optimisation. * interval for the optimisation.
*
* If `err` is true print the estimate error.
*/ */
double numeric_mode(double min, double max, double numeric_mode(double min, double max,
gsl_function *pdf); gsl_function *pdf,
int err);
/* Numerically computes the FWHM of a PDF /* Numerically computes the FWHM of a PDF
@ -29,6 +32,9 @@ double numeric_mode(double min, double max,
* `fwhm_equation`. Two searches are performed in * `fwhm_equation`. Two searches are performed in
* [min, mode] and [mode, max] that will give the * [min, mode] and [mode, max] that will give the
* two solutions x, x. The FWHM is then x-x. * two solutions x, x. The FWHM is then x-x.
*
* If `err` is true print the estimate error.
*/ */
double numeric_fwhm(double min, double max, double numeric_fwhm(double min, double max,
gsl_function *pdf); gsl_function *pdf,
int err);