From 29dda3e15105e2b6b949b318df0173141d349b99 Mon Sep 17 00:00:00 2001 From: rnhmjoj Date: Wed, 8 Apr 2020 12:33:11 +0200 Subject: [PATCH] ex-1: implement a KDE-based estimation of the FWHM --- ex-1/bootstrap.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++ ex-1/bootstrap.h | 18 ++++++++ ex-1/main.c | 19 ++++---- ex-1/tests.c | 43 +++++++++++------- ex-1/tests.h | 12 +++-- 5 files changed, 179 insertions(+), 26 deletions(-) diff --git a/ex-1/bootstrap.c b/ex-1/bootstrap.c index e033941..ca69985 100644 --- a/ex-1/bootstrap.c +++ b/ex-1/bootstrap.c @@ -1,10 +1,12 @@ #include #include +#include #include #include #include #include "bootstrap.h" +#include "tests.h" /* Function that compares doubles for sorting: @@ -101,6 +103,42 @@ double hsm(double *x, size_t n) { } +/* Parameters needed to compute the + * KDE function of a sample. + */ +struct kde_params { + double n; // sample size + double var; // sample variance + double *sample; // sample +}; + + +/* A KDE (kernel density estimation) is an + * approximation of a sample PDF obtained by + * convolving the sample points with a smooth + * kernel, in this case a standard gaussian. + * + * `gauss_kde(x, p)` gives the value of the PDF + * at `x`, where `p` is a `kde_params` struct. + */ +double gauss_kde(double x, void * params) { + struct kde_params p = *((struct kde_params*) params); + + /* Apply the Silverman's rule of thumb to the + * bandwidth estimation. The badwidth is given + * by the sample variance times a factor which + * depends on the number of points and dimension. + */ + double bw = p.var * pow((double)p.n*3.0/4, -2.0/5); + + double sum = 0; + for (size_t i = 0; i < p.n; i++) + sum += exp(-pow(x - p.sample[i], 2) / (2*bw)); + + return sum / sqrt(2*M_PI*bw) / p.n; +} + + /* Computes an approximation to the asymptotic median * and its standard deviation by bootstrapping (ie * repeated resampling) the original `sample`, `boots` @@ -188,3 +226,78 @@ uncert bootstrap_mode( return mode; } + + +/* Computes an approximation to the asymptotic fwhm + * and its standard deviation by bootstrapping (ie + * repeated resampling) the original `sample`, `boots` + * times. + * + * `min,max` are the bounds of interval that is + * expected to contain the mode of the sample. + * + * The functions returns an `uncert` pair of mean and + * stddev of the fwhms computed on each sample. + */ +uncert bootstrap_fwhm( + const gsl_rng *r, + double min, double max, + double *sample, size_t n, + int boots) { + /* We use a running statistics to compute + * a trimmed mean/variance of the sample. + */ + gsl_rstat_workspace* w = gsl_rstat_alloc(); + + // fwhm values and resample + double *values = calloc(boots, sizeof(double)); + double *boot = calloc(n, sizeof(double)); + + // set the size here because it's fixed + struct kde_params p; + p.n = n; + + // struct for numeric_fwhm + gsl_function pdf; + pdf.function = &gauss_kde; + + for (size_t i = 0; i < boots; i++) { + gsl_rstat_reset(w); + + /* The sampling is simply done by generating + * an array index uniformely in [0, n-1]. + */ + for (size_t j = 0; j < n; j++) { + double x = sample[gsl_rng_uniform_int(r, n)]; + boot[j] = x; + /* Remove too extreme values from the + * the trimmed statistics workspace. + */ + if (fabs(x) < 10) gsl_rstat_add(x, w); + } + + /* Set the trimmed variance of the sample as + * the (uncorrected) variance of the gaussian + * kernel estimator. + */ + p.sample = boot; + p.var = gsl_rstat_variance(w); + pdf.params = &p; + + values[i] = numeric_fwhm(min, max, &pdf, 0); + } + + /* Compute mean and stddev of the fwhms + * of each newly bootstrapped sample. + */ + uncert fwhm; + fwhm.n = gsl_stats_mean(values, 1, boots); + fwhm.s = gsl_stats_sd(values, 1, boots); + + // free memory + free(values); + free(boot); + gsl_rstat_free(w); + + return fwhm; +} diff --git a/ex-1/bootstrap.h b/ex-1/bootstrap.h index 47fd9d8..7248ccd 100644 --- a/ex-1/bootstrap.h +++ b/ex-1/bootstrap.h @@ -45,3 +45,21 @@ uncert bootstrap_mode( const gsl_rng *r, double *sample, size_t n, int boots); + + +/* Computes an approximation to the asymptotic fwhm + * and its standard deviation by bootstrapping (ie + * repeated resampling) the original `sample`, `boots` + * times. + * + * `min,max` are the bounds of interval that is + * expected to contain the mode of the sample. + * + * The functions returns an `uncert` pair of mean and + * stddev of the fwhms computed on each sample. + */ +uncert bootstrap_fwhm( + const gsl_rng *r, + double min, double max, + double *sample, size_t n, + int boots); diff --git a/ex-1/main.c b/ex-1/main.c index f73f1d5..0ef359d 100644 --- a/ex-1/main.c +++ b/ex-1/main.c @@ -24,8 +24,8 @@ int main(int argc, char** argv) { size_t samples = 100000; double* sample = calloc(samples, sizeof(double)); size_t bins = 40; - double min = -20; - double max = 20; + double min = -10; + double max = 10; gsl_histogram* hist = gsl_histogram_alloc(bins); gsl_histogram_set_ranges_uniform(hist, min, max); @@ -86,7 +86,7 @@ int main(int argc, char** argv) { pdf.function = &landau_pdf; pdf.params = NULL; - double mode_e = numeric_mode(min, max, &pdf); + double mode_e = numeric_mode(min, max, &pdf, 1); uncert mode_o = bootstrap_mode(r, sample, samples, 100); // print the results @@ -94,6 +94,7 @@ int main(int argc, char** argv) { fprintf(stderr, "expected mode: %.7f\n", mode_e); fprintf(stderr, "observed mode: %.4f±%.4f\n", mode_o.n, mode_o.s); + // t-test double t = fabs(mode_e - mode_o.n)/mode_o.s; double p = 1 - erf(t/sqrt(2)); fprintf(stderr, "\n## t-test\n"); @@ -112,16 +113,17 @@ int main(int argc, char** argv) { */ fprintf(stderr, "\n\n# FWHM comparison\n"); - double fwhm_e = numeric_fwhm(min, max, &pdf); - //uncert fwhm_o = bootstrap_fwhm(r, sample, samples, 100); + double fwhm_e = numeric_fwhm(min, max, &pdf, 1); + uncert fwhm_o = bootstrap_fwhm(r, min, max, sample, samples, 100); // print the results fprintf(stderr, "\n## Results\n"); fprintf(stderr, "expected fwhm: %.7f\n", fwhm_e); - //fprintf(stderr, "observed fwhm: %.4f±%.4f\n", fwhm_o.n, fwhm_o.s); + fprintf(stderr, "observed fwhm: %.4f±%.4f\n", fwhm_o.n, fwhm_o.s); - //t = fabs(fwhm_e - fwhm_o.n)/fwhm_o.s; - //p = 1 - erf(t/sqrt(2)); + // t-test + t = fabs(fwhm_e - fwhm_o.n)/fwhm_o.s; + p = 1 - erf(t/sqrt(2)); fprintf(stderr, "\n## t-test\n"); fprintf(stderr, "t=%.3f\n", t); fprintf(stderr, "p=%.3f\n", p); @@ -141,6 +143,7 @@ int main(int argc, char** argv) { fprintf(stderr, "expected median: %.7f\n", med_e); fprintf(stderr, "observed median: %.4f±%.4f\n", med_o.n, med_o.s); + // t-test t = fabs(med_e - med_o.n)/med_o.s; p = 1 - erf(t/sqrt(2)); fprintf(stderr, "\n## t-test\n"); diff --git a/ex-1/tests.c b/ex-1/tests.c index 95d2035..9d54ba7 100644 --- a/ex-1/tests.c +++ b/ex-1/tests.c @@ -74,11 +74,14 @@ double negate_func(double x, void * fp) { /* Numerically computes the mode of a Landau * distribution by maximising the derivative. - * The min,max parameters are the initial search + * The `min,max` parameters are the initial search * interval for the optimisation. + * + * If `err` is true print the estimate error. */ double numeric_mode(double min, double max, - gsl_function *pdf) { + gsl_function *pdf, + int err) { /* Negate the PDF to maximise it by * using a GSL minimisation method. @@ -89,22 +92,22 @@ double numeric_mode(double min, double max, npdf.params = pdf; // initialize minimization - double guess = 0; + double x = 0; int iter = 0; int max_iter = 100; double prec = 1e-7; int status; - const gsl_min_fminimizer_type * T = gsl_min_fminimizer_brent; - gsl_min_fminimizer * s = gsl_min_fminimizer_alloc(T); - gsl_min_fminimizer_set(s, &npdf, guess, min, max); + const gsl_min_fminimizer_type *T = gsl_min_fminimizer_brent; + gsl_min_fminimizer *s = gsl_min_fminimizer_alloc(T); + gsl_min_fminimizer_set(s, &npdf, x, min, max); // minimisation do { iter++; status = gsl_min_fminimizer_iterate(s); - guess = gsl_min_fminimizer_x_minimum(s); - min = gsl_min_fminimizer_x_lower(s); - max = gsl_min_fminimizer_x_upper(s); + x = gsl_min_fminimizer_x_minimum(s); + min = gsl_min_fminimizer_x_lower(s); + max = gsl_min_fminimizer_x_upper(s); status = gsl_min_test_interval(min, max, prec, prec); } while (status == GSL_CONTINUE && iter < max_iter); @@ -112,11 +115,12 @@ double numeric_mode(double min, double max, /* The error is simply given by the width of * the final interval containing the solution */ - fprintf(stderr, "mode error: %.3g\n", max - min); + if (err) + fprintf(stderr, "mode error: %.3g\n", max - min); // free memory gsl_min_fminimizer_free(s); - return guess; + return x; } @@ -154,13 +158,16 @@ double fwhm_equation(double x, void* params) { * `fwhm_equation`. Two searches are performed in * [min, mode] and [mode, max] that will give the * two solutions x₋, x₊. The FWHM is then x₊-x₋. + * + * If `err` is true print the estimate error. */ double numeric_fwhm(double min, double max, - gsl_function *pdf) { + gsl_function *pdf, + int err) { /* Create the gls_function structure * for the equation to be solved. */ - double mode = numeric_mode(min, max, pdf); + double mode = numeric_mode(min, max, pdf, 0); struct fwhm_params p = { pdf->function(mode, pdf->params)/2, pdf @@ -169,8 +176,8 @@ double numeric_fwhm(double min, double max, equation.function = &fwhm_equation; equation.params = &p; - const gsl_root_fsolver_type * T = gsl_root_fsolver_brent; - gsl_root_fsolver * s = gsl_root_fsolver_alloc(T); + const gsl_root_fsolver_type *T = gsl_root_fsolver_brent; + gsl_root_fsolver *s = gsl_root_fsolver_alloc(T); // initialize minimization for x₋ double x, fmin, fmax; @@ -189,6 +196,7 @@ double numeric_fwhm(double min, double max, status = gsl_min_test_interval(fmin, fmax, 0, prec); } double x_low = x; + double err_low = fmax - fmin; // initialize minimization for x₊ gsl_root_fsolver_set(s, &equation, mode, max); @@ -203,6 +211,11 @@ double numeric_fwhm(double min, double max, status = gsl_min_test_interval(fmin, fmax, 0, prec); } double x_upp = x; + double err_upp = fmax - fmin; + + if (err) + fprintf(stderr, "fhwm error: %g\n", + sqrt(err_low*err_low + err_upp*err_upp)); // free memory gsl_root_fsolver_free(s); diff --git a/ex-1/tests.h b/ex-1/tests.h index 9c41cbc..6320040 100644 --- a/ex-1/tests.h +++ b/ex-1/tests.h @@ -15,11 +15,14 @@ double kolmogorov_cdf(double D, int n); /* Numerically computes the mode of a Landau * distribution by maximising the derivative. - * The min,max parameters are the initial search + * The `min,max` parameters are the initial search * interval for the optimisation. + * + * If `err` is true print the estimate error. */ double numeric_mode(double min, double max, - gsl_function *pdf); + gsl_function *pdf, + int err); /* Numerically computes the FWHM of a PDF @@ -29,6 +32,9 @@ double numeric_mode(double min, double max, * `fwhm_equation`. Two searches are performed in * [min, mode] and [mode, max] that will give the * two solutions x₋, x₊. The FWHM is then x₊-x₋. + * + * If `err` is true print the estimate error. */ double numeric_fwhm(double min, double max, - gsl_function *pdf); + gsl_function *pdf, + int err);