From 56fcdd5fbbe8efb9f9a16145b1605d35d12763ff Mon Sep 17 00:00:00 2001 From: rnhmjoj Date: Fri, 3 Apr 2020 01:16:47 +0000 Subject: [PATCH] ex-1: implement bootstrapping for the median --- ex-1/bootstrap.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ ex-1/bootstrap.h | 24 ++++++++++++++++++++++++ ex-1/main.c | 20 ++++++++++++-------- makefile | 2 +- 4 files changed, 84 insertions(+), 9 deletions(-) create mode 100644 ex-1/bootstrap.c create mode 100644 ex-1/bootstrap.h diff --git a/ex-1/bootstrap.c b/ex-1/bootstrap.c new file mode 100644 index 0000000..0f10809 --- /dev/null +++ b/ex-1/bootstrap.c @@ -0,0 +1,47 @@ +#include +#include +#include + +#include "bootstrap.h" + + +/* Computes an approximation to the asymptotic median + * and its standard deviation by bootstrapping (ie + * repeated resampling) the original `sample`, `boots` + * times. + * + * The functions returns an `uncert` pair of mean and + * stdev of the medians computed on each sample. + */ +uncert bootstrap_median( + const gsl_rng *r, + double *sample, size_t n, + int boots) { + + /* We use a running statistics to not + * store the full resampled array. + */ + gsl_rstat_workspace* w = gsl_rstat_alloc(); + + double *values = calloc(boots, sizeof(double)); + + for (size_t i = 0; i < boots; i++) { + /* The sampling is simply done by generating + * an array index uniformely in [0, n-1]. + */ + for (size_t j = 0; j < n; j++) { + size_t choice = gsl_rng_uniform_int(r, n); + gsl_rstat_add(sample[choice], w); + } + values[i] = gsl_rstat_median(w); + } + + /* Compute mean and stdev of the medians + * of each newly bootstrapped sample. + */ + uncert median; + median.n = gsl_stats_mean(values, 1, boots); + median.s = gsl_stats_sd(values, 1, boots); + + return median; +} diff --git a/ex-1/bootstrap.h b/ex-1/bootstrap.h new file mode 100644 index 0000000..453bf06 --- /dev/null +++ b/ex-1/bootstrap.h @@ -0,0 +1,24 @@ +#include + + +/* A pair structure that represent + * a value with an uncertainty + */ +typedef struct { + double n; // nominal value + double s; // uncertainty +} uncert; + + +/* Computes an approximation to the asymptotic median + * and its standard deviation by bootstrapping (ie + * repeated resampling) the original `sample`, `boots` + * times. + * + * The functions returns an `uncert` pair of mean and + * stdev of the medians computed on each sample. + */ +uncert bootstrap_median( + const gsl_rng *r, + double *sample, size_t n, + int boots); diff --git a/ex-1/main.c b/ex-1/main.c index 2d9af4d..fe47307 100644 --- a/ex-1/main.c +++ b/ex-1/main.c @@ -7,6 +7,7 @@ #include "landau.h" #include "tests.h" +#include "bootstrap.h" /* Function that compare doubles for sorting: @@ -149,21 +150,24 @@ int main(int argc, char** argv) { /* Median comparison * - * Compute the median of the sample - * and compare it with QDF(1/2). + * Compute the median of the sample by bootstrapping + * it and comparing it with the QDF(1/2). */ fprintf(stderr, "\n\n# Median comparison\n"); double med_e = landau_qdf(0.5); - double med_o = gsl_stats_median_from_sorted_data( - sample, // sorted data - 1, // array stride - samples); // number of elements + uncert med_o = bootstrap_median(r, sample, samples, 100); // print the results fprintf(stderr, "\n## Results\n"); fprintf(stderr, "expected median: %.7f\n", med_e); - fprintf(stderr, "observed median: %.7f\n", med_o); - + fprintf(stderr, "observed median: %.4f±%.4f\n", med_o.n, med_o.s); + + double t = (med_e - med_o.n)/med_o.s; + double p = 1 - erf(t/sqrt(2)); + fprintf(stderr, "\n## t-test\n"); + fprintf(stderr, "t=%.3f\n", t); + fprintf(stderr, "p=%.3f\n", p); + // clean up and exit gsl_histogram_free(hist); diff --git a/makefile b/makefile index 9ce2042..c17acce 100644 --- a/makefile +++ b/makefile @@ -6,7 +6,7 @@ CCOMPILE = \ mkdir -p $(@D); \ $(CC) $(CFLAGS) $^ -o $@ -ex-1/bin/main: ex-1/main.c ex-1/landau.c ex-1/tests.c +ex-1/bin/main: ex-1/main.c ex-1/landau.c ex-1/tests.c ex-1/bootstrap.c $(CCOMPILE) ex-1/bin/pdf: ex-1/pdf.c $(CCOMPILE)