analistica/ex-1/bootstrap.c
#include <stdlib.h>
#include <math.h>
#include <gsl/gsl_roots.h>
#include <gsl/gsl_rstat.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_statistics_double.h>
#include "bootstrap.h"
#include "tests.h"
/* Function that compares doubles for sorting:
* x > y ⇒ 1
* x == y ⇒ 0
* x < y ⇒ -1
*/
int cmp_double (const void *xp, const void *yp) {
double x = *(double*)xp,
y = *(double*)yp;
return x > y ? 1 : (x == y ? 0 : -1);
}
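/* Usage sketch: this comparator is meant to be passed to qsort(3),
* as done further below, e.g.
*
*   qsort(sample, n, sizeof(double), cmp_double);
*
* which sorts a sample of `n` doubles in ascending order, as required
* by hsm() and hill_estimator().
*/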
/* Returns the (rounded) mean index of all
* components of `v` that are equal to `x`.
* This function is used to handle duplicate
* data (called "ties") in `hsm()`.
*/
size_t mean_index(gsl_vector *v, double x) {
gsl_rstat_workspace *w = gsl_rstat_alloc();
for (size_t i = 0; i < v->size; i++) {
if (gsl_vector_get(v, i) == x)
gsl_rstat_add((double)i, w);
}
double mean = gsl_rstat_mean(w);
gsl_rstat_free(w);
return round(mean);
}
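/* Example (illustrative): for v = (5, 1, 1, 3) and x = 1 the matching
* indices are 1 and 2, so mean_index() returns round(1.5) = 2.
*/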
/* Computes the half-sample mode (also called the Robertson-Cryer
* mode estimator) of the sample `x` containing `n` observations.
* Important: the sample is assumed to be sorted.
*
* It is based on repeatedly finding the half modal interval (smallest
* interval containing half of the observations) of the sample.
* This implementation is based on the `hsm()` function from the
* modeest[1] R package.
*
* [1]: https://rdrr.io/cran/modeest/man/hsm.html
*/
double hsm(double *x, size_t n) {
size_t i, k;
gsl_vector *diffs_full = gsl_vector_calloc(n-n/2);
/* Divide the sample into two halves and compute
* the paired differences between the upper and
* lower halves. The index of the min diff. gives
* the start of the modal interval. Repeat on the
* new interval until three or fewer points are left.
*/
while (n > 3) {
k = n/2;
// lower/upper halves of x
gsl_vector upper = gsl_vector_view_array(x+k, n-k).vector;
gsl_vector lower = gsl_vector_view_array(x, n-k).vector;
// restrict diffs_full to length n-k
gsl_vector diffs = gsl_vector_subvector(diffs_full, 0, n-k).vector;
// compute the difference upper-lower
gsl_vector_memcpy(&diffs, &upper);
gsl_vector_sub(&diffs, &lower);
// find minimum while handling ties
i = mean_index(&diffs, gsl_vector_min(&diffs));
/* If the minimum difference is 0 we found
* the hsm, so we set n=1 to break the loop.
*/
x += i;
n = (gsl_vector_get(&diffs, i) == 0) ? 1 : k;
}
// free memory
gsl_vector_free(diffs_full);
/* If the sample has three points, the hsm
* is the average of the two closest ones.
*/
if (n == 3) {
if (2*x[1] - x[0] - x[2] > 0)
return gsl_stats_mean(x+1, 1, 2);
return gsl_stats_mean(x, 1, 2);
}
/* Otherwise (fewer than three points) the hsm
* is just the mean of the points.
*/
return gsl_stats_mean(x, 1, n);
}
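/* Usage sketch (hypothetical data): hsm() requires a sorted sample,
* so a typical call is
*
*   double sample[] = {0.1, 0.2, 0.25, 0.3, 1.5};
*   qsort(sample, 5, sizeof(double), cmp_double);
*   double mode = hsm(sample, 5);
*
* which repeatedly halves the modal interval until at most three
* points remain.
*/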
/* Computes the Hill estimator γ_k of the right
* tail-index of a distribution from a sample `x`
* of `n` observations by using the last k outliers.
*
* k is computed from t as k = n^`t`, so it is
* expected that t ∈ (0,1).
*
* Important: the sample is assumed to be sorted.
*/
double hill_estimator(double *x, double t, size_t n) {
double k = pow(n, t);
double sum = 0;
for (size_t i = n - k; i < n; i++) {
sum += log(x[i]) - log(x[n - (size_t)round(k)]);
}
return k / sum;
}
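/* In formula terms (a sketch of what the loop above computes), with
* order statistics x_(1) ≤ … ≤ x_(n) and k ≈ n^t:
*
*   γ_k ≈ k / Σ_{i=n-k}^{n-1} [log x_(i) - log x_(n-k)]
*
* i.e. the reciprocal of the mean log-excess over the threshold x_(n-k).
*/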
/* Parameters needed to compute the
* KDE function of a sample.
*/
struct kde_params {
double n; // sample size
double var; // sample variance
double *sample; // sample
};
/* A KDE (kernel density estimation) is an
* approximation of a sample PDF obtained by
* convolving the sample points with a smooth
* kernel, in this case a standard gaussian.
*
* `gauss_kde(x, p)` gives the value of the PDF
* at `x`, where `p` is a `kde_params` struct.
*/
double gauss_kde(double x, void * params) {
struct kde_params p = *((struct kde_params*) params);
/* Apply Silverman's rule of thumb for the
* bandwidth estimation. The bandwidth is given
* by the sample variance times a factor which
* depends on the number of points and dimension.
*/
double bw = 0.777 * p.var * pow((double)p.n*3.0/4, -2.0/5);
double sum = 0;
for (size_t i = 0; i < p.n; i++)
sum += exp(-pow(x - p.sample[i], 2) / (2*bw));
return sum / sqrt(2*M_PI*bw) / p.n;
}
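/* Usage sketch (mirroring bootstrap_fwhm() below, with hypothetical
* `data`, `n` and `var` for the sample, its size and a variance
* estimate):
*
*   struct kde_params p = { .n = n, .var = var, .sample = data };
*   gsl_function pdf = { .function = &gauss_kde, .params = &p };
*   double y = gauss_kde(0.0, &p);   // estimated PDF at x = 0
*/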
/* Computes an approximation to the asymptotic median
* and its standard deviation by bootstrapping (i.e.
* repeated resampling) the original `sample`, `boots`
* times.
*
* The function returns an `uncert` pair of mean and
* stddev of the medians computed on each sample.
*/
uncert bootstrap_median(
const gsl_rng *r,
double *sample, size_t n,
size_t boots) {
double *values = calloc(boots, sizeof(double));
double *resample = calloc(n, sizeof(double));
for (size_t i = 0; i < boots; i++) {
/* The sampling is simply done by generating
* an array index uniformly in [0, n-1].
*/
for (size_t j = 0; j < n; j++) {
size_t choice = gsl_rng_uniform_int(r, n);
resample[j] = sample[choice];
}
values[i] = gsl_stats_median(resample, 1, n);
}
/* Compute mean and stdev of the medians
* of each newly bootstrapped sample.
*/
uncert median;
median.n = gsl_stats_mean(values, 1, boots);
median.s = gsl_stats_sd(values, 1, boots);
// free memory
free(values);
free(resample);
return median;
}
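/* Usage sketch (hypothetical setup): with a GSL RNG and a sample of
* `n` doubles,
*
*   gsl_rng_env_setup();
*   gsl_rng *r = gsl_rng_alloc(gsl_rng_default);
*   uncert med = bootstrap_median(r, sample, n, 10000);
*
* gives med.n and med.s as the bootstrap estimates of the median and
* its standard deviation.
*/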
/* Computes an approximation to the asymptotic mode
* and its standard deviation by bootstrapping (i.e.
* repeated resampling) the original `sample`, `boots`
* times.
*
* The function returns an `uncert` pair of mean and
* stddev of the modes computed on each sample.
*/
uncert bootstrap_mode(
const gsl_rng *r,
double *sample, size_t n,
size_t boots) {
double *values = calloc(boots, sizeof(double));
double *boot = calloc(n, sizeof(double));
for (size_t i = 0; i < boots; i++) {
/* The sampling is simply done by generating
* an array index uniformly in [0, n-1].
*/
for (size_t j = 0; j < n; j++) {
size_t choice = gsl_rng_uniform_int(r, n);
boot[j] = sample[choice];
}
qsort(boot, n, sizeof(double), cmp_double);
values[i] = hsm(boot, n);
}
/* Compute mean and stddev of the modes
* of each newly bootstrapped sample.
*/
uncert mode;
mode.n = gsl_stats_mean(values, 1, boots);
mode.s = gsl_stats_sd(values, 1, boots);
// free memory
free(values);
free(boot);
return mode;
}
/* Computes an approximation to the asymptotic fwhm
* and its standard deviation by bootstrapping (i.e.
* repeated resampling) the original `sample`, `boots`
* times.
*
* `min,max` are the bounds of the interval that is
* expected to contain the mode of the sample.
*
* The function returns an `uncert` pair of mean and
* stddev of the fwhms computed on each sample.
*/
uncert bootstrap_fwhm(
const gsl_rng *r,
double min, double max,
double *sample, size_t n,
size_t boots) {
/* We use a running-statistics workspace to
* compute a trimmed mean/variance of the sample.
*/
gsl_rstat_workspace* w = gsl_rstat_alloc();
// fwhm values and resample
double *values = calloc(boots, sizeof(double));
double *boot = calloc(n, sizeof(double));
// set the size here because it's fixed
struct kde_params p;
p.n = n;
// struct for numeric_fwhm
gsl_function pdf;
pdf.function = &gauss_kde;
for (size_t i = 0; i < boots; i++) {
gsl_rstat_reset(w);
/* The sampling is simply done by generating
* an array index uniformly in [0, n-1].
*/
for (size_t j = 0; j < n; j++) {
double x = sample[gsl_rng_uniform_int(r, n)];
boot[j] = x;
/* Exclude overly extreme values from
* the trimmed statistics workspace.
*/
if (fabs(x) < 10) gsl_rstat_add(x, w);
}
/* Set the trimmed variance of the sample as
* the (uncorrected) variance of the gaussian
* kernel estimator.
*/
p.sample = boot;
p.var = gsl_rstat_variance(w);
pdf.params = &p;
values[i] = numeric_fwhm(min, max, &pdf, 0);
}
/* Compute mean and stddev of the fwhms
* of each newly bootstrapped sample.
*/
uncert fwhm;
fwhm.n = gsl_stats_mean(values, 1, boots);
fwhm.s = gsl_stats_sd(values, 1, boots);
// free memory
free(values);
free(boot);
gsl_rstat_free(w);
return fwhm;
}
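/* Usage sketch (hypothetical bounds, same `r`, `sample`, `n` as in the
* sketch above): `min` and `max` should bracket the mode of the sample,
* e.g.
*
*   uncert fwhm = bootstrap_fwhm(r, -1, 1, sample, n, 10000);
*
* if the mode is expected to lie in (-1, 1).
*/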
/* Computes an approximation to the asymptotic right
* tail-index and its standard deviation by bootstrapping
* (i.e. repeated resampling) the original `sample`, `boots`
* times. The tail-index is estimated by the Hill estimator,
* hence the name of the function. See `hill_estimator()`
* for the meaning of the parameter `t`.
*
* The function returns an `uncert` pair of mean and
* stddev of the tail indices computed on each sample.
*/
uncert bootstrap_hill(
const gsl_rng *r,
double t,
double *sample, size_t n,
size_t boots) {
double *values = calloc(boots, sizeof(double));
double *boot = calloc(n, sizeof(double));
for (size_t i = 0; i < boots; i++) {
/* The sampling is simply done by generating
* an array index uniformly in [0, n-1].
*/
for (size_t j = 0; j < n; j++) {
size_t choice = gsl_rng_uniform_int(r, n);
boot[j] = sample[choice];
}
qsort(boot, n, sizeof(double), cmp_double);
values[i] = hill_estimator(boot, t, n);
}
/* Compute mean and stddev of the tail indices
* of each newly bootstrapped sample.
*/
uncert index;
index.n = gsl_stats_mean(values, 1, boots);
index.s = gsl_stats_sd(values, 1, boots);
// free memory
free(values);
free(boot);
return index;
}
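/* Usage sketch (hypothetical choice of t, same setup as above): with
* t = 0.5 the estimator uses the largest k ≈ √n observations of each
* resample:
*
*   uncert gamma = bootstrap_hill(r, 0.5, sample, n, 10000);
*/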