371 lines
9.5 KiB
C
371 lines
9.5 KiB
C
#include <stdlib.h>
|
||
#include <math.h>
|
||
#include <gsl/gsl_roots.h>
|
||
#include <gsl/gsl_rstat.h>
|
||
#include <gsl/gsl_vector.h>
|
||
#include <gsl/gsl_statistics_double.h>
|
||
|
||
#include "bootstrap.h"
|
||
#include "tests.h"
|
||
|
||
|
||
/* Function that compares doubles for sorting:
|
||
* x > y ⇒ 1
|
||
* x == y ⇒ 0
|
||
* x < y ⇒ -1
|
||
*/
|
||
int cmp_double (const void *xp, const void *yp) {
|
||
double x = *(double*)xp,
|
||
y = *(double*)yp;
|
||
return x > y ? 1 : (x == y ? 0 : -1);
|
||
}
|
||
|
||
|
||
/* Returns the (rounded) mean index of all
|
||
* components of `v` that are equal to `x`.
|
||
* This function is used to handle duplicate
|
||
* data (called "ties") in `hsm()`.
|
||
*/
|
||
size_t mean_index(gsl_vector *v, double x) {
|
||
gsl_rstat_workspace *w = gsl_rstat_alloc();
|
||
|
||
for (size_t i = 0; i < v->size; i++) {
|
||
if (gsl_vector_get(v, i) == x)
|
||
gsl_rstat_add((double)i, w);
|
||
}
|
||
int mean = gsl_rstat_mean(w);
|
||
gsl_rstat_free(w);
|
||
|
||
return round(mean);
|
||
}
|
||
|
||
|
||
/* Computes the half-sample mode (also called the Robertson-Cryer
|
||
* mode estimator) of the sample `x` containing `n` observations.
|
||
* Important: the sample is assumed to be sorted.
|
||
*
|
||
* It is based on repeatedly finding the half modal interval (smallest
|
||
* interval containing half of the observations) of the sample.
|
||
* This implementation is based on the `hsm()` function from the
|
||
* modeest[1] R package.
|
||
*
|
||
* [1]: https://rdrr.io/cran/modeest/man/hsm.html
|
||
*/
|
||
double hsm(double *x, size_t n) {
|
||
int i, k;
|
||
gsl_vector *diffs_full = gsl_vector_calloc(n-n/2);
|
||
|
||
/* Divide the sample in two halves and compute
|
||
* the paired differences between the upper and
|
||
* lower halves. The index of the min diff. gives
|
||
* the start of the modal interval. Repeat on the
|
||
* new interval until three or less points are left.
|
||
*/
|
||
while (n > 3) {
|
||
k = n/2;
|
||
|
||
// lower/upper halves of x
|
||
gsl_vector upper = gsl_vector_view_array(x+k, n-k).vector;
|
||
gsl_vector lower = gsl_vector_view_array(x, n-k).vector;
|
||
|
||
// restrict diffs_full to length n-k
|
||
gsl_vector diffs = gsl_vector_subvector(diffs_full, 0, n-k).vector;
|
||
|
||
// compute the difference upper-lower
|
||
gsl_vector_memcpy(&diffs, &upper);
|
||
gsl_vector_sub(&diffs, &lower);
|
||
|
||
// find minimum while handling ties
|
||
i = mean_index(&diffs, gsl_vector_min(&diffs));
|
||
|
||
/* If the minumium difference is 0 we found
|
||
* the hsm so we set n=1 to break the loop.
|
||
*/
|
||
x += i;
|
||
n = (gsl_vector_get(&diffs, i) == 0) ? 1 : k;
|
||
}
|
||
|
||
// free memory
|
||
gsl_vector_free(diffs_full);
|
||
|
||
/* If the sample is has three points the hsm
|
||
* is the average of the two closer ones.
|
||
*/
|
||
if (n == 3) {
|
||
if (2*x[1] - x[0] - x[2] > 0)
|
||
return gsl_stats_mean(x+1, 1, 2);
|
||
return gsl_stats_mean(x, 1, 2);
|
||
}
|
||
|
||
/* Otherwise (smaller than 3) the hsm is just
|
||
* the mean of the points.
|
||
*/
|
||
return gsl_stats_mean(x, 1, n);
|
||
}
|
||
|
||
|
||
/* Computes the Hill estimator γ_k of the right
|
||
* tail-index of a distribution from a sample `x`
|
||
* of `n` observations by using the last k outliers.
|
||
*
|
||
* k is computed from t as k = n^`t`, so it is
|
||
* expected that t ∈ (0,1).
|
||
*
|
||
* Important: the sample is assumed to be sorted.
|
||
*/
|
||
double hill_estimator(double *x, double t, size_t n) {
|
||
double k = pow(n, t);
|
||
double sum = 0;
|
||
for (size_t i = n - k; i < n; i++) {
|
||
sum += log(x[i]) - log(x[n - (size_t)round(k)]);
|
||
}
|
||
|
||
return k / sum;
|
||
}
|
||
|
||
|
||
/* Parameters needed to compute the
|
||
* KDE function of a sample.
|
||
*/
|
||
struct kde_params {
|
||
double n; // sample size
|
||
double var; // sample variance
|
||
double *sample; // sample
|
||
};
|
||
|
||
|
||
/* A KDE (kernel density estimation) is an
|
||
* approximation of a sample PDF obtained by
|
||
* convolving the sample points with a smooth
|
||
* kernel, in this case a standard gaussian.
|
||
*
|
||
* `gauss_kde(x, p)` gives the value of the PDF
|
||
* at `x`, where `p` is a `kde_params` struct.
|
||
*/
|
||
double gauss_kde(double x, void * params) {
|
||
struct kde_params p = *((struct kde_params*) params);
|
||
|
||
/* Apply the Silverman's rule of thumb to the
|
||
* bandwidth estimation. The badwidth is given
|
||
* by the sample variance times a factor which
|
||
* depends on the number of points and dimension.
|
||
*/
|
||
double bw = 0.4 * p.var * pow((double)p.n*3.0/4, -2.0/5);
|
||
|
||
double sum = 0;
|
||
for (size_t i = 0; i < p.n; i++)
|
||
sum += exp(-pow(x - p.sample[i], 2) / (2*bw));
|
||
|
||
return sum / sqrt(2*M_PI*bw) / p.n;
|
||
}
|
||
|
||
|
||
/* Computes an approximation to the asymptotic median
|
||
* and its standard deviation by bootstrapping (ie
|
||
* repeated resampling) the original `sample`, `boots`
|
||
* times.
|
||
*
|
||
* The functions returns an `uncert` pair of mean and
|
||
* stdev of the medians computed on each sample.
|
||
*/
|
||
uncert bootstrap_median(
|
||
const gsl_rng *r,
|
||
double *sample, size_t n,
|
||
size_t boots) {
|
||
|
||
/* We use a running statistics to not
|
||
* store the full resampled array.
|
||
*/
|
||
gsl_rstat_workspace* w = gsl_rstat_alloc();
|
||
|
||
double *values = calloc(boots, sizeof(double));
|
||
|
||
for (size_t i = 0; i < boots; i++) {
|
||
/* The sampling is simply done by generating
|
||
* an array index uniformly in [0, n-1].
|
||
*/
|
||
for (size_t j = 0; j < n; j++) {
|
||
size_t choice = gsl_rng_uniform_int(r, n);
|
||
gsl_rstat_add(sample[choice], w);
|
||
}
|
||
values[i] = gsl_rstat_median(w);
|
||
}
|
||
|
||
/* Compute mean and stdev of the medians
|
||
* of each newly bootstrapped sample.
|
||
*/
|
||
uncert median;
|
||
median.n = gsl_stats_mean(values, 1, boots);
|
||
median.s = gsl_stats_sd(values, 1, boots);
|
||
|
||
// free memory
|
||
gsl_rstat_free(w);
|
||
free(values);
|
||
|
||
return median;
|
||
}
|
||
|
||
|
||
/* Computes an approximation to the asymptotic mode
|
||
* and its standard deviation by bootstrapping (ie
|
||
* repeated resampling) the original `sample`, `boots`
|
||
* times.
|
||
*
|
||
* The functions returns an `uncert` pair of mean and
|
||
* stddev of the modes computed on each sample.
|
||
*/
|
||
uncert bootstrap_mode(
|
||
const gsl_rng *r,
|
||
double *sample, size_t n,
|
||
size_t boots) {
|
||
|
||
double *values = calloc(boots, sizeof(double));
|
||
double *boot = calloc(n, sizeof(double));
|
||
|
||
for (size_t i = 0; i < boots; i++) {
|
||
/* The sampling is simply done by generating
|
||
* an array index uniformely in [0, n-1].
|
||
*/
|
||
for (size_t j = 0; j < n; j++) {
|
||
size_t choice = gsl_rng_uniform_int(r, n);
|
||
boot[j] = sample[choice];
|
||
}
|
||
qsort(boot, n, sizeof(double), cmp_double);
|
||
values[i] = hsm(boot, n);
|
||
}
|
||
|
||
/* Compute mean and stddev of the modes
|
||
* of each newly bootstrapped sample.
|
||
*/
|
||
uncert mode;
|
||
mode.n = gsl_stats_mean(values, 1, boots);
|
||
mode.s = gsl_stats_sd(values, 1, boots);
|
||
|
||
// free memory
|
||
free(values);
|
||
free(boot);
|
||
|
||
return mode;
|
||
}
|
||
|
||
|
||
/* Computes an approximation to the asymptotic fwhm
|
||
* and its standard deviation by bootstrapping (ie
|
||
* repeated resampling) the original `sample`, `boots`
|
||
* times.
|
||
*
|
||
* `min,max` are the bounds of interval that is
|
||
* expected to contain the mode of the sample.
|
||
*
|
||
* The functions returns an `uncert` pair of mean and
|
||
* stddev of the fwhms computed on each sample.
|
||
*/
|
||
uncert bootstrap_fwhm(
|
||
const gsl_rng *r,
|
||
double min, double max,
|
||
double *sample, size_t n,
|
||
size_t boots) {
|
||
/* We use a running statistics to compute
|
||
* a trimmed mean/variance of the sample.
|
||
*/
|
||
gsl_rstat_workspace* w = gsl_rstat_alloc();
|
||
|
||
// fwhm values and resample
|
||
double *values = calloc(boots, sizeof(double));
|
||
double *boot = calloc(n, sizeof(double));
|
||
|
||
// set the size here because it's fixed
|
||
struct kde_params p;
|
||
p.n = n;
|
||
|
||
// struct for numeric_fwhm
|
||
gsl_function pdf;
|
||
pdf.function = &gauss_kde;
|
||
|
||
for (size_t i = 0; i < boots; i++) {
|
||
gsl_rstat_reset(w);
|
||
|
||
/* The sampling is simply done by generating
|
||
* an array index uniformely in [0, n-1].
|
||
*/
|
||
for (size_t j = 0; j < n; j++) {
|
||
double x = sample[gsl_rng_uniform_int(r, n)];
|
||
boot[j] = x;
|
||
/* Remove too extreme values from the
|
||
* the trimmed statistics workspace.
|
||
*/
|
||
if (fabs(x) < 10) gsl_rstat_add(x, w);
|
||
}
|
||
|
||
/* Set the trimmed variance of the sample as
|
||
* the (uncorrected) variance of the gaussian
|
||
* kernel estimator.
|
||
*/
|
||
p.sample = boot;
|
||
p.var = gsl_rstat_variance(w);
|
||
pdf.params = &p;
|
||
|
||
values[i] = numeric_fwhm(min, max, &pdf, 0);
|
||
}
|
||
|
||
/* Compute mean and stddev of the fwhms
|
||
* of each newly bootstrapped sample.
|
||
*/
|
||
uncert fwhm;
|
||
fwhm.n = gsl_stats_mean(values, 1, boots);
|
||
fwhm.s = gsl_stats_sd(values, 1, boots);
|
||
|
||
// free memory
|
||
free(values);
|
||
free(boot);
|
||
gsl_rstat_free(w);
|
||
|
||
return fwhm;
|
||
}
|
||
|
||
|
||
/* Computes an approximation to the asymptotic right
|
||
* tail-index and its standard deviation by bootstrapping
|
||
* (ie repeated resampling) the original `sample`, `boots`
|
||
* times. The tail-index is estimated by the Hill estimator,
|
||
* hence the name of the function. See `hill_estimator()`
|
||
* for the parameter `t` meaning.
|
||
*
|
||
* The functions returns an `uncert` pair of mean and
|
||
* stddev of the modes computed on each sample.
|
||
*/
|
||
uncert bootstrap_hill(
|
||
const gsl_rng *r,
|
||
double t,
|
||
double *sample, size_t n,
|
||
size_t boots) {
|
||
|
||
double *values = calloc(boots, sizeof(double));
|
||
double *boot = calloc(n, sizeof(double));
|
||
|
||
for (size_t i = 0; i < boots; i++) {
|
||
/* The sampling is simply done by generating
|
||
* an array index uniformely in [0, n-1].
|
||
*/
|
||
for (size_t j = 0; j < n; j++) {
|
||
size_t choice = gsl_rng_uniform_int(r, n);
|
||
boot[j] = sample[choice];
|
||
}
|
||
qsort(boot, n, sizeof(double), cmp_double);
|
||
values[i] = hill_estimator(boot, t, n);
|
||
}
|
||
|
||
/* Compute mean and stddev of the modes
|
||
* of each newly bootstrapped sample.
|
||
*/
|
||
uncert index;
|
||
index.n = gsl_stats_mean(values, 1, boots);
|
||
index.s = gsl_stats_sd(values, 1, boots);
|
||
|
||
// free memory
|
||
free(values);
|
||
free(boot);
|
||
|
||
return index;
|
||
}
|