ex-1: implement half-sample mode
This commit is contained in:
parent
31821d8033
commit
fd6fa78566
144
ex-1/bootstrap.c
144
ex-1/bootstrap.c
@ -1,10 +1,106 @@
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <gsl/gsl_rstat.h>
|
||||
#include <gsl/gsl_vector.h>
|
||||
#include <gsl/gsl_statistics_double.h>
|
||||
|
||||
#include "bootstrap.h"
|
||||
|
||||
|
||||
/* Function that compares doubles for sorting:
|
||||
* x > y ⇒ 1
|
||||
* x == y ⇒ 0
|
||||
* x < y ⇒ -1
|
||||
*/
|
||||
int cmp_double (const void *xp, const void *yp) {
|
||||
double x = *(double*)xp,
|
||||
y = *(double*)yp;
|
||||
return x > y ? 1 : (x == y ? 0 : -1);
|
||||
}
|
||||
|
||||
|
||||
/* Returns the (rounded) mean index of all
|
||||
* components of `v` that are equal to `x`.
|
||||
* This function is used to handle duplicate
|
||||
* data (called "ties") in `hsm()`.
|
||||
*/
|
||||
size_t mean_index(gsl_vector *v, double x) {
|
||||
gsl_rstat_workspace *w = gsl_rstat_alloc();
|
||||
|
||||
for (size_t i = 0; i < v->size; i++) {
|
||||
if (gsl_vector_get(v, i) == x)
|
||||
gsl_rstat_add((double)i, w);
|
||||
}
|
||||
int mean = gsl_rstat_mean(w);
|
||||
gsl_rstat_free(w);
|
||||
|
||||
return round(mean);
|
||||
}
|
||||
|
||||
|
||||
/* Computes the half-sample mode (also called the Robertson-Cryer
|
||||
* mode estimator) of the sample `x` containing `n` observations.
|
||||
*
|
||||
* It is based on repeatedly finding the modal interval (interval
|
||||
* containing the most observations) of half of the sample.
|
||||
* This implementation is based on the `hsm()` function from the
|
||||
* modeest[1] R package.
|
||||
*
|
||||
* [1]: https://rdrr.io/cran/modeest/man/hsm.html
|
||||
*/
|
||||
double hsm(double *x, size_t n) {
|
||||
int i, k;
|
||||
gsl_vector *diffs_full = gsl_vector_calloc(n-n/2);
|
||||
|
||||
/* Divide the sample in two halves and compute
|
||||
* the paired differences between the upper and
|
||||
* lower halves. The index of the min diff. gives
|
||||
* the start of the modal interval. Repeat on the
|
||||
* new interval until three or less points are left.
|
||||
*/
|
||||
while (n > 3) {
|
||||
k = n/2;
|
||||
|
||||
// lower/upper halves of x
|
||||
gsl_vector upper = gsl_vector_view_array(x+k, n-k).vector;
|
||||
gsl_vector lower = gsl_vector_view_array(x, n-k).vector;
|
||||
|
||||
// restrict diffs_full to length n-k
|
||||
gsl_vector diffs = gsl_vector_subvector(diffs_full, 0, n-k).vector;
|
||||
|
||||
// compute the difference upper-lower
|
||||
gsl_vector_memcpy(&diffs, &upper);
|
||||
gsl_vector_sub(&diffs, &lower);
|
||||
|
||||
// find minimum while handling ties
|
||||
i = mean_index(&diffs, gsl_vector_min(&diffs));
|
||||
|
||||
/* If the minumium difference is 0 we found
|
||||
* the hsm so we set n=1 to break the loop.
|
||||
*/
|
||||
x += i;
|
||||
n = (gsl_vector_get(&diffs, i) == 0) ? 1 : k;
|
||||
}
|
||||
|
||||
// free memory
|
||||
gsl_vector_free(diffs_full);
|
||||
|
||||
/* If the sample is has three points the hsm
|
||||
* is the average of the two closer ones.
|
||||
*/
|
||||
if (n == 3) {
|
||||
if (2*x[1] - x[0] - x[2] > 0)
|
||||
return gsl_stats_mean(x+1, 1, 2);
|
||||
return gsl_stats_mean(x, 1, 2);
|
||||
}
|
||||
|
||||
/* Otherwise (smaller than 3) the hsm is just
|
||||
* the mean of the points.
|
||||
*/
|
||||
return gsl_stats_mean(x, 1, n);
|
||||
}
|
||||
|
||||
|
||||
/* Computes an approximation to the asymptotic median
|
||||
* and its standard deviation by bootstrapping (ie
|
||||
* repeated resampling) the original `sample`, `boots`
|
||||
@ -27,7 +123,7 @@ uncert bootstrap_median(
|
||||
|
||||
for (size_t i = 0; i < boots; i++) {
|
||||
/* The sampling is simply done by generating
|
||||
* an array index uniformely in [0, n-1].
|
||||
* an array index uniformly in [0, n-1].
|
||||
*/
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
size_t choice = gsl_rng_uniform_int(r, n);
|
||||
@ -43,5 +139,51 @@ uncert bootstrap_median(
|
||||
median.n = gsl_stats_mean(values, 1, boots);
|
||||
median.s = gsl_stats_sd(values, 1, boots);
|
||||
|
||||
// free memory
|
||||
free(values);
|
||||
|
||||
return median;
|
||||
}
|
||||
|
||||
|
||||
/* Computes an approximation to the asymptotic mode
|
||||
* and its standard deviation by bootstrapping (ie
|
||||
* repeated resampling) the original `sample`, `boots`
|
||||
* times.
|
||||
*
|
||||
* The functions returns an `uncert` pair of mean and
|
||||
* stddev of the modes computed on each sample.
|
||||
*/
|
||||
uncert bootstrap_mode(
|
||||
const gsl_rng *r,
|
||||
double *sample, size_t n,
|
||||
int boots) {
|
||||
|
||||
double *values = calloc(boots, sizeof(double));
|
||||
double *boot = calloc(n, sizeof(double));
|
||||
|
||||
for (size_t i = 0; i < boots; i++) {
|
||||
/* The sampling is simply done by generating
|
||||
* an array index uniformely in [0, n-1].
|
||||
*/
|
||||
for (size_t j = 0; j < n; j++) {
|
||||
size_t choice = gsl_rng_uniform_int(r, n);
|
||||
boot[j] = sample[choice];
|
||||
}
|
||||
qsort(boot, n, sizeof(double), cmp_double);
|
||||
values[i] = hsm(boot, n);
|
||||
}
|
||||
|
||||
/* Compute mean and stddev of the modes
|
||||
* of each newly bootstrapped sample.
|
||||
*/
|
||||
uncert mode;
|
||||
mode.n = gsl_stats_mean(values, 1, boots);
|
||||
mode.s = gsl_stats_sd(values, 1, boots);
|
||||
|
||||
// free memory
|
||||
free(values);
|
||||
free(boot);
|
||||
|
||||
return mode;
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include <gsl/gsl_rng.h>
|
||||
|
||||
#pragma once
|
||||
|
||||
/* A pair structure that represent
|
||||
* a value with an uncertainty
|
||||
@ -10,15 +11,37 @@ typedef struct {
|
||||
} uncert;
|
||||
|
||||
|
||||
/* Function that compare doubles for sorting:
|
||||
* x > y ⇒ 1
|
||||
* x == y ⇒ 0
|
||||
* x < y ⇒ -1
|
||||
*/
|
||||
int cmp_double (const void *xp, const void *yp);
|
||||
|
||||
|
||||
/* Computes an approximation to the asymptotic median
|
||||
* and its standard deviation by bootstrapping (ie
|
||||
* repeated resampling) the original `sample`, `boots`
|
||||
* times.
|
||||
*
|
||||
* The functions returns an `uncert` pair of mean and
|
||||
* stdev of the medians computed on each sample.
|
||||
* sdtdev of the medians computed on each sample.
|
||||
*/
|
||||
uncert bootstrap_median(
|
||||
const gsl_rng *r,
|
||||
double *sample, size_t n,
|
||||
int boots);
|
||||
|
||||
|
||||
/* Computes an approximation to the asymptotic mode
|
||||
* and its standard deviation by bootstrapping (ie
|
||||
* repeated resampling) the original `sample`, `boots`
|
||||
* times.
|
||||
*
|
||||
* The functions returns an `uncert` pair of mean and
|
||||
* stddev of the modes computed on each sample.
|
||||
*/
|
||||
uncert bootstrap_mode(
|
||||
const gsl_rng *r,
|
||||
double *sample, size_t n,
|
||||
int boots);
|
||||
|
Loading…
Reference in New Issue
Block a user