From fd6fa785663a1b781d1321aced021c2f6e893571 Mon Sep 17 00:00:00 2001 From: rnhmjoj Date: Sat, 4 Apr 2020 17:09:21 +0000 Subject: [PATCH] ex-1: implement half-sample mode --- ex-1/bootstrap.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++- ex-1/bootstrap.h | 25 +++++++- 2 files changed, 167 insertions(+), 2 deletions(-) diff --git a/ex-1/bootstrap.c b/ex-1/bootstrap.c index 0f10809..7d380f8 100644 --- a/ex-1/bootstrap.c +++ b/ex-1/bootstrap.c @@ -1,10 +1,106 @@ #include +#include #include +#include #include #include "bootstrap.h" +/* Function that compares doubles for sorting: + * x > y ⇒ 1 + * x == y ⇒ 0 + * x < y ⇒ -1 + */ +int cmp_double (const void *xp, const void *yp) { + double x = *(double*)xp, + y = *(double*)yp; + return x > y ? 1 : (x == y ? 0 : -1); +} + + +/* Returns the (rounded) mean index of all + * components of `v` that are equal to `x`. + * This function is used to handle duplicate + * data (called "ties") in `hsm()`. + */ +size_t mean_index(gsl_vector *v, double x) { + gsl_rstat_workspace *w = gsl_rstat_alloc(); + + for (size_t i = 0; i < v->size; i++) { + if (gsl_vector_get(v, i) == x) + gsl_rstat_add((double)i, w); + } + int mean = gsl_rstat_mean(w); + gsl_rstat_free(w); + + return round(mean); +} + + +/* Computes the half-sample mode (also called the Robertson-Cryer + * mode estimator) of the sample `x` containing `n` observations. + * + * It is based on repeatedly finding the modal interval (interval + * containing the most observations) of half of the sample. + * This implementation is based on the `hsm()` function from the + * modeest[1] R package. + * + * [1]: https://rdrr.io/cran/modeest/man/hsm.html + */ +double hsm(double *x, size_t n) { + int i, k; + gsl_vector *diffs_full = gsl_vector_calloc(n-n/2); + + /* Divide the sample in two halves and compute + * the paired differences between the upper and + * lower halves. The index of the min diff. gives + * the start of the modal interval. Repeat on the + * new interval until three or less points are left. + */ + while (n > 3) { + k = n/2; + + // lower/upper halves of x + gsl_vector upper = gsl_vector_view_array(x+k, n-k).vector; + gsl_vector lower = gsl_vector_view_array(x, n-k).vector; + + // restrict diffs_full to length n-k + gsl_vector diffs = gsl_vector_subvector(diffs_full, 0, n-k).vector; + + // compute the difference upper-lower + gsl_vector_memcpy(&diffs, &upper); + gsl_vector_sub(&diffs, &lower); + + // find minimum while handling ties + i = mean_index(&diffs, gsl_vector_min(&diffs)); + + /* If the minumium difference is 0 we found + * the hsm so we set n=1 to break the loop. + */ + x += i; + n = (gsl_vector_get(&diffs, i) == 0) ? 1 : k; + } + + // free memory + gsl_vector_free(diffs_full); + + /* If the sample is has three points the hsm + * is the average of the two closer ones. + */ + if (n == 3) { + if (2*x[1] - x[0] - x[2] > 0) + return gsl_stats_mean(x+1, 1, 2); + return gsl_stats_mean(x, 1, 2); + } + + /* Otherwise (smaller than 3) the hsm is just + * the mean of the points. + */ + return gsl_stats_mean(x, 1, n); +} + + /* Computes an approximation to the asymptotic median * and its standard deviation by bootstrapping (ie * repeated resampling) the original `sample`, `boots` @@ -27,7 +123,7 @@ uncert bootstrap_median( for (size_t i = 0; i < boots; i++) { /* The sampling is simply done by generating - * an array index uniformely in [0, n-1]. + * an array index uniformly in [0, n-1]. */ for (size_t j = 0; j < n; j++) { size_t choice = gsl_rng_uniform_int(r, n); @@ -43,5 +139,51 @@ uncert bootstrap_median( median.n = gsl_stats_mean(values, 1, boots); median.s = gsl_stats_sd(values, 1, boots); + // free memory + free(values); + return median; } + + +/* Computes an approximation to the asymptotic mode + * and its standard deviation by bootstrapping (ie + * repeated resampling) the original `sample`, `boots` + * times. + * + * The functions returns an `uncert` pair of mean and + * stddev of the modes computed on each sample. + */ +uncert bootstrap_mode( + const gsl_rng *r, + double *sample, size_t n, + int boots) { + + double *values = calloc(boots, sizeof(double)); + double *boot = calloc(n, sizeof(double)); + + for (size_t i = 0; i < boots; i++) { + /* The sampling is simply done by generating + * an array index uniformely in [0, n-1]. + */ + for (size_t j = 0; j < n; j++) { + size_t choice = gsl_rng_uniform_int(r, n); + boot[j] = sample[choice]; + } + qsort(boot, n, sizeof(double), cmp_double); + values[i] = hsm(boot, n); + } + + /* Compute mean and stddev of the modes + * of each newly bootstrapped sample. + */ + uncert mode; + mode.n = gsl_stats_mean(values, 1, boots); + mode.s = gsl_stats_sd(values, 1, boots); + + // free memory + free(values); + free(boot); + + return mode; +} diff --git a/ex-1/bootstrap.h b/ex-1/bootstrap.h index 453bf06..47fd9d8 100644 --- a/ex-1/bootstrap.h +++ b/ex-1/bootstrap.h @@ -1,5 +1,6 @@ #include +#pragma once /* A pair structure that represent * a value with an uncertainty @@ -10,15 +11,37 @@ typedef struct { } uncert; +/* Function that compare doubles for sorting: + * x > y ⇒ 1 + * x == y ⇒ 0 + * x < y ⇒ -1 + */ +int cmp_double (const void *xp, const void *yp); + + /* Computes an approximation to the asymptotic median * and its standard deviation by bootstrapping (ie * repeated resampling) the original `sample`, `boots` * times. * * The functions returns an `uncert` pair of mean and - * stdev of the medians computed on each sample. + * sdtdev of the medians computed on each sample. */ uncert bootstrap_median( const gsl_rng *r, double *sample, size_t n, int boots); + + +/* Computes an approximation to the asymptotic mode + * and its standard deviation by bootstrapping (ie + * repeated resampling) the original `sample`, `boots` + * times. + * + * The functions returns an `uncert` pair of mean and + * stddev of the modes computed on each sample. + */ +uncert bootstrap_mode( + const gsl_rng *r, + double *sample, size_t n, + int boots);