#include #include #include #include #include #include "bootstrap.h" /* Function that compares doubles for sorting: * x > y ⇒ 1 * x == y ⇒ 0 * x < y ⇒ -1 */ int cmp_double (const void *xp, const void *yp) { double x = *(double*)xp, y = *(double*)yp; return x > y ? 1 : (x == y ? 0 : -1); } /* Returns the (rounded) mean index of all * components of `v` that are equal to `x`. * This function is used to handle duplicate * data (called "ties") in `hsm()`. */ size_t mean_index(gsl_vector *v, double x) { gsl_rstat_workspace *w = gsl_rstat_alloc(); for (size_t i = 0; i < v->size; i++) { if (gsl_vector_get(v, i) == x) gsl_rstat_add((double)i, w); } int mean = gsl_rstat_mean(w); gsl_rstat_free(w); return round(mean); } /* Computes the half-sample mode (also called the Robertson-Cryer * mode estimator) of the sample `x` containing `n` observations. * * It is based on repeatedly finding the modal interval (interval * containing the most observations) of half of the sample. * This implementation is based on the `hsm()` function from the * modeest[1] R package. * * [1]: https://rdrr.io/cran/modeest/man/hsm.html */ double hsm(double *x, size_t n) { int i, k; gsl_vector *diffs_full = gsl_vector_calloc(n-n/2); /* Divide the sample in two halves and compute * the paired differences between the upper and * lower halves. The index of the min diff. gives * the start of the modal interval. Repeat on the * new interval until three or less points are left. */ while (n > 3) { k = n/2; // lower/upper halves of x gsl_vector upper = gsl_vector_view_array(x+k, n-k).vector; gsl_vector lower = gsl_vector_view_array(x, n-k).vector; // restrict diffs_full to length n-k gsl_vector diffs = gsl_vector_subvector(diffs_full, 0, n-k).vector; // compute the difference upper-lower gsl_vector_memcpy(&diffs, &upper); gsl_vector_sub(&diffs, &lower); // find minimum while handling ties i = mean_index(&diffs, gsl_vector_min(&diffs)); /* If the minumium difference is 0 we found * the hsm so we set n=1 to break the loop. */ x += i; n = (gsl_vector_get(&diffs, i) == 0) ? 1 : k; } // free memory gsl_vector_free(diffs_full); /* If the sample is has three points the hsm * is the average of the two closer ones. */ if (n == 3) { if (2*x[1] - x[0] - x[2] > 0) return gsl_stats_mean(x+1, 1, 2); return gsl_stats_mean(x, 1, 2); } /* Otherwise (smaller than 3) the hsm is just * the mean of the points. */ return gsl_stats_mean(x, 1, n); } /* Computes an approximation to the asymptotic median * and its standard deviation by bootstrapping (ie * repeated resampling) the original `sample`, `boots` * times. * * The functions returns an `uncert` pair of mean and * stdev of the medians computed on each sample. */ uncert bootstrap_median( const gsl_rng *r, double *sample, size_t n, int boots) { /* We use a running statistics to not * store the full resampled array. */ gsl_rstat_workspace* w = gsl_rstat_alloc(); double *values = calloc(boots, sizeof(double)); for (size_t i = 0; i < boots; i++) { /* The sampling is simply done by generating * an array index uniformly in [0, n-1]. */ for (size_t j = 0; j < n; j++) { size_t choice = gsl_rng_uniform_int(r, n); gsl_rstat_add(sample[choice], w); } values[i] = gsl_rstat_median(w); } /* Compute mean and stdev of the medians * of each newly bootstrapped sample. */ uncert median; median.n = gsl_stats_mean(values, 1, boots); median.s = gsl_stats_sd(values, 1, boots); // free memory gsl_rstat_free(w); free(values); return median; } /* Computes an approximation to the asymptotic mode * and its standard deviation by bootstrapping (ie * repeated resampling) the original `sample`, `boots` * times. * * The functions returns an `uncert` pair of mean and * stddev of the modes computed on each sample. */ uncert bootstrap_mode( const gsl_rng *r, double *sample, size_t n, int boots) { double *values = calloc(boots, sizeof(double)); double *boot = calloc(n, sizeof(double)); for (size_t i = 0; i < boots; i++) { /* The sampling is simply done by generating * an array index uniformely in [0, n-1]. */ for (size_t j = 0; j < n; j++) { size_t choice = gsl_rng_uniform_int(r, n); boot[j] = sample[choice]; } qsort(boot, n, sizeof(double), cmp_double); values[i] = hsm(boot, n); } /* Compute mean and stddev of the modes * of each newly bootstrapped sample. */ uncert mode; mode.n = gsl_stats_mean(values, 1, boots); mode.s = gsl_stats_sd(values, 1, boots); // free memory free(values); free(boot); return mode; }