#include #include #include #include #include #include #include "bootstrap.h" #include "tests.h" /* Function that compares doubles for sorting: * x > y ⇒ 1 * x == y ⇒ 0 * x < y ⇒ -1 */ int cmp_double (const void *xp, const void *yp) { double x = *(double*)xp, y = *(double*)yp; return x > y ? 1 : (x == y ? 0 : -1); } /* Returns the (rounded) mean index of all * components of `v` that are equal to `x`. * This function is used to handle duplicate * data (called "ties") in `hsm()`. */ size_t mean_index(gsl_vector *v, double x) { gsl_rstat_workspace *w = gsl_rstat_alloc(); for (size_t i = 0; i < v->size; i++) { if (gsl_vector_get(v, i) == x) gsl_rstat_add((double)i, w); } int mean = gsl_rstat_mean(w); gsl_rstat_free(w); return round(mean); } /* Computes the half-sample mode (also called the Robertson-Cryer * mode estimator) of the sample `x` containing `n` observations. * Important: the sample is assumed to be sorted. * * It is based on repeatedly finding the half modal interval (smallest * interval containing half of the observations) of the sample. * This implementation is based on the `hsm()` function from the * modeest[1] R package. * * [1]: https://rdrr.io/cran/modeest/man/hsm.html */ double hsm(double *x, size_t n) { int i, k; gsl_vector *diffs_full = gsl_vector_calloc(n-n/2); /* Divide the sample in two halves and compute * the paired differences between the upper and * lower halves. The index of the min diff. gives * the start of the modal interval. Repeat on the * new interval until three or less points are left. */ while (n > 3) { k = n/2; // lower/upper halves of x gsl_vector upper = gsl_vector_view_array(x+k, n-k).vector; gsl_vector lower = gsl_vector_view_array(x, n-k).vector; // restrict diffs_full to length n-k gsl_vector diffs = gsl_vector_subvector(diffs_full, 0, n-k).vector; // compute the difference upper-lower gsl_vector_memcpy(&diffs, &upper); gsl_vector_sub(&diffs, &lower); // find minimum while handling ties i = mean_index(&diffs, gsl_vector_min(&diffs)); /* If the minumium difference is 0 we found * the hsm so we set n=1 to break the loop. */ x += i; n = (gsl_vector_get(&diffs, i) == 0) ? 1 : k; } // free memory gsl_vector_free(diffs_full); /* If the sample is has three points the hsm * is the average of the two closer ones. */ if (n == 3) { if (2*x[1] - x[0] - x[2] > 0) return gsl_stats_mean(x+1, 1, 2); return gsl_stats_mean(x, 1, 2); } /* Otherwise (smaller than 3) the hsm is just * the mean of the points. */ return gsl_stats_mean(x, 1, n); } /* Computes the Hill estimator γ_k of the right * tail-index of a distribution from a sample `x` * of `n` observations by using the last k outliers. * * k is computed from t as k = n^`t`, so it is * expected that t ∈ (0,1). * * Important: the sample is assumed to be sorted. */ double hill_estimator(double *x, double t, size_t n) { double k = pow(n, t); double sum = 0; for (size_t i = n - k; i < n; i++) { sum += log(x[i]) - log(x[n - (size_t)round(k)]); } return k / sum; } /* Parameters needed to compute the * KDE function of a sample. */ struct kde_params { double n; // sample size double var; // sample variance double *sample; // sample }; /* A KDE (kernel density estimation) is an * approximation of a sample PDF obtained by * convolving the sample points with a smooth * kernel, in this case a standard gaussian. * * `gauss_kde(x, p)` gives the value of the PDF * at `x`, where `p` is a `kde_params` struct. */ double gauss_kde(double x, void * params) { struct kde_params p = *((struct kde_params*) params); /* Apply the Silverman's rule of thumb to the * bandwidth estimation. The badwidth is given * by the sample variance times a factor which * depends on the number of points and dimension. */ double bw = 0.777 * p.var * pow((double)p.n*3.0/4, -2.0/5); double sum = 0; for (size_t i = 0; i < p.n; i++) sum += exp(-pow(x - p.sample[i], 2) / (2*bw)); return sum / sqrt(2*M_PI*bw) / p.n; } /* Computes an approximation to the asymptotic median * and its standard deviation by bootstrapping (ie * repeated resampling) the original `sample`, `boots` * times. * * The functions returns an `uncert` pair of mean and * stdev of the medians computed on each sample. */ uncert bootstrap_median( const gsl_rng *r, double *sample, size_t n, size_t boots) { /* We use a running statistics to not * store the full resampled array. */ gsl_rstat_workspace* w = gsl_rstat_alloc(); double *values = calloc(boots, sizeof(double)); double *resample = calloc(n, sizeof(double)); for (size_t i = 0; i < boots; i++) { /* The sampling is simply done by generating * an array index uniformly in [0, n-1]. */ for (size_t j = 0; j < n; j++) { size_t choice = gsl_rng_uniform_int(r, n); resample[j] = sample[choice]; } values[i] = gsl_stats_median(resample, 1, n); } /* Compute mean and stdev of the medians * of each newly bootstrapped sample. */ uncert median; median.n = gsl_stats_mean(values, 1, boots); median.s = gsl_stats_sd(values, 1, boots); // free memory gsl_rstat_free(w); free(values); return median; } /* Computes an approximation to the asymptotic mode * and its standard deviation by bootstrapping (ie * repeated resampling) the original `sample`, `boots` * times. * * The functions returns an `uncert` pair of mean and * stddev of the modes computed on each sample. */ uncert bootstrap_mode( const gsl_rng *r, double *sample, size_t n, size_t boots) { double *values = calloc(boots, sizeof(double)); double *boot = calloc(n, sizeof(double)); for (size_t i = 0; i < boots; i++) { /* The sampling is simply done by generating * an array index uniformely in [0, n-1]. */ for (size_t j = 0; j < n; j++) { size_t choice = gsl_rng_uniform_int(r, n); boot[j] = sample[choice]; } qsort(boot, n, sizeof(double), cmp_double); values[i] = hsm(boot, n); } /* Compute mean and stddev of the modes * of each newly bootstrapped sample. */ uncert mode; mode.n = gsl_stats_mean(values, 1, boots); mode.s = gsl_stats_sd(values, 1, boots); // free memory free(values); free(boot); return mode; } /* Computes an approximation to the asymptotic fwhm * and its standard deviation by bootstrapping (ie * repeated resampling) the original `sample`, `boots` * times. * * `min,max` are the bounds of interval that is * expected to contain the mode of the sample. * * The functions returns an `uncert` pair of mean and * stddev of the fwhms computed on each sample. */ uncert bootstrap_fwhm( const gsl_rng *r, double min, double max, double *sample, size_t n, size_t boots) { /* We use a running statistics to compute * a trimmed mean/variance of the sample. */ gsl_rstat_workspace* w = gsl_rstat_alloc(); // fwhm values and resample double *values = calloc(boots, sizeof(double)); double *boot = calloc(n, sizeof(double)); // set the size here because it's fixed struct kde_params p; p.n = n; // struct for numeric_fwhm gsl_function pdf; pdf.function = &gauss_kde; for (size_t i = 0; i < boots; i++) { gsl_rstat_reset(w); /* The sampling is simply done by generating * an array index uniformely in [0, n-1]. */ for (size_t j = 0; j < n; j++) { double x = sample[gsl_rng_uniform_int(r, n)]; boot[j] = x; /* Remove too extreme values from the * the trimmed statistics workspace. */ if (fabs(x) < 10) gsl_rstat_add(x, w); } /* Set the trimmed variance of the sample as * the (uncorrected) variance of the gaussian * kernel estimator. */ p.sample = boot; p.var = gsl_rstat_variance(w); pdf.params = &p; values[i] = numeric_fwhm(min, max, &pdf, 0); } /* Compute mean and stddev of the fwhms * of each newly bootstrapped sample. */ uncert fwhm; fwhm.n = gsl_stats_mean(values, 1, boots); fwhm.s = gsl_stats_sd(values, 1, boots); // free memory free(values); free(boot); gsl_rstat_free(w); return fwhm; } /* Computes an approximation to the asymptotic right * tail-index and its standard deviation by bootstrapping * (ie repeated resampling) the original `sample`, `boots` * times. The tail-index is estimated by the Hill estimator, * hence the name of the function. See `hill_estimator()` * for the parameter `t` meaning. * * The functions returns an `uncert` pair of mean and * stddev of the modes computed on each sample. */ uncert bootstrap_hill( const gsl_rng *r, double t, double *sample, size_t n, size_t boots) { double *values = calloc(boots, sizeof(double)); double *boot = calloc(n, sizeof(double)); for (size_t i = 0; i < boots; i++) { /* The sampling is simply done by generating * an array index uniformely in [0, n-1]. */ for (size_t j = 0; j < n; j++) { size_t choice = gsl_rng_uniform_int(r, n); boot[j] = sample[choice]; } qsort(boot, n, sizeof(double), cmp_double); values[i] = hill_estimator(boot, t, n); } /* Compute mean and stddev of the modes * of each newly bootstrapped sample. */ uncert index; index.n = gsl_stats_mean(values, 1, boots); index.s = gsl_stats_sd(values, 1, boots); // free memory free(values); free(boot); return index; }