diff --git a/ex-1/bootstrap.c b/ex-1/bootstrap.c index f8388d3..9437bbd 100644 --- a/ex-1/bootstrap.c +++ b/ex-1/bootstrap.c @@ -41,7 +41,8 @@ size_t mean_index(gsl_vector *v, double x) { /* Computes the half-sample mode (also called the Robertson-Cryer - * mode estimator) of the sample `x` containing `n` observations. + * mode estimator) of the sample `x` containing `n` observations. + * Important: the sample is assumed to be sorted. * * It is based on repeatedly finding the half modal interval (smallest * interval containing half of the observations) of the sample. @@ -103,6 +104,26 @@ double hsm(double *x, size_t n) { } +/* Computes the Hill estimator γ_k of the right + * tail-index of a distribution from a sample `x` + * of `n` observations by using the last k outliers. + * + * k is computed from t as k = n^`t`, so it is + * expected that t ∈ (0,1). + * + * Important: the sample is assumed to be sorted. + */ +double hill_estimator(double *x, double t, size_t n) { + double k = pow(n, t); + double sum = 0; + for (size_t i = n - k; i < n; i++) { + sum += log(x[i]) - log(x[n - (size_t)round(k)]); + } + + return k / sum; +} + + /* Parameters needed to compute the * KDE function of a sample. */ @@ -150,7 +171,7 @@ double gauss_kde(double x, void * params) { uncert bootstrap_median( const gsl_rng *r, double *sample, size_t n, - int boots) { + size_t boots) { /* We use a running statistics to not * store the full resampled array. @@ -196,7 +217,7 @@ uncert bootstrap_median( uncert bootstrap_mode( const gsl_rng *r, double *sample, size_t n, - int boots) { + size_t boots) { double *values = calloc(boots, sizeof(double)); double *boot = calloc(n, sizeof(double)); @@ -243,7 +264,7 @@ uncert bootstrap_fwhm( const gsl_rng *r, double min, double max, double *sample, size_t n, - int boots) { + size_t boots) { /* We use a running statistics to compute * a trimmed mean/variance of the sample. */ @@ -301,3 +322,49 @@ uncert bootstrap_fwhm( return fwhm; } + + +/* Computes an approximation to the asymptotic right + * tail-index and its standard deviation by bootstrapping + * (ie repeated resampling) the original `sample`, `boots` + * times. The tail-index is estimated by the Hill estimator, + * hence the name of the function. See `hill_estimator()` + * for the parameter `t` meaning. + * + * The functions returns an `uncert` pair of mean and + * stddev of the modes computed on each sample. + */ +uncert bootstrap_hill( + const gsl_rng *r, + double t, + double *sample, size_t n, + size_t boots) { + + double *values = calloc(boots, sizeof(double)); + double *boot = calloc(n, sizeof(double)); + + for (size_t i = 0; i < boots; i++) { + /* The sampling is simply done by generating + * an array index uniformely in [0, n-1]. + */ + for (size_t j = 0; j < n; j++) { + size_t choice = gsl_rng_uniform_int(r, n); + boot[j] = sample[choice]; + } + qsort(boot, n, sizeof(double), cmp_double); + values[i] = hill_estimator(boot, t, n); + } + + /* Compute mean and stddev of the modes + * of each newly bootstrapped sample. + */ + uncert index; + index.n = gsl_stats_mean(values, 1, boots); + index.s = gsl_stats_sd(values, 1, boots); + + // free memory + free(values); + free(boot); + + return index; +} diff --git a/ex-1/bootstrap.h b/ex-1/bootstrap.h index 7248ccd..9097e5c 100644 --- a/ex-1/bootstrap.h +++ b/ex-1/bootstrap.h @@ -30,7 +30,7 @@ int cmp_double (const void *xp, const void *yp); uncert bootstrap_median( const gsl_rng *r, double *sample, size_t n, - int boots); + size_t boots); /* Computes an approximation to the asymptotic mode @@ -44,7 +44,7 @@ uncert bootstrap_median( uncert bootstrap_mode( const gsl_rng *r, double *sample, size_t n, - int boots); + size_t boots); /* Computes an approximation to the asymptotic fwhm @@ -62,4 +62,21 @@ uncert bootstrap_fwhm( const gsl_rng *r, double min, double max, double *sample, size_t n, - int boots); + size_t boots); + + +/* Computes an approximation to the asymptotic right + * tail-index and its standard deviation by bootstrapping + * (ie repeated resampling) the original `sample`, `boots` + * times. The tail-index is estimated by the Hill estimator, + * hence the name of the function. See `hill_estimator()` + * for the parameter `t` meaning. + * + * The functions returns an `uncert` pair of mean and + * stddev of the modes computed on each sample. + */ +uncert bootstrap_hill( + const gsl_rng *r, + double t, + double *sample, size_t n, + size_t boots);