analistica/ex-1/bootstrap.c

#include <stdlib.h>
#include <math.h>
#include <gsl/gsl_roots.h>
#include <gsl/gsl_rstat.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_statistics_double.h>

#include "bootstrap.h"
#include "tests.h"


/* Function that compares doubles for sorting:
 * x > y  ⇒ 1
 * x == y ⇒ 0
 * x < y  ⇒ -1
 */
int cmp_double (const void *xp, const void *yp) {
  double x = *(double*)xp,
         y = *(double*)yp;
  return x > y ? 1 : (x == y ? 0 : -1);
}


/* Returns the (rounded) mean index of all
 * components of `v` that are equal to `x`.
 * This function is used to handle duplicate
 * data (called "ties") in `hsm()`.
 */
size_t mean_index(gsl_vector *v, double x) {
  gsl_rstat_workspace *w = gsl_rstat_alloc();

  for (size_t i = 0; i < v->size; i++) {
    if (gsl_vector_get(v, i) == x)
      gsl_rstat_add((double)i, w);
  }
  int mean = gsl_rstat_mean(w);
  gsl_rstat_free(w);

  return round(mean);
}


/* Computes the half-sample mode (also called the Robertson-Cryer
 * mode estimator) of the sample `x` containing `n` observations.
 * Important: the sample is assumed to be sorted.
 *
 * It is based on repeatedly finding the half modal interval (smallest
 * interval containing half of the observations) of the sample.
 * This implementation is based on the `hsm()` function from the
 * modeest[1] R package.
 *
 * [1]: https://rdrr.io/cran/modeest/man/hsm.html
 */
double hsm(double *x, size_t n) {
  int i, k;
  gsl_vector *diffs_full = gsl_vector_calloc(n-n/2);

  /* Divide the sample in two halves and compute
   * the paired differences between the upper and
   * lower halves. The index of the min diff. gives
   * the start of the modal interval. Repeat on the
   * new interval until three or less points are left.
   */
  while (n > 3) {
    k = n/2;

    // lower/upper halves of x
    gsl_vector upper = gsl_vector_view_array(x+k, n-k).vector;
    gsl_vector lower = gsl_vector_view_array(x,   n-k).vector;

    // restrict diffs_full to length n-k
    gsl_vector diffs = gsl_vector_subvector(diffs_full, 0, n-k).vector;

    // compute the difference upper-lower
    gsl_vector_memcpy(&diffs, &upper);
    gsl_vector_sub(&diffs, &lower);

    // find minimum while handling ties
    i = mean_index(&diffs, gsl_vector_min(&diffs));

    /* If the minumium difference is 0 we found
     * the hsm so we set n=1 to break the loop.
     */
    x += i;
    n = (gsl_vector_get(&diffs, i) == 0) ? 1 : k;
  }

  // free memory
  gsl_vector_free(diffs_full);

  /* If the sample is has three points the hsm
   * is the average of the two closer ones.
   */
  if (n == 3) {
    if (2*x[1] - x[0] - x[2] > 0)
      return gsl_stats_mean(x+1, 1, 2);
    return gsl_stats_mean(x, 1, 2);
  }

  /* Otherwise (smaller than 3) the hsm is just
   * the mean of the points.
   */
  return gsl_stats_mean(x, 1, n);
}


/* Computes the Hill estimator γ_k of the right
 * tail-index of a distribution from a sample `x`
 * of `n` observations by using the last k outliers.
 *
 * k is computed from t as k = n^`t`, so it is
 * expected that t ∈ (0,1).
 *
 * Important: the sample is assumed to be sorted.
 */
double hill_estimator(double *x, double t, size_t n) {
  double k = pow(n, t);
  double sum = 0;
  for (size_t i = n - k; i < n; i++) {
    sum += log(x[i]) - log(x[n - (size_t)round(k)]);
  }

  return k / sum;
}


/* Parameters needed to compute the
 * KDE function of a sample.
 */
struct kde_params {
  double n;        // sample size
  double var;      // sample variance
  double *sample;  // sample
};


/* A KDE (kernel density estimation) is an
 * approximation of a sample PDF obtained by
 * convolving the sample points with a smooth
 * kernel, in this case a standard gaussian.
 *
 * `gauss_kde(x, p)` gives the value of the PDF
 * at `x`, where `p` is a `kde_params` struct.
 */
double gauss_kde(double x, void * params) {
  struct kde_params p = *((struct kde_params*) params);

  /* Apply the Silverman's rule of thumb to the
   * bandwidth estimation. The badwidth is given
   * by the sample variance times a factor which
   * depends on the number of points and dimension.
   */
  double bw = 0.4 * p.var * pow((double)p.n*3.0/4, -2.0/5);

  double sum = 0;
  for (size_t i = 0; i < p.n; i++)
    sum += exp(-pow(x - p.sample[i], 2) / (2*bw));

  return sum / sqrt(2*M_PI*bw) / p.n;
}


/* Computes an approximation to the asymptotic median
 * and its standard deviation by bootstrapping (ie
 * repeated resampling) the original `sample`, `boots`
 * times.
 *
 * The functions returns an `uncert` pair of mean and
 * stdev of the medians computed on each sample.
 */
uncert bootstrap_median(
    const gsl_rng *r,
    double *sample, size_t n,
    size_t boots) {

  /* We use a running statistics to not
   * store the full resampled array.
   */
  gsl_rstat_workspace* w = gsl_rstat_alloc();

  double *values = calloc(boots, sizeof(double));

  for (size_t i = 0; i < boots; i++) {
    /* The sampling is simply done by generating
     * an array index uniformly in [0, n-1].
     */
    for (size_t j = 0; j < n; j++) {
      size_t choice = gsl_rng_uniform_int(r, n);
      gsl_rstat_add(sample[choice], w);
    }
    values[i] = gsl_rstat_median(w);
  }

  /* Compute mean and stdev of the medians
   * of each newly bootstrapped sample.
   */
  uncert median;
  median.n = gsl_stats_mean(values, 1, boots);
  median.s = gsl_stats_sd(values, 1, boots);

  // free memory
  gsl_rstat_free(w);
  free(values);

  return median;
}


/* Computes an approximation to the asymptotic mode
 * and its standard deviation by bootstrapping (ie
 * repeated resampling) the original `sample`, `boots`
 * times.
 *
 * The functions returns an `uncert` pair of mean and
 * stddev of the modes computed on each sample.
 */
uncert bootstrap_mode(
    const gsl_rng *r,
    double *sample, size_t n,
    size_t boots) {

  double *values = calloc(boots, sizeof(double));
  double *boot = calloc(n, sizeof(double));

  for (size_t i = 0; i < boots; i++) {
    /* The sampling is simply done by generating
     * an array index uniformely in [0, n-1].
     */
    for (size_t j = 0; j < n; j++) {
      size_t choice = gsl_rng_uniform_int(r, n);
      boot[j] = sample[choice];
    }
    qsort(boot, n, sizeof(double), cmp_double);
    values[i] = hsm(boot, n);
  }

  /* Compute mean and stddev of the modes
   * of each newly bootstrapped sample.
   */
  uncert mode;
  mode.n = gsl_stats_mean(values, 1, boots);
  mode.s = gsl_stats_sd(values, 1, boots);

  // free memory
  free(values);
  free(boot);

  return mode;
}


/* Computes an approximation to the asymptotic fwhm
 * and its standard deviation by bootstrapping (ie
 * repeated resampling) the original `sample`, `boots`
 * times.
 *
 * `min,max` are the bounds of interval that is
 * expected to contain the mode of the sample.
 *
 * The functions returns an `uncert` pair of mean and
 * stddev of the fwhms computed on each sample.
 */
uncert bootstrap_fwhm(
    const gsl_rng *r,
    double min, double max,
    double *sample, size_t n,
    size_t boots) {
  /* We use a running statistics to compute
   * a trimmed mean/variance of the sample.
   */
  gsl_rstat_workspace* w = gsl_rstat_alloc();

  // fwhm values and resample
  double *values = calloc(boots, sizeof(double));
  double *boot = calloc(n, sizeof(double));

  // set the size here because it's fixed
  struct kde_params p;
  p.n = n;

  // struct for numeric_fwhm
  gsl_function pdf;
  pdf.function = &gauss_kde;

  for (size_t i = 0; i < boots; i++) {
    gsl_rstat_reset(w);

    /* The sampling is simply done by generating
     * an array index uniformely in [0, n-1].
     */
    for (size_t j = 0; j < n; j++) {
      double x = sample[gsl_rng_uniform_int(r, n)];
      boot[j]  = x;
      /* Remove too extreme values from the
       * the trimmed statistics workspace.
       */
      if (fabs(x) < 10) gsl_rstat_add(x, w);
    }

    /* Set the trimmed variance of the sample as
     * the (uncorrected) variance of the gaussian
     * kernel estimator.
     */
    p.sample = boot;
    p.var = gsl_rstat_variance(w);
    pdf.params = &p;

    values[i] = numeric_fwhm(min, max, &pdf, 0);
  }

  /* Compute mean and stddev of the fwhms
   * of each newly bootstrapped sample.
   */
  uncert fwhm;
  fwhm.n = gsl_stats_mean(values, 1, boots);
  fwhm.s = gsl_stats_sd(values, 1, boots);

  // free memory
  free(values);
  free(boot);
  gsl_rstat_free(w);

  return fwhm;
}


/* Computes an approximation to the asymptotic right
 * tail-index and its standard deviation by bootstrapping
 * (ie repeated resampling) the original `sample`, `boots`
 * times. The tail-index is estimated by the Hill estimator,
 * hence the name of the function. See `hill_estimator()`
 * for the parameter `t` meaning.
 *
 * The functions returns an `uncert` pair of mean and
 * stddev of the modes computed on each sample.
 */
uncert bootstrap_hill(
    const gsl_rng *r,
    double t,
    double *sample, size_t n,
    size_t boots) {

  double *values = calloc(boots, sizeof(double));
  double *boot = calloc(n, sizeof(double));

  for (size_t i = 0; i < boots; i++) {
    /* The sampling is simply done by generating
     * an array index uniformely in [0, n-1].
     */
    for (size_t j = 0; j < n; j++) {
      size_t choice = gsl_rng_uniform_int(r, n);
      boot[j] = sample[choice];
    }
    qsort(boot, n, sizeof(double), cmp_double);
    values[i] = hill_estimator(boot, t, n);
  }

  /* Compute mean and stddev of the modes
   * of each newly bootstrapped sample.
   */
  uncert index;
  index.n = gsl_stats_mean(values, 1, boots);
  index.s = gsl_stats_sd(values, 1, boots);

  // free memory
  free(values);
  free(boot);

  return index;
}