analistica/ex-1/tests.c

/* This file contains functions to perform
 * statistical tests on the points sampled
 * from a Landau distribution.
 */

#include <stdio.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_sf_gamma.h>
#include <gsl/gsl_cdf.h>
#include <gsl/gsl_min.h>
#include <gsl/gsl_roots.h>
#include <gsl/gsl_sum.h>

#include "landau.h"
#include "bootstrap.h"


/* Kolmogorov distribution CDF
 * for sample size n and statistic D
 */
double kolmogorov_cdf(double D, int n) {
  double x = sqrt(n) * D;

  // trick to reduce estimate error
  x += 1/(6 * sqrt(n)) + (x - 1)/(4 * n);

  // calculate the first n_terms of the series
  // Σ_k=1 exp(-(2k - 1)²π²/8x²)
  size_t n_terms = 30;
  double *terms = calloc(n_terms, sizeof(double));
  for (size_t k=0; k<n_terms; k++) {
    terms[k] = exp(-pow((2*(double)(k + 1) - 1)*M_PI/x, 2) / 8);
  }

  // do a transform to accelerate the convergence
  double sum, abserr;
  gsl_sum_levin_utrunc_workspace* s = gsl_sum_levin_utrunc_alloc(n_terms);
  gsl_sum_levin_utrunc_accel(terms, n_terms, s, &sum, &abserr);

  fprintf(stderr, "\n## Kolmogorov CDF\n");
  fprintf(stderr, "accel sum: %g\n", sum);
  fprintf(stderr, "plain sum: %g\n", s->sum_plain);
  fprintf(stderr, "err: %g\n",   abserr);

  gsl_sum_levin_utrunc_free(s);
  free(terms);

  return sqrt(2*M_PI)/x * sum;
}


/* This is a high-order function (ie a function that operates
 * on functions) in disguise. It takes a function f and produces
 * a function that computes -f. In lambda calculus it would be
 * the map λf.λx -f(x).
 *
 * Since there is no notion of lambda functions in C (the
 * standard one, at least) we use a trick involving the
 * gsl_function struct: `negate_func(x, fp)` takes a point `x`
 * and a gsl_function `fp` as the usual void pointer `params`.
 * It then calls the function `fp.function` with `x` and their
 * `fp.params` and return the negated result.
 *
 * So, given a `gsl_function f` its negated function is
 * contructed as follows:
 *
 *    gsl_function nf;
 *    nf.function = &negate_func;
 *    nf.params = &f;
 */
double negate_func(double x, void * fp) {
  gsl_function f = *((gsl_function*) fp);
  return -f.function(x, f.params);
}


/* Numerically computes the mode of a Landau
 * distribution by maximising the derivative.
 * The `min,max` parameters are the initial search
 * interval for the optimisation.
 *
 * If `err` is true print the estimate error.
 */
double numeric_mode(double min, double max,
                    gsl_function *pdf,
                    int err) {

  /* Negate the PDF to maximise it by
   * using a GSL minimisation method.
   * (There usually are no maximisation methods)
   */
  gsl_function npdf;
  npdf.function = &negate_func;
  npdf.params   = pdf;

  // initialize minimization
  double x = 0;
  int    max_iter = 100;
  double prec = 1e-7;
  int    status = GSL_CONTINUE;
  const gsl_min_fminimizer_type *T = gsl_min_fminimizer_brent;
  gsl_min_fminimizer *s = gsl_min_fminimizer_alloc(T);
  gsl_min_fminimizer_set(s, &npdf, x, min, max);

  // minimisation
  for (int iter = 0; status == GSL_CONTINUE && iter < max_iter; iter++)
    {
    status = gsl_min_fminimizer_iterate(s);
    x      = gsl_min_fminimizer_x_minimum(s);
    min    = gsl_min_fminimizer_x_lower(s);
    max    = gsl_min_fminimizer_x_upper(s);
    status = gsl_min_test_interval(min, max, 0, prec);
    }

  /* The error is simply given by the width of
   * the final interval containing the solution
   */
  if (err)
    fprintf(stderr, "mode error: %.3g\n", max - min);

  // free memory
  gsl_min_fminimizer_free(s);
  return x;
}


/* A structure containing the half-maximum
 * of a PDF and a gsl_function of the PDF
 * itself, used by `numeric_fwhm` to compute
 * the FWHM.
 */
struct fwhm_params {
  double halfmax;
  gsl_function *pdf;
};


/* This is the implicit equation that is solved
 * in `numeric_fwhm` by a numerical root-finding
 * method. This function takes a point `x`, the
 * parameters defined in `fwhm_params` and returns
 * the value of:
 *
 *    f(x) - f(max)/2
 *
 * where f is the PDF of interest.
 */
double fwhm_equation(double x, void* params) {
  struct fwhm_params p = *((struct fwhm_params*) params);
  return p.pdf->function(x, p.pdf->params) - p.halfmax;
}


/* Numerically computes the FWHM of a PDF
 * distribution using the definition.
 * The `min,max` parameters are the initial search
 * interval for the root search of equation
 * `fwhm_equation`. Two searches are performed in
 * [min, mode] and [mode, max] that will give the
 * two solutions x₋, x₊. The FWHM is then x₊-x₋.
 *
 * If `err` is true print the estimate error.
 */
double numeric_fwhm(double min, double max,
                    gsl_function *pdf,
                    int err) {
  /* Create the gls_function structure
   * for the equation to be solved.
   */
  double mode = numeric_mode(min, max, pdf, 0);
  struct fwhm_params p = {
    pdf->function(mode, pdf->params)/2,
    pdf
  };
  gsl_function equation;
  equation.function = &fwhm_equation;
  equation.params   = &p;

  const gsl_root_fsolver_type *T = gsl_root_fsolver_brent;
  gsl_root_fsolver *s = gsl_root_fsolver_alloc(T);

  // initialize minimization for x₋
  double x, fmin, fmax;
  int    max_iter = 100;
  double prec = 1e-7;
  int    status;

  // minimization
  gsl_root_fsolver_set(s, &equation, min, mode);
  status = GSL_CONTINUE;
  for (int iter = 0; status == GSL_CONTINUE && iter < max_iter; iter++)
  { status = gsl_root_fsolver_iterate(s);
    x      = gsl_root_fsolver_root(s);
    fmin   = gsl_root_fsolver_x_lower(s);
    fmax   = gsl_root_fsolver_x_upper(s);
    status = gsl_min_test_interval(fmin, fmax, 0, prec);
  }
  double x_low = x;
  double err_low = fmax - fmin;

  // initialize minimization for x₊
  gsl_root_fsolver_set(s, &equation, mode, max);

  // minimization
  status = GSL_CONTINUE;
  for (int iter = 0; status == GSL_CONTINUE && iter < max_iter; iter++)
  { status = gsl_root_fsolver_iterate(s);
    x      = gsl_root_fsolver_root(s);
    fmin   = gsl_root_fsolver_x_lower(s);
    fmax   = gsl_root_fsolver_x_upper(s);
    status = gsl_min_test_interval(fmin, fmax, 0, prec);
  }
  double x_upp = x;
  double err_upp = fmax - fmin;

  if (err)
    fprintf(stderr, "fhwm error: %g\n",
            sqrt(err_low*err_low + err_upp*err_upp));

  // free memory
  gsl_root_fsolver_free(s);
  return x_upp - x_low;
}


/* The absolute moment of given `order`
 * of the normal distribution N(0, 1).
 *
 *  μ_k = E[|X|^k] = 2^(k/2) Γ((k+1)/2) / √π
 */
double normal_moment(double order)  {
  return pow(2, order/2) * gsl_sf_gamma((order+1)/2) / sqrt(M_PI);
}


/* Trapani infinite moment test
 *
 * Tests whether the `order`-th moment of the
 * distribution of `sample` is infinite.
 * The test generates an artificial sample of
 * r points, where r = n^θ and returns a p-value.
 */
double trapani(gsl_rng *rng, double theta, double psi,
               int order, double *sample, size_t n) {
  fprintf(stderr, "\n## Trapani test (k = %d)\n", order);

  /* Compute the moment μ_k and rescale it
   * to make it scale-free. This is done by
   * also computing the moment μ_ψ and taking
   *
   *   m_k = μ_k / μ_ψ^(k/ψ)
   *
   * where:
   *   1. ψ ∈ (0, k)
   *   2. k is the `order` of the moment
   */

  double moment = 0;
  double moment_psi = 0;

  for (size_t i = 0; i < n; i++) {
    moment += pow(fabs(sample[i]), order);
    moment_psi += pow(fabs(sample[i]), psi);
  }
  moment /= n;
  moment_psi /= n;
  fprintf(stderr, "%d-th sample moment: %.2g\n", order, moment);

  /* Rescale the moment
   * Note: k/ψ = 2 becase ψ=k/2
   */
  if (psi > 0) {
    fprintf(stderr, "%g-th sample moment: %.2g\n", psi, moment_psi);
    moment /= pow(moment_psi, order/psi);

    /* Rescale further by the standard gaussian
     * moments.
     *
     * m_k' = m_k (μ_ψ)^(k/ψ) / μ_k
     */
    moment *= pow(normal_moment(psi), order/psi) / normal_moment(order);

    fprintf(stderr, "rescaled moment: %.2g\n", moment);
  }

  /* Generate r points from a standard
   * normal distribution and multiply
   * them by √(e^m_k).
   */
  size_t r = round(pow(n, theta));

  fprintf(stderr, "n: %ld\n", n);
  fprintf(stderr, "r: %ld\n", r);
  double factor = sqrt(exp(moment));
  double *noise = calloc(r, sizeof(double));
  for (size_t i = 0; i < r; i++) {
    noise[i] = factor * gsl_ran_gaussian(rng, 1); // σ = 1
  }

  /* Compute the intel of the squared
   * sum of the residues:
   *
   * Θ = ∫[-L, L] sum(u) φ(u)du
   *
   * where sum(u) = 2/√r (Σ_j=0 ^r θ(u - noise[j]) - 1/2)
   *
   * L = 1 and φ(u)=1/2L is a good enough choice.
   */

  // sort the noise sample
  qsort(noise, r, sizeof(double), &cmp_double);

  double integral = 0;
  double L = 1;
  for (size_t i = 0; i < r; i++) {
    if (noise[i] < -L) {
      if (i+1 < r && noise[i+1] < -L)
        factor = 0;
      else if (i+1 < r && noise[i+1] < L)
        factor = noise[i+1] + L;
      else
        factor = 2*L;
    }
    else if (noise[i] < L) {
      if (i+1 < r && noise[i+1] < L)
        factor = noise[i+1] - noise[i];
      else
        factor = L - noise[i];
    }
    else
      factor = 0;
    integral += (i + 1)*((i + 1)/(double)r - 1) * factor;
  }
  integral = r + 2/L * integral;
  fprintf(stderr, "Θ=%g\n", integral);

  // free memory
  free(noise);

  /* Assuming Θ is distributed as a χ² with
   * 1 DoF, compute the the p-value:
   *
   *  p = P(χ² > Θ ; ν=1)
   */
  return gsl_cdf_chisq_Q(integral, 1);
}