From 29dda3e15105e2b6b949b318df0173141d349b99 Mon Sep 17 00:00:00 2001
From: rnhmjoj <rnhmjoj@inventati.org>
Date: Wed, 8 Apr 2020 12:33:11 +0200
Subject: [PATCH] ex-1: implement a KDE-based estimation of the FWHM

---
 ex-1/bootstrap.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++
 ex-1/bootstrap.h |  18 ++++++++
 ex-1/main.c      |  19 ++++----
 ex-1/tests.c     |  43 +++++++++++-------
 ex-1/tests.h     |  12 +++--
 5 files changed, 179 insertions(+), 26 deletions(-)

diff --git a/ex-1/bootstrap.c b/ex-1/bootstrap.c
index e033941..ca69985 100644
--- a/ex-1/bootstrap.c
+++ b/ex-1/bootstrap.c
@@ -1,10 +1,12 @@
 #include <stdlib.h>
 #include <math.h>
+#include <gsl/gsl_roots.h>
 #include <gsl/gsl_rstat.h>
 #include <gsl/gsl_vector.h>
 #include <gsl/gsl_statistics_double.h>
 
 #include "bootstrap.h"
+#include "tests.h"
 
 
 /* Function that compares doubles for sorting:
@@ -101,6 +103,42 @@ double hsm(double *x, size_t n) {
 }
 
 
+/* Parameters needed to compute the
+ * KDE function of a sample.
+ */
+struct kde_params {
+  double n;        // sample size
+  double var;      // sample variance
+  double *sample;  // sample
+};
+
+
+/* A KDE (kernel density estimation) is an
+ * approximation of a sample PDF obtained by
+ * convolving the sample points with a smooth
+ * kernel, in this case a standard gaussian.
+ *
+ * `gauss_kde(x, p)` gives the value of the PDF
+ * at `x`, where `p` is a `kde_params` struct.
+ */
+double gauss_kde(double x, void * params) {
+  struct kde_params p = *((struct kde_params*) params);
+
+  /* Apply the Silverman's rule of thumb to the
+   * bandwidth estimation. The badwidth is given
+   * by the sample variance times a factor which
+   * depends on the number of points and dimension.
+   */
+  double bw = p.var * pow((double)p.n*3.0/4, -2.0/5);
+
+  double sum = 0;
+  for (size_t i = 0; i < p.n; i++)
+    sum += exp(-pow(x - p.sample[i], 2) / (2*bw));
+
+  return sum / sqrt(2*M_PI*bw) / p.n;
+}
+
+
 /* Computes an approximation to the asymptotic median
  * and its standard deviation by bootstrapping (ie
  * repeated resampling) the original `sample`, `boots`
@@ -188,3 +226,78 @@ uncert bootstrap_mode(
 
   return mode;
 }
+
+
+/* Computes an approximation to the asymptotic fwhm
+ * and its standard deviation by bootstrapping (ie
+ * repeated resampling) the original `sample`, `boots`
+ * times.
+ *
+ * `min,max` are the bounds of interval that is
+ * expected to contain the mode of the sample.
+ *
+ * The functions returns an `uncert` pair of mean and
+ * stddev of the fwhms computed on each sample.
+ */
+uncert bootstrap_fwhm(
+    const gsl_rng *r,
+    double min, double max,
+    double *sample, size_t n,
+    int boots) {
+  /* We use a running statistics to compute 
+   * a trimmed mean/variance of the sample.
+   */
+  gsl_rstat_workspace* w = gsl_rstat_alloc();
+
+  // fwhm values and resample
+  double *values = calloc(boots, sizeof(double));
+  double *boot = calloc(n, sizeof(double));
+
+  // set the size here because it's fixed
+  struct kde_params p;
+  p.n = n;
+
+  // struct for numeric_fwhm
+  gsl_function pdf;
+  pdf.function = &gauss_kde;
+
+  for (size_t i = 0; i < boots; i++) {
+    gsl_rstat_reset(w);
+
+    /* The sampling is simply done by generating
+     * an array index uniformely in [0, n-1].
+     */
+    for (size_t j = 0; j < n; j++) {
+      double x = sample[gsl_rng_uniform_int(r, n)];
+      boot[j]  = x;
+      /* Remove too extreme values from the
+       * the trimmed statistics workspace.
+       */
+      if (fabs(x) < 10) gsl_rstat_add(x, w);
+    }
+
+    /* Set the trimmed variance of the sample as
+     * the (uncorrected) variance of the gaussian
+     * kernel estimator.
+     */
+    p.sample = boot;
+    p.var = gsl_rstat_variance(w);
+    pdf.params = &p;
+
+    values[i] = numeric_fwhm(min, max, &pdf, 0);
+  }
+
+  /* Compute mean and stddev of the fwhms
+   * of each newly bootstrapped sample.
+   */
+  uncert fwhm;
+  fwhm.n = gsl_stats_mean(values, 1, boots);
+  fwhm.s = gsl_stats_sd(values, 1, boots);
+
+  // free memory
+  free(values);
+  free(boot);
+  gsl_rstat_free(w);
+
+  return fwhm;
+}
diff --git a/ex-1/bootstrap.h b/ex-1/bootstrap.h
index 47fd9d8..7248ccd 100644
--- a/ex-1/bootstrap.h
+++ b/ex-1/bootstrap.h
@@ -45,3 +45,21 @@ uncert bootstrap_mode(
     const gsl_rng *r,
     double *sample, size_t n,
     int boots);
+
+
+/* Computes an approximation to the asymptotic fwhm
+ * and its standard deviation by bootstrapping (ie
+ * repeated resampling) the original `sample`, `boots`
+ * times.
+ *
+ * `min,max` are the bounds of interval that is
+ * expected to contain the mode of the sample.
+ *
+ * The functions returns an `uncert` pair of mean and
+ * stddev of the fwhms computed on each sample.
+ */
+uncert bootstrap_fwhm(
+    const gsl_rng *r,
+    double min, double max,
+    double *sample, size_t n,
+    int boots);
diff --git a/ex-1/main.c b/ex-1/main.c
index f73f1d5..0ef359d 100644
--- a/ex-1/main.c
+++ b/ex-1/main.c
@@ -24,8 +24,8 @@ int main(int argc, char** argv) {
   size_t samples = 100000;
   double* sample = calloc(samples, sizeof(double));
   size_t bins = 40;
-  double min = -20;
-  double max = 20;
+  double min = -10;
+  double max =  10;
   gsl_histogram* hist = gsl_histogram_alloc(bins);
   gsl_histogram_set_ranges_uniform(hist, min, max);
 
@@ -86,7 +86,7 @@ int main(int argc, char** argv) {
   pdf.function = &landau_pdf;
   pdf.params = NULL;
 
-  double mode_e = numeric_mode(min, max, &pdf);
+  double mode_e = numeric_mode(min, max, &pdf, 1);
   uncert mode_o = bootstrap_mode(r, sample, samples, 100);
 
   // print the results
@@ -94,6 +94,7 @@ int main(int argc, char** argv) {
   fprintf(stderr, "expected mode: %.7f\n", mode_e);
   fprintf(stderr, "observed mode: %.4f±%.4f\n", mode_o.n, mode_o.s);
 
+  // t-test
   double t = fabs(mode_e - mode_o.n)/mode_o.s;
   double p = 1 - erf(t/sqrt(2));
   fprintf(stderr, "\n## t-test\n");
@@ -112,16 +113,17 @@ int main(int argc, char** argv) {
    */
   fprintf(stderr, "\n\n# FWHM comparison\n");
 
-  double fwhm_e = numeric_fwhm(min, max, &pdf);
-  //uncert fwhm_o = bootstrap_fwhm(r, sample, samples, 100);
+  double fwhm_e = numeric_fwhm(min, max, &pdf, 1);
+  uncert fwhm_o = bootstrap_fwhm(r, min, max, sample, samples, 100);
 
   // print the results
   fprintf(stderr, "\n## Results\n");
   fprintf(stderr, "expected fwhm: %.7f\n", fwhm_e);
-  //fprintf(stderr, "observed fwhm: %.4f±%.4f\n", fwhm_o.n, fwhm_o.s);
+  fprintf(stderr, "observed fwhm: %.4f±%.4f\n", fwhm_o.n, fwhm_o.s);
 
-  //t = fabs(fwhm_e - fwhm_o.n)/fwhm_o.s;
-  //p = 1 - erf(t/sqrt(2));
+  // t-test
+  t = fabs(fwhm_e - fwhm_o.n)/fwhm_o.s;
+  p = 1 - erf(t/sqrt(2));
   fprintf(stderr, "\n## t-test\n");
   fprintf(stderr, "t=%.3f\n", t);
   fprintf(stderr, "p=%.3f\n", p);
@@ -141,6 +143,7 @@ int main(int argc, char** argv) {
   fprintf(stderr, "expected median: %.7f\n", med_e);
   fprintf(stderr, "observed median: %.4f±%.4f\n", med_o.n, med_o.s);
 
+  // t-test
   t = fabs(med_e - med_o.n)/med_o.s;
   p = 1 - erf(t/sqrt(2));
   fprintf(stderr, "\n## t-test\n");
diff --git a/ex-1/tests.c b/ex-1/tests.c
index 95d2035..9d54ba7 100644
--- a/ex-1/tests.c
+++ b/ex-1/tests.c
@@ -74,11 +74,14 @@ double negate_func(double x, void * fp) {
 
 /* Numerically computes the mode of a Landau
  * distribution by maximising the derivative.
- * The min,max parameters are the initial search
+ * The `min,max` parameters are the initial search
  * interval for the optimisation.
+ *
+ * If `err` is true print the estimate error.
  */
 double numeric_mode(double min, double max,
-                    gsl_function *pdf) {
+                    gsl_function *pdf,
+                    int err) {
 
   /* Negate the PDF to maximise it by
    * using a GSL minimisation method.
@@ -89,22 +92,22 @@ double numeric_mode(double min, double max,
   npdf.params   = pdf;
 
   // initialize minimization
-  double guess = 0;
+  double x = 0;
   int    iter = 0;
   int    max_iter = 100;
   double prec = 1e-7;
   int    status;
-  const gsl_min_fminimizer_type * T = gsl_min_fminimizer_brent;
-  gsl_min_fminimizer * s = gsl_min_fminimizer_alloc(T);
-  gsl_min_fminimizer_set(s, &npdf, guess, min, max);
+  const gsl_min_fminimizer_type *T = gsl_min_fminimizer_brent;
+  gsl_min_fminimizer *s = gsl_min_fminimizer_alloc(T);
+  gsl_min_fminimizer_set(s, &npdf, x, min, max);
 
   // minimisation
   do {
       iter++;
       status = gsl_min_fminimizer_iterate(s);
-      guess = gsl_min_fminimizer_x_minimum(s);
-      min = gsl_min_fminimizer_x_lower(s);
-      max = gsl_min_fminimizer_x_upper(s);
+      x      = gsl_min_fminimizer_x_minimum(s);
+      min    = gsl_min_fminimizer_x_lower(s);
+      max    = gsl_min_fminimizer_x_upper(s);
       status = gsl_min_test_interval(min, max, prec, prec);
 
   } while (status == GSL_CONTINUE && iter < max_iter);
@@ -112,11 +115,12 @@ double numeric_mode(double min, double max,
   /* The error is simply given by the width of
    * the final interval containing the solution
    */
-  fprintf(stderr, "mode error: %.3g\n", max - min);
+  if (err)
+    fprintf(stderr, "mode error: %.3g\n", max - min);
 
   // free memory
   gsl_min_fminimizer_free(s);
-  return guess;
+  return x;
 }
 
 
@@ -154,13 +158,16 @@ double fwhm_equation(double x, void* params) {
  * `fwhm_equation`. Two searches are performed in
  * [min, mode] and [mode, max] that will give the
  * two solutions x₋, x₊. The FWHM is then x₊-x₋.
+ *
+ * If `err` is true print the estimate error.
  */
 double numeric_fwhm(double min, double max,
-                    gsl_function *pdf) {
+                    gsl_function *pdf,
+                    int err) {
   /* Create the gls_function structure
    * for the equation to be solved.
    */
-  double mode = numeric_mode(min, max, pdf);
+  double mode = numeric_mode(min, max, pdf, 0);
   struct fwhm_params p = {
     pdf->function(mode, pdf->params)/2,
     pdf
@@ -169,8 +176,8 @@ double numeric_fwhm(double min, double max,
   equation.function = &fwhm_equation;
   equation.params   = &p;
 
-  const gsl_root_fsolver_type * T = gsl_root_fsolver_brent;
-  gsl_root_fsolver * s = gsl_root_fsolver_alloc(T);
+  const gsl_root_fsolver_type *T = gsl_root_fsolver_brent;
+  gsl_root_fsolver *s = gsl_root_fsolver_alloc(T);
 
   // initialize minimization for x₋
   double x, fmin, fmax;
@@ -189,6 +196,7 @@ double numeric_fwhm(double min, double max,
     status = gsl_min_test_interval(fmin, fmax, 0, prec);
   }
   double x_low = x;
+  double err_low = fmax - fmin;
 
   // initialize minimization for x₊
   gsl_root_fsolver_set(s, &equation, mode, max);
@@ -203,6 +211,11 @@ double numeric_fwhm(double min, double max,
     status = gsl_min_test_interval(fmin, fmax, 0, prec);
   }
   double x_upp = x;
+  double err_upp = fmax - fmin;
+
+  if (err)
+    fprintf(stderr, "fhwm error: %g\n",
+            sqrt(err_low*err_low + err_upp*err_upp));
 
   // free memory
   gsl_root_fsolver_free(s);
diff --git a/ex-1/tests.h b/ex-1/tests.h
index 9c41cbc..6320040 100644
--- a/ex-1/tests.h
+++ b/ex-1/tests.h
@@ -15,11 +15,14 @@ double kolmogorov_cdf(double D, int n);
 
 /* Numerically computes the mode of a Landau
  * distribution by maximising the derivative.
- * The min,max parameters are the initial search
+ * The `min,max` parameters are the initial search
  * interval for the optimisation.
+ *
+ * If `err` is true print the estimate error.
  */
 double numeric_mode(double min, double max,
-                    gsl_function *pdf);
+                    gsl_function *pdf,
+                    int err);
 
 
 /* Numerically computes the FWHM of a PDF
@@ -29,6 +32,9 @@ double numeric_mode(double min, double max,
  * `fwhm_equation`. Two searches are performed in
  * [min, mode] and [mode, max] that will give the
  * two solutions x₋, x₊. The FWHM is then x₊-x₋.
+ *
+ * If `err` is true print the estimate error.
  */
 double numeric_fwhm(double min, double max,
-                    gsl_function *pdf);
+                    gsl_function *pdf,
+                    int err);