analistica/ex-6/dist.c
2020-07-05 11:35:48 +02:00

49 lines
1.3 KiB
C

#include "dist.h"
/* Computes the earth mover's distance, aka 1-Wasserstein
* metric, of two histograms. It gives the minimum cost
* (weight * distance) associated to shifting around the
* weights of the bins to turn one histogram into the other.
*
* For a 1D normalised histogram the distance can be proven
* to be equal to the L¹ distance of the CDF. If the bins are
* equally sized this can be futher simplified to give a simple
* O(n) solution.
*/
double emd_between(gsl_histogram *a, gsl_histogram *b) {
/* For the distance to make sense the
* histograms must have the same area.
*/
normalise(a);
normalise(b);
/* Get the bin width used to compute the cost,
* this assumes the histogram have equal ranges
* and all bin widths are equal.
*/
double lower, upper;
gsl_histogram_get_range(a, 0, &lower, &upper);
double width = upper - lower;
double sum = 0;
double diff = 0;
for (int i = 0; i < a->n; i++) {
diff += gsl_histogram_get(a, i) - gsl_histogram_get(b, i);
sum += fabs(diff);
}
/* Multiply the sum by the bin width to
* compute the final cost
*/
return sum * width;
}
/* Normalise a histogram: ie rescaling the bin
* weights to sum to 1
*/
void normalise(gsl_histogram *h) {
double sum = gsl_histogram_sum(h);
gsl_histogram_scale(h, 1/sum);
}