49 lines
1.3 KiB
C
49 lines
1.3 KiB
C
#include "dist.h"
|
|
|
|
/* Computes the earth mover's distance, aka 1-Wasserstein
|
|
* metric, of two histograms. It gives the minimum cost
|
|
* (weight * distance) associated to shifting around the
|
|
* weights of the bins to turn one histogram into the other.
|
|
*
|
|
* For a 1D normalised histogram the distance can be proven
|
|
* to be equal to the L¹ distance of the CDF. If the bins are
|
|
* equally sized this can be futher simplified to give a simple
|
|
* O(n) solution.
|
|
*/
|
|
double emd_between(gsl_histogram *a, gsl_histogram *b) {
|
|
/* For the distance to make sense the
|
|
* histograms must have the same area.
|
|
*/
|
|
normalise(a);
|
|
normalise(b);
|
|
|
|
/* Get the bin width used to compute the cost,
|
|
* this assumes the histogram have equal ranges
|
|
* and all bin widths are equal.
|
|
*/
|
|
double lower, upper;
|
|
gsl_histogram_get_range(a, 0, &lower, &upper);
|
|
double width = upper - lower;
|
|
|
|
double sum = 0;
|
|
double diff = 0;
|
|
for (int i = 0; i < a->n; i++) {
|
|
diff += gsl_histogram_get(a, i) - gsl_histogram_get(b, i);
|
|
sum += fabs(diff);
|
|
}
|
|
|
|
/* Multiply the sum by the bin width to
|
|
* compute the final cost
|
|
*/
|
|
return sum * width;
|
|
}
|
|
|
|
|
|
/* Normalise a histogram: ie rescaling the bin
|
|
* weights to sum to 1
|
|
*/
|
|
void normalise(gsl_histogram *h) {
|
|
double sum = gsl_histogram_sum(h);
|
|
gsl_histogram_scale(h, 1/sum);
|
|
}
|