diff --git a/ex-7/fisher.c b/ex-7/fisher.c
index 8216595..25d8279 100644
--- a/ex-7/fisher.c
+++ b/ex-7/fisher.c
@@ -39,9 +39,9 @@ gsl_vector* normal_mean(struct par *p) {
  * between the two classes.
  * The projection vector w is given by
  *
- *     w = Sw⁻¹ (μ₂ - μ₁)
+ *     w = Σ_w⁻¹ (μ₂ - μ₁)
  *
- * where Sw = Σ₁ + Σ₂ is the so-called within-class
+ * where Σ_w = Σ₁ + Σ₂ is the so-called within-class
  * covariance matrix.
  */
 gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
@@ -54,10 +54,11 @@ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
   gsl_vector *mu2 = normal_mean(&c2->p);
 
   /* Compute the inverse of the within-class
-   * covariance Sw⁻¹.
+   * covariance Σ_w⁻¹.
    * Note: by definition Σ is symmetrical and
    * positive-definite, so Cholesky is appropriate.
    */
+  gsl_matrix_add(cov1, cov2);
   gsl_linalg_cholesky_decomp(cov1);
   gsl_linalg_cholesky_invert(cov1);
 
@@ -67,7 +68,7 @@ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
   gsl_vector_memcpy(diff, mu2);
   gsl_vector_sub(diff, mu1);
 
-  /* Finally multiply diff by Sw.
+  /* Finally multiply diff by Σ_w.
    * This uses the rather low-level CBLAS
    * functions gsl_blas_dgemv:
    *
diff --git a/ex-7/iters/iter.py b/ex-7/iters/iter.py
new file mode 100644
index 0000000..44e0aae
--- /dev/null
+++ b/ex-7/iters/iter.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+# columns of iters.txt: iteration number, w_x, w_y, bias
+iter, w_x, w_y, b = np.loadtxt('ex-7/iters/iters.txt', unpack=True)
+
+plt.figure(figsize=(5, 4))
+plt.rcParams['font.size'] = 8
+
+plt.subplot(211)
+plt.title('weight vector', loc='right')
+plt.plot(iter, w_x, color='#92182b', label='$w_x$')
+plt.plot(iter, w_y, color='#92182b', linestyle='--', label='$w_y$')
+plt.ylabel('w')
+plt.xlabel('N')
+plt.legend()
+
+plt.subplot(212)
+plt.title('bias', loc='right')
+plt.plot(iter, b, color='gray')
+plt.ylabel('b')
+plt.xlabel('N')
+
+plt.tight_layout()
+plt.show()
diff --git a/ex-7/iters/iters.txt b/ex-7/iters/iters.txt
new file mode 100644
index 0000000..e9db0ec
--- /dev/null
+++ b/ex-7/iters/iters.txt
@@ -0,0 +1,11 @@
+#iters  w_x    w_y    b
+ 1      0.427  0.904  1.749
+ 2      0.566  0.824  1.445
+ 3      0.654  0.756  1.213
+ 4      0.654  0.756  1.213
+ 5      0.654  0.756  1.213
+ 6      0.654  0.756  1.213
+ 7      0.654  0.756  1.213
+ 8      0.654  0.756  1.213
+ 9      0.654  0.756  1.213
+10      0.654  0.756  1.213
diff --git a/ex-7/main.c b/ex-7/main.c
index 67f146e..7bc5b76 100644
--- a/ex-7/main.c
+++ b/ex-7/main.c
@@ -95,7 +95,7 @@ int main(int argc, char **argv) {
      * dataset to get an approximate
      * solution in `iter` iterations.
      */
-    fputs("# Perceptron \n\n", stderr);
+    // fputs("# Perceptron \n\n", stderr);
     w = percep_train(signal, noise, opts.iter, &cut);
   }
   else {
@@ -107,20 +107,21 @@ int main(int argc, char **argv) {
   /* Print the results of the method
    * selected: weights and threshold.
    */
+  fprintf(stderr, "\n* i: %d\n", opts.iter);
   fprintf(stderr, "* w: [%.3f, %.3f]\n",
           gsl_vector_get(w, 0),
           gsl_vector_get(w, 1));
   fprintf(stderr, "* cut: %.3f\n", cut);
-  gsl_vector_fprintf(stdout, w, "%g");
-  printf("%f\n", cut);
+//  gsl_vector_fprintf(stdout, w, "%g");
+//  printf("%f\n", cut);
 
   /* Print data to stdout for plotting.
    * Note: we print the sizes to be able
    * to set apart the two matrices.
    */
-  printf("%ld %ld %d\n", opts.nsig, opts.nnoise, 2);
-  gsl_matrix_fprintf(stdout, signal->data, "%g");
-  gsl_matrix_fprintf(stdout, noise->data, "%g");
+//  printf("%ld %ld %d\n", opts.nsig, opts.nnoise, 2);
+//  gsl_matrix_fprintf(stdout, signal->data, "%g");
+//  gsl_matrix_fprintf(stdout, noise->data, "%g");
 
   // free memory
   gsl_rng_free(r);
diff --git a/ex-7/plot.py b/ex-7/plot.py
index 5cfbdeb..04ea7e7 100755
--- a/ex-7/plot.py
+++ b/ex-7/plot.py
@@ -7,7 +7,6 @@ def line(x, y, **args):
     '''line between two points x,y'''
     plot([x[0], y[0]], [x[1], y[1]], **args)
 
-rcParams['font.size'] = 12
 w = loadtxt(sys.stdin, max_rows=2)
 v = array([[0, -1], [1, 0]]) @ w
 cut = float(input())
@@ -16,7 +15,11 @@ n, m, d = map(int, input().split())
 data = loadtxt(sys.stdin).reshape(n + m, d)
 signal, noise = data[:n].T, data[n:].T
 
-figure()
+figure(figsize=(3, 3))
+rcParams['font.size'] = 8
 subplot(aspect='equal')
 scatter(*signal, edgecolor='xkcd:charcoal',
         c='xkcd:dark yellow', label='signal')
@@ -24,18 +27,22 @@ scatter(*noise, edgecolor='xkcd:charcoal',
         c='xkcd:pale purple', label='noise')
 line(-20*w, 20*w, c='xkcd:midnight blue', label='projection')
 line(w-10*v, w+10*v, c='xkcd:scarlet', label='cut')
+xlabel('x')
+ylabel('y')
 xlim(-1.5, 8)
 ylim(-1.5, 8)
 legend()
 tight_layout()
+savefig('notes/images/7-fisher-plane.pdf')
 
-figure()
+figure(figsize=(3, 3))
+rcParams['font.size'] = 8
 sig_proj = np.dot(w, signal)
 noise_proj = np.dot(w, noise)
 hist(sig_proj, color='xkcd:dark yellow', label='signal')
 hist(noise_proj, color='xkcd:pale purple', label='noise')
-axvline(cut, c='xkcd:scarlet')
+axvline(cut, c='xkcd:scarlet', label='cut')
+xlabel('projection line')
 legend()
 tight_layout()
-
-show()
+savefig('notes/images/7-fisher-proj.pdf')
diff --git a/notes/docs/bibliography.bib b/notes/docs/bibliography.bib
index 50df816..987787c 100644
--- a/notes/docs/bibliography.bib
+++ b/notes/docs/bibliography.bib
@@ -140,17 +140,32 @@
 @book{hecht02,
   title={Optics},
+  author={Eugene Hecht},
   year={2002},
-  publisher={Pearson},
-  author={Eugene Hecht}
+  publisher={Pearson}
 }
 
 @article{lucy74,
   title={An iterative technique for the rectification of observed distributions},
-  author={Lucy, Leon B},
+  author={Lucy, Leon B.},
   journal={The astronomical journal},
   volume={79},
   pages={745},
   year={1974}
 }
+
+@book{bishop06,
+  title={Pattern Recognition and Machine Learning},
+  author={Bishop, Christopher M.},
+  year={2006},
+  pages={186--189},
+  publisher={Springer}
+}
+
+@techreport{novikoff63,
+  title={On convergence proofs for perceptrons},
+  author={Novikoff, Albert B.},
+  year={1963},
+  institution={Stanford Research Institute, Menlo Park, CA}
+}
diff --git a/notes/images/7-fisher-plane.pdf b/notes/images/7-fisher-plane.pdf
index c74d619..1f31fad 100644
Binary files a/notes/images/7-fisher-plane.pdf and b/notes/images/7-fisher-plane.pdf differ
diff --git a/notes/images/7-fisher-proj.pdf b/notes/images/7-fisher-proj.pdf
index 21ddf21..b141ae4 100644
Binary files a/notes/images/7-fisher-proj.pdf and b/notes/images/7-fisher-proj.pdf differ
diff --git a/notes/images/7-points.pdf b/notes/images/7-points.pdf
index 6d8b3f5..1486857 100644
Binary files a/notes/images/7-points.pdf and b/notes/images/7-points.pdf differ
diff --git a/notes/sections/7.md b/notes/sections/7.md
index 7b6e415..801ed05 100644
--- a/notes/sections/7.md
+++ b/notes/sections/7.md
@@ -2,9 +2,8 @@
 
 ## Generating points according to Gaussian distributions {#sec:sampling}
 
-The first task of exercise 7 is to generate two sets of 2D points $(x, y)$
-according to two bivariate Gaussian distributions with parameters:
-
+Two sets of 2D points $(x, y)$, a signal and a noise sample, are to be
+generated according to two bivariate Gaussian distributions with parameters:
 $$
 \text{signal} \quad
 \begin{cases}
@@ -21,262 +20,361 @@ $$
 \end{cases}
 $$
 
-where $\mu$ stands for the mean, $\sigma_x$ and $\sigma_y$ are the standard
+where $\mu$ stands for the mean, $\sigma_x$ and $\sigma_y$ for the standard
 deviations in $x$ and $y$ directions respectively and $\rho$ is the bivariate
-correlation, hence:
-
+correlation, namely:
 $$
 \sigma_{xy} = \rho \sigma_x \sigma_y
 $$
 
 where $\sigma_{xy}$ is the covariance of $x$ and $y$.
 In the code, default settings are $N_s = 800$ points for the signal and $N_n =
-1000$ points for the noise but can be changed from the command-line. Both
-samples were handled as matrices of dimension $n$ x 2, where $n$ is the number
-of points in the sample. The library `gsl_matrix` provided by GSL was employed
-for this purpose and the function `gsl_ran_bivariate_gaussian()` was used for
-generating the points.
+1000$ points for the noise, but they can be customized from the command line.
+Both samples were handled as matrices of dimension $n \times 2$, where $n$ is
+the number of points in the sample. The library `gsl_matrix` provided by GSL
+was employed for this purpose and the function `gsl_ran_bivariate_gaussian()`
+was used for generating the points.
 
 An example of the two samples is shown in @fig:points.
 
-![Example of points sorted according to two Gaussian with
-the given parameters. Noise points in pink and signal points
-in yellow.](images/7-points.pdf){#fig:points}
+![Example of points sampled according to the two Gaussian distributions
+with the given parameters.](images/7-points.pdf){#fig:points}
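+
+As a reference for this sampling step, a minimal Python sketch that mirrors it
+with numpy is given below (the actual implementation uses GSL and
+`gsl_ran_bivariate_gaussian()`; the function and variable names here are
+illustrative only):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng()
+
+def bivariate_sample(mu, sigma_x, sigma_y, rho, n):
+    """Draw an n×2 sample from a bivariate Gaussian with the given
+    means, standard deviations and correlation."""
+    cov = [[sigma_x**2,              rho * sigma_x * sigma_y],
+           [rho * sigma_x * sigma_y, sigma_y**2             ]]
+    return rng.multivariate_normal(mu, cov, size=n)
+
+# e.g., with the signal and noise parameters given above:
+#   signal = bivariate_sample(mu_s, sigma_xs, sigma_ys, rho_s, 800)
+#   noise  = bivariate_sample(mu_n, sigma_xn, sigma_yn, rho_n, 1000)
+```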
 
 Assuming not to know how the points were generated, a model of classification
-must then be implemented in order to assign each point to the right class
+is then to be implemented in order to assign each point to the right class
 (signal or noise) to which it 'most probably' belongs to. The point is how
-'most probably' can be interpreted and implemented.
+'most probably' can be interpreted and implemented.
+Here, the Fisher linear discriminant and the Perceptron are implemented and
+described in the following two sections; their results are compared in
+@sec:7_results.
+
 
 ## Fisher linear discriminant
 
+
 ### The projection direction
 
 The Fisher linear discriminant (FLD) is a linear classification model based on
 dimensionality reduction. It allows to reduce this 2D classification problem
 into a one-dimensional decision surface.
 
-Consider the case of two classes (in this case the signal and the noise): the
-simplest representation of a linear discriminant is obtained by taking a linear
-function of a sampled 2D point $x$ so that:
-
+Consider the case of two classes (in this case signal and noise): the simplest
+representation of a linear discriminant is obtained by taking a linear function
+$\hat{x}$ of a sampled 2D point $x$ so that:
 $$
   \hat{x} = w^T x
 $$
 
-where $w$ is the so-called 'weight vector'. An input point $x$ is commonly
-assigned to the first class if $\hat{x} \geqslant w_{th}$ and to the second one
-otherwise, where $w_{th}$ is a threshold value somehow defined.
-In general, the projection onto one dimension leads to a considerable loss of
-information and classes that are well separated in the original 2D space may
-become strongly overlapping in one dimension. However, by adjusting the
-components of the weight vector, a projection that maximizes the classes
-separation can be selected.
+where $w$ is the so-called 'weight vector' and $w^T$ stands for its transpose.
+An input point $x$ is commonly assigned to the first class if $\hat{x} \geqslant
+w_{th}$ and to the second one otherwise, where $w_{th}$ is a suitably defined
+threshold value. In general, the projection onto one dimension leads to a
+considerable loss of information and classes that are well separated in the
+original 2D space may become strongly overlapping in one dimension. However, by
+adjusting the components of the weight vector, a projection that maximizes the
+classes separation can be selected [@bishop06].
 
 To begin with, consider $N_1$ points of class $C_1$ and $N_2$ points of class
-$C_2$, so that the means $m_1$ and $m_2$ of the two classes are given by:
-
+$C_2$, so that the means $\mu_1$ and $\mu_2$ of the two classes are given by:
 $$
-  m_1 = \frac{1}{N_1} \sum_{n \in C_1} x_n
+  \mu_1 = \frac{1}{N_1} \sum_{n \in C_1} x_n
   \et
-  m_2 = \frac{1}{N_2} \sum_{n \in C_2} x_n
+  \mu_2 = \frac{1}{N_2} \sum_{n \in C_2} x_n
 $$
 
 The simplest measure of the separation of the classes is the separation of the
 projected class means. This suggests to choose $w$ so as to maximize:
-
 $$
-  \hat{m}_2 − \hat{m}_1 = w^T (m_2 − m_1)
+  \hat{\mu}_2 - \hat{\mu}_1 = w^T (\mu_2 - \mu_1)
 $$
 
 This expression can be made arbitrarily large simply by increasing the magnitude
 of $w$. To solve this problem, $w$ can be constrained to have unit length, so
 that $| w^2 | = 1$. Using a Lagrange multiplier to perform the constrained
-maximization, it can be found that $w \propto (m_2 − m_1)$.
-
-![The plot on the left shows samples from two classes along with the histograms
-resulting from projection onto the line joining the class means: note that
-there is considerable overlap in the projected space. The right plot shows the
-corresponding projection based on the Fisher linear discriminant, showing the
-greatly improved classes separation.](images/7-fisher.png){#fig:overlap}
-
+maximization, it can be found that $w \propto (\mu_2 - \mu_1)$, meaning that the
+line onto which the points must be projected is the one joining the class means.
 There is still a problem with this approach, however, as illustrated in
 @fig:overlap: the two classes are well separated in the original 2D space but
-have considerable overlap when projected onto the line joining their means.
+have considerable overlap when projected onto the line joining their means,
+which maximizes the distance between the projected means.
+
+![The plot on the left shows samples from two classes along with the
+histograms resulting from the projection onto the line joining the
+class means: note that there is considerable overlap in the projected
+space. The right plot shows the corresponding projection based on the
+Fisher linear discriminant, showing the greatly improved classes
+separation. Figure from [@bishop06].](images/7-fisher.png){#fig:overlap}
+
 The idea to solve it is to maximize a function that will give a large
 separation between the projected classes means while also giving a small
 variance within each class, thereby minimizing the class overlap.
 
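+In code, this figure of merit can be computed directly from the projected
+points; the short numpy sketch below anticipates the definitions given next
+(the function and array names are illustrative and not part of the actual C
+implementation):
+
+```python
+import numpy as np
+
+def fisher_criterion(w, c1, c2):
+    """Separation of the projected class means over the total
+    within-class variance, for samples c1, c2 of shape (n, 2)."""
+    p1, p2 = c1 @ w, c2 @ w            # projected (1D) points
+    m1, m2 = p1.mean(), p2.mean()      # projected class means
+    s2 = ((p1 - m1)**2).sum() + ((p2 - m2)**2).sum()
+    return (m2 - m1)**2 / s2
+```
+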
-The within-classes variance of the transformed data of each class $k$ is given
+The within-class variance of the transformed data of each class $k$ is given
 by:
-
 $$
-  s_k^2 = \sum_{n \in C_k} (\hat{x}_n - \hat{m}_k)^2
+  \hat{s}_k^2 = \sum_{n \in c_k} (\hat{x}_n - \hat{\mu}_k)^2
 $$
 
-The total within-classes variance for the whole data set can be simply defined
-as $s^2 = s_1^2 + s_2^2$. The Fisher criterion is therefore defined to be the
-ratio of the between-classes distance to the within-classes variance and is
+The total within-class variance for the whole data set is simply defined as
+$\hat{s}^2 = \hat{s}_1^2 + \hat{s}_2^2$. The Fisher criterion is defined to
+be the ratio of the between-class distance to the within-class variance and is
 given by:
-
 $$
-  J(w) = \frac{(\hat{m}_2 - \hat{m}_1)^2}{s^2}
+  F(w) = \frac{(\hat{\mu}_2 - \hat{\mu}_1)^2}{\hat{s}^2}
 $$
 
-Differentiating $J(w)$ with respect to $w$, it can be found that it is
-maximized when:
+The dependence on $w$ can be made explicit:
+\begin{align*}
+  (\hat{\mu}_2 - \hat{\mu}_1)^2 &= (w^T \mu_2 - w^T \mu_1)^2 \\
+  &= [w^T (\mu_2 - \mu_1)]^2 \\
+  &= [w^T (\mu_2 - \mu_1)][w^T (\mu_2 - \mu_1)] \\
+  &= [w^T (\mu_2 - \mu_1)][(\mu_2 - \mu_1)^T w]
+   = w^T M w
+\end{align*}
+
+where $M = (\mu_2 - \mu_1)(\mu_2 - \mu_1)^T$ is the between-class covariance
+matrix. Similarly, as regards the denominator:
+\begin{align*}
+  \hat{s}^2 &= \hat{s}_1^2 + \hat{s}_2^2 \\
+  &= \sum_{n \in c_1} (\hat{x}_n - \hat{\mu}_1)^2
+   + \sum_{n \in c_2} (\hat{x}_n - \hat{\mu}_2)^2
+   = w^T \Sigma_w w
+\end{align*}
+
+where $\Sigma_w$ is the total within-class covariance matrix:
+\begin{align*}
+  \Sigma_w &= \sum_{n \in c_1} (x_n - \mu_1)(x_n - \mu_1)^T
+            + \sum_{n \in c_2} (x_n - \mu_2)(x_n - \mu_2)^T \\
+  &= \Sigma_1 + \Sigma_2
+   = \begin{pmatrix}
+       \sigma_x^2  & \sigma_{xy} \\
+       \sigma_{xy} & \sigma_y^2
+     \end{pmatrix}_1
+   + \begin{pmatrix}
+       \sigma_x^2  & \sigma_{xy} \\
+       \sigma_{xy} & \sigma_y^2
+     \end{pmatrix}_2
+\end{align*}
+
+where $\Sigma_1$ and $\Sigma_2$ are the covariance matrices of the two samples.
+The Fisher criterion can therefore be rewritten in the form:
 $$
-  w = S_b^{-1} (m_2 - m_1)
+  F(w) = \frac{w^T M w}{w^T \Sigma_w w}
 $$
 
-where $S_b$ is the covariance matrix, given by:
-
+Differentiating with respect to $w$, it can be found that $F(w)$ is maximized
+when:
 $$
-  S_b = S_1 + S_2
+  w = \Sigma_w^{-1} (\mu_2 - \mu_1)
 $$
 
-where $S_1$ and $S_2$ are the covariance matrix of the two classes, namely:
-
-$$
-\begin{pmatrix}
-\sigma_x^2 & \sigma_{xy} \\
-\sigma_{xy} & \sigma_y^2
-\end{pmatrix}
-$$
-
-This is not truly a discriminant but rather a specific choice of direction for
-projection of the data down to one dimension: the projected data can then be
+This is not truly a discriminant but rather a specific choice of the direction
+for projection of the data down to one dimension: the projected data can then be
 used to construct a discriminant by choosing a threshold for the classification.
 
 When implemented, the parameters given in @sec:sampling were used to compute
-the covariance matrices $S_1$ and $S_2$ of the two classes and their sum $S$.
-Then $S$, being a symmetrical and positive-definite matrix, was inverted with
-the Cholesky method, already discussed in @sec:MLM.
-Lastly, the matrix-vector product was computed with the `gsl_blas_dgemv()`
-function provided by GSL.
+the covariance matrices and their sum $\Sigma_w$. Then $\Sigma_w$, being a
+symmetrical and positive-definite matrix, was inverted with the Cholesky method,
+already discussed in @sec:MLM. Lastly, the matrix-vector product was computed
+with the `gsl_blas_dgemv()` function provided by GSL.
 
 ### The threshold
 
-The cut was fixed by the condition of conditional probability being the same
-for each class:
-
+The threshold $t_{\text{cut}}$ was fixed by the condition of the conditional
+probability $P(c_k | t_{\text{cut}})$ being the same for both classes $c_k$:
 $$
   t_{\text{cut}} = x \, | \hspace{20pt} \frac{P(c_1 | x)}{P(c_2 | x)} =
-  \frac{p(x | c_1) \, p(c_1)}{p(x | c_1) \, p(c_2)} = 1
+  \frac{P(x | c_1) \, P(c_1)}{P(x | c_2) \, P(c_2)} = 1
 $$
 
-where $p(x | c_k)$ is the probability for point $x$ along the Fisher projection
-line of belonging to the class $k$. If the classes are bivariate Gaussian, as
-in the present case, then $p(x | c_k)$ is simply given by its projected normal
-distribution $\mathscr{G} (\hat{μ}, \hat{S})$. With a bit of math, the solution
-is then:
-
+where $P(x | c_k)$ is the probability for point $x$ along the Fisher projection
+line of being sampled according to the class $k$. If each class is a bivariate
+Gaussian, as in the present case, then $P(x | c_k)$ is simply given by its
+projected normal distribution with mean $\hat{\mu}_k = w^T \mu_k$ and variance
+$\hat{s}_k^2 = w^T \Sigma_k w$, where $\Sigma_k$ is the covariance matrix of the
+class. With a bit of math, the following solution can be found:
 $$
-  t = \frac{b}{a} + \sqrt{\left( \frac{b}{a} \right)^2 - \frac{c}{a}}
+  t_{\text{cut}} = \frac{b}{a}
+  + \sqrt{\left( \frac{b}{a} \right)^2 - \frac{c}{a}}
 $$
 
 where:
 
-  - $a = \hat{S}_1^2 - \hat{S}_2^2$
-  - $b = \hat{m}_2 \, \hat{S}_1^2 - \hat{M}_1 \, \hat{S}_2^2$
-  - $c = \hat{M}_2^2 \, \hat{S}_1^2 - \hat{M}_1^2 \, \hat{S}_2^2
-    - 2 \, \hat{S}_1^2 \, \hat{S}_2^2 \, \ln(\alpha)$
-  - $\alpha = p(c_1) / p(c_2)$
-
-The ratio of the prior probability $\alpha$ was computed as:
+  - $a = \hat{s}_1^2 - \hat{s}_2^2$
+  - $b = \hat{\mu}_2 \, \hat{s}_1^2 - \hat{\mu}_1 \, \hat{s}_2^2$
+  - $c = \hat{\mu}_2^2 \, \hat{s}_1^2 - \hat{\mu}_1^2 \, \hat{s}_2^2
+    - 2 \, \hat{s}_1^2 \, \hat{s}_2^2 \, \ln(\alpha)$
+  - $\alpha = P(c_1) / P(c_2)$
+
+The ratio of the prior probabilities $\alpha$ is simply given by:
 $$
   \alpha = \frac{N_s}{N_n}
 $$
 
 The projection of the points was accomplished by the use of the function
-`gsl_blas_ddot()`, which computed a dot product between two vectors, which in
-this case were the weight vector and the position of the point to be projected.
+`gsl_blas_ddot()`, which computes the scalar product between two vectors, in
+this case the weight vector and the position of each point.
+
+Results obtained for the samples of @fig:points are shown in @fig:fisher_proj.
+The weight vector and the threshold were found to be:
+$$
+  w = (0.707, 0.707) \et
+  t_{\text{cut}} = 1.323
+$$
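+
+For completeness, the procedure described in this section (projection
+direction, threshold and projection of the points) can be summarized by the
+following numpy sketch; it assumes two arrays `signal` and `noise` of shape
+$(n, 2)$ and is meant only as an illustration of the formulas above, not as
+the actual GSL implementation:
+
+```python
+import numpy as np
+
+def fisher_discriminant(signal, noise):
+    mu1, mu2 = signal.mean(axis=0), noise.mean(axis=0)
+    cov1 = np.cov(signal, rowvar=False)
+    cov2 = np.cov(noise, rowvar=False)
+
+    # w = Σ_w⁻¹ (μ₂ - μ₁); the C code inverts Σ_w with a Cholesky
+    # decomposition, here a linear solve gives the same result
+    w = np.linalg.solve(cov1 + cov2, mu2 - mu1)
+    w = w / np.linalg.norm(w)          # normalized direction
+
+    # projected means and variances of the two classes
+    m1, m2 = w @ mu1, w @ mu2
+    s1, s2 = w @ cov1 @ w, w @ cov2 @ w
+
+    # threshold from the equal-posterior condition
+    alpha = len(signal) / len(noise)   # ratio of the priors N_s / N_n
+    a = s1 - s2
+    b = m2 * s1 - m1 * s2
+    c = m2**2 * s1 - m1**2 * s2 - 2 * s1 * s2 * np.log(alpha)
+    t_cut = b / a + np.sqrt((b / a)**2 - c / a)
+    return w, t_cut
+
+# a point x is then classified by comparing its projection w @ x with t_cut
+```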