diff --git a/ex-7/fisher.c b/ex-7/fisher.c index 8216595..25d8279 100644 --- a/ex-7/fisher.c +++ b/ex-7/fisher.c @@ -39,9 +39,9 @@ gsl_vector* normal_mean(struct par *p) { * between the two classes. * The projection vector w is given by * - * w = Sw⁻¹ (μ₂ - μ₁) + * w = Σ_w⁻¹ (μ₂ - μ₁) * - * where Sw = Σ₁ + Σ₂ is the so-called within-class + * where Σ_w = Σ₁ + Σ₂ is the so-called within-class * covariance matrix. */ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) { @@ -54,10 +54,11 @@ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) { gsl_vector *mu2 = normal_mean(&c2->p); /* Compute the inverse of the within-class - * covariance Sw⁻¹. + * covariance Σ_w⁻¹. * Note: by definition Σ is symmetrical and * positive-definite, so Cholesky is appropriate. */ + gsl_matrix_add(cov1, cov2); gsl_linalg_cholesky_decomp(cov1); gsl_linalg_cholesky_invert(cov1); @@ -67,7 +68,7 @@ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) { gsl_vector_memcpy(diff, mu2); gsl_vector_sub(diff, mu1); - /* Finally multiply diff by Sw. + /* Finally multiply diff by Σ_w. * This uses the rather low-level CBLAS * functions gsl_blas_dgemv: * diff --git a/ex-7/iters/iter.py b/ex-7/iters/iter.py new file mode 100644 index 0000000..44e0aae --- /dev/null +++ b/ex-7/iters/iter.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python + +import numpy as np +import matplotlib.pyplot as plt + + +iter, w_x, w_y, b = np.loadtxt('ex-7/iters/iters.txt') + +plt.figure(figsize=(5, 4)) +plt.rcParams['font.size'] = 8 + +plt.subplot(211) +plt.title('weight vector', loc='right') +plt.plot(iter, w, color='#92182b') +plt.ylabel('w') +plt.xlabel('N') + +plt.subplot(212) +plt.title('bias', loc='right') +plt.plot(iter, b, color='gray') +plt.ylabel('b') +plt.xlabel('N') + +plt.tight_layout() +plt.show() diff --git a/ex-7/iters/iters.txt b/ex-7/iters/iters.txt new file mode 100644 index 0000000..e9db0ec --- /dev/null +++ b/ex-7/iters/iters.txt @@ -0,0 +1,11 @@ +#iters w_x w_y b b + 1 0.427 0.904 1.749 + 2 0.566 0.824 1.445 + 3 0.654 0.756 1.213 + 4 0.654 0.756 1.213 + 5 0.654 0.756 1.213 + 7 0.654 0.756 1.213 + 6 0.654 0.756 1.213 + 8 0.654 0.756 1.213 + 9 0.654 0.756 1.213 +10 0.654 0.756 1.213 diff --git a/ex-7/main.c b/ex-7/main.c index 67f146e..7bc5b76 100644 --- a/ex-7/main.c +++ b/ex-7/main.c @@ -95,7 +95,7 @@ int main(int argc, char **argv) { * dataset to get an approximate * solution in `iter` iterations. */ - fputs("# Perceptron \n\n", stderr); + // fputs("# Perceptron \n\n", stderr); w = percep_train(signal, noise, opts.iter, &cut); } else { @@ -107,20 +107,21 @@ int main(int argc, char **argv) { /* Print the results of the method * selected: weights and threshold. */ + fprintf(stderr, "\n* i: %d\n", opts.iter); fprintf(stderr, "* w: [%.3f, %.3f]\n", gsl_vector_get(w, 0), gsl_vector_get(w, 1)); fprintf(stderr, "* cut: %.3f\n", cut); - gsl_vector_fprintf(stdout, w, "%g"); - printf("%f\n", cut); +// gsl_vector_fprintf(stdout, w, "%g"); +// printf("%f\n", cut); /* Print data to stdout for plotting. * Note: we print the sizes to be able * to set apart the two matrices. 
*/ - printf("%ld %ld %d\n", opts.nsig, opts.nnoise, 2); - gsl_matrix_fprintf(stdout, signal->data, "%g"); - gsl_matrix_fprintf(stdout, noise->data, "%g"); +// printf("%ld %ld %d\n", opts.nsig, opts.nnoise, 2); +// gsl_matrix_fprintf(stdout, signal->data, "%g"); +// gsl_matrix_fprintf(stdout, noise->data, "%g"); // free memory gsl_rng_free(r); diff --git a/ex-7/plot.py b/ex-7/plot.py index 5cfbdeb..04ea7e7 100755 --- a/ex-7/plot.py +++ b/ex-7/plot.py @@ -7,7 +7,6 @@ def line(x, y, **args): '''line between two points x,y''' plot([x[0], y[0]], [x[1], y[1]], **args) -rcParams['font.size'] = 12 w = loadtxt(sys.stdin, max_rows=2) v = array([[0, -1], [1, 0]]) @ w cut = float(input()) @@ -16,7 +15,11 @@ n, m, d = map(int, input().split()) data = loadtxt(sys.stdin).reshape(n + m, d) signal, noise = data[:n].T, data[n:].T +plt.figure(figsize=(3, 3)) +rcParams['font.size'] = 8 figure() +figure(figsize=(3, 3)) +rcParams['font.size'] = 8 subplot(aspect='equal') scatter(*signal, edgecolor='xkcd:charcoal', c='xkcd:dark yellow', label='signal') @@ -24,18 +27,22 @@ scatter(*noise, edgecolor='xkcd:charcoal', c='xkcd:pale purple', label='noise') line(-20*w, 20*w, c='xkcd:midnight blue', label='projection') line(w-10*v, w+10*v, c='xkcd:scarlet', label='cut') +xlabel('x') +ylabel('y') xlim(-1.5, 8) ylim(-1.5, 8) legend() tight_layout() +savefig('notes/images/7-fisher-plane.pdf') -figure() +plt.figure(figsize=(3, 3)) +rcParams['font.size'] = 8 sig_proj = np.dot(w, signal) noise_proj = np.dot(w, noise) hist(sig_proj, color='xkcd:dark yellow', label='signal') hist(noise_proj, color='xkcd:pale purple', label='noise') -axvline(cut, c='xkcd:scarlet') +axvline(cut, c='xkcd:scarlet', label='cut') +xlabel('projection line') legend() tight_layout() - -show() +savefig('notes/images/7-fisher-proj.pdf') diff --git a/notes/docs/bibliography.bib b/notes/docs/bibliography.bib index 50df816..987787c 100644 --- a/notes/docs/bibliography.bib +++ b/notes/docs/bibliography.bib @@ -140,17 +140,32 @@ @book{hecht02, title={Optics}, + author={Eugene Hecht}, year={2002}, - publisher={Pearson}, - author={Eugene Hecht} + publisher={Pearson} } @article{lucy74, title={An iterative technique for the rectification of observed distributions}, - author={Lucy, Leon B}, + author={Lucy, Leon B.}, journal={The astronomical journal}, volume={79}, pages={745}, year={1974} } + +@book{bishop06, + title={Pattern Recognition and Machine Learning}, + author={Bishop, Christopher M.}, + year={2006}, + pages={186 -- 189}, + publisher={Springer} +} + +@techreport{novikoff63, + title={On convergence proofs for perceptrons}, + author={Novikoff, Albert B}, + year={1963}, + institution={Stanford Researhc INST Menlo Park CA} +} diff --git a/notes/images/7-fisher-plane.pdf b/notes/images/7-fisher-plane.pdf index c74d619..1f31fad 100644 Binary files a/notes/images/7-fisher-plane.pdf and b/notes/images/7-fisher-plane.pdf differ diff --git a/notes/images/7-fisher-proj.pdf b/notes/images/7-fisher-proj.pdf index 21ddf21..b141ae4 100644 Binary files a/notes/images/7-fisher-proj.pdf and b/notes/images/7-fisher-proj.pdf differ diff --git a/notes/images/7-points.pdf b/notes/images/7-points.pdf index 6d8b3f5..1486857 100644 Binary files a/notes/images/7-points.pdf and b/notes/images/7-points.pdf differ diff --git a/notes/sections/7.md b/notes/sections/7.md index 7b6e415..801ed05 100644 --- a/notes/sections/7.md +++ b/notes/sections/7.md @@ -2,9 +2,8 @@ ## Generating points according to Gaussian distributions {#sec:sampling} -The first task of exercise 7 is to generate two 
sets of 2D points $(x, y)$ -according to two bivariate Gaussian distributions with parameters: - +Two sets of 2D points $(x, y)$ - signal and noise - is to be generated according +to two bivariate Gaussian distributions with parameters: $$ \text{signal} \quad \begin{cases} @@ -21,262 +20,361 @@ $$ \end{cases} $$ -where $\mu$ stands for the mean, $\sigma_x$ and $\sigma_y$ are the standard +where $\mu$ stands for the mean, $\sigma_x$ and $\sigma_y$ for the standard deviations in $x$ and $y$ directions respectively and $\rho$ is the bivariate -correlation, hence: - +correlation, namely: $$ \sigma_{xy} = \rho \sigma_x \sigma_y $$ where $\sigma_{xy}$ is the covariance of $x$ and $y$. In the code, default settings are $N_s = 800$ points for the signal and $N_n = -1000$ points for the noise but can be changed from the command-line. Both -samples were handled as matrices of dimension $n$ x 2, where $n$ is the number -of points in the sample. The library `gsl_matrix` provided by GSL was employed -for this purpose and the function `gsl_ran_bivariate_gaussian()` was used for -generating the points. +1000$ points for the noise but can be customized from the input command-line. +Both samples were handled as matrices of dimension $n$ x 2, where $n$ is the +number of points in the sample. The library `gsl_matrix` provided by GSL was +employed for this purpose and the function `gsl_ran_bivariate_gaussian()` was +used for generating the points. An example of the two samples is shown in @fig:points. -![Example of points sorted according to two Gaussian with -the given parameters. Noise points in pink and signal points -in yellow.](images/7-points.pdf){#fig:points} +![Example of points sampled according to the two Gaussian distributions +with the given parameters.](images/7-points.pdf){#fig:points} Assuming not to know how the points were generated, a model of classification -must then be implemented in order to assign each point to the right class +is then to be implemented in order to assign each point to the right class (signal or noise) to which it 'most probably' belongs to. The point is how -'most probably' can be interpreted and implemented. +'most probably' can be interpreted and implemented. +Here, the Fisher linear discriminant and the Perceptron were implemented and +described in the following two sections. The results are compared in +@sec:7_results. + ## Fisher linear discriminant + ### The projection direction The Fisher linear discriminant (FLD) is a linear classification model based on dimensionality reduction. It allows to reduce this 2D classification problem into a one-dimensional decision surface. -Consider the case of two classes (in this case the signal and the noise): the -simplest representation of a linear discriminant is obtained by taking a linear -function of a sampled 2D point $x$ so that: - +Consider the case of two classes (in this case signal and noise): the simplest +representation of a linear discriminant is obtained by taking a linear function +$\hat{x}$ of a sampled 2D point $x$ so that: $$ \hat{x} = w^T x $$ -where $w$ is the so-called 'weight vector'. An input point $x$ is commonly -assigned to the first class if $\hat{x} \geqslant w_{th}$ and to the second one -otherwise, where $w_{th}$ is a threshold value somehow defined. -In general, the projection onto one dimension leads to a considerable loss of -information and classes that are well separated in the original 2D space may -become strongly overlapping in one dimension. 
However, by adjusting the -components of the weight vector, a projection that maximizes the classes -separation can be selected. +where $w$ is the so-called 'weight vector' and $w^T$ stands for its transpose. +An input point $x$ is commonly assigned to the first class if $\hat{x} \geqslant +w_{th}$ and to the second one otherwise, where $w_{th}$ is a threshold value +somehow defined. In general, the projection onto one dimension leads to a +considerable loss of information and classes that are well separated in the +original 2D space may become strongly overlapping in one dimension. However, by +adjusting the components of the weight vector, a projection that maximizes the +classes separation can be selected [@bishop06]. To begin with, consider $N_1$ points of class $C_1$ and $N_2$ points of class -$C_2$, so that the means $m_1$ and $m_2$ of the two classes are given by: - +$C_2$, so that the means $\mu_1$ and $\mu_2$ of the two classes are given by: $$ - m_1 = \frac{1}{N_1} \sum_{n \in C_1} x_n + \mu_1 = \frac{1}{N_1} \sum_{n \in C_1} x_n \et - m_2 = \frac{1}{N_2} \sum_{n \in C_2} x_n + \mu_2 = \frac{1}{N_2} \sum_{n \in C_2} x_n $$ The simplest measure of the separation of the classes is the separation of the projected class means. This suggests to choose $w$ so as to maximize: - $$ - \hat{m}_2 − \hat{m}_1 = w^T (m_2 − m_1) + \hat{\mu}_2 − \hat{\mu}_1 = w^T (\mu_2 − \mu_1) $$ This expression can be made arbitrarily large simply by increasing the magnitude of $w$. To solve this problem, $w$ can be constrained to have unit length, so that $| w^2 | = 1$. Using a Lagrange multiplier to perform the constrained -maximization, it can be found that $w \propto (m_2 − m_1)$. - -![The plot on the left shows samples from two classes along with the histograms -resulting from projection onto the line joining the class means: note that -there is considerable overlap in the projected space. The right plot shows the -corresponding projection based on the Fisher linear discriminant, showing the -greatly improved classes separation.](images/7-fisher.png){#fig:overlap} - +maximization, it can be found that $w \propto (\mu_2 − \mu_1)$, meaning that the +line onto the points must be projected is the one joining the class means. There is still a problem with this approach, however, as illustrated in @fig:overlap: the two classes are well separated in the original 2D space but -have considerable overlap when projected onto the line joining their means. +have considerable overlap when projected onto the line joining their means +which maximize their projections distance. + +![The plot on the left shows samples from two classes along with the +histograms resulting fromthe projection onto the line joining the +class means: note that there is considerable overlap in the projected +space. The right plot shows the corresponding projection based on the +Fisher linear discriminant, showing the greatly improved classes +separation. Fifure from [@bishop06]](images/7-fisher.png){#fig:overlap} + The idea to solve it is to maximize a function that will give a large separation between the projected classes means while also giving a small variance within each class, thereby minimizing the class overlap. 
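To make this concrete, here is a tiny numpy sketch with made-up numbers (deliberately not the parameters of this exercise, whose special symmetry is discussed further below): two classes share the same elongated covariance, and the projection onto the line joining the means is compared with the Fisher projection derived in the following, using the separation of the projected means in units of the projected spread as a figure of merit.

```python
import numpy as np

# hypothetical classes: identical, strongly correlated covariances and a mean
# difference which is not an eigenvector of the covariance, so that the two
# candidate directions actually differ
mu1, mu2 = np.array([0., 0.]), np.array([2., 0.])
cov = np.array([[3.0, 2.7],
                [2.7, 3.0]])

def separation(w):
    """distance of the projected means in units of the projected spread"""
    w = w / np.linalg.norm(w)
    return abs(w @ (mu2 - mu1)) / np.sqrt(w @ cov @ w)

naive  = mu2 - mu1                               # line joining the means
fisher = np.linalg.solve(cov + cov, mu2 - mu1)   # w = Σ_w⁻¹ (μ₂ - μ₁), see below

print(separation(naive), separation(fisher))
```

With these numbers the projection onto the line joining the means separates them by only about 1.2 projected standard deviations, while the Fisher direction raises that figure to about 2.7.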
-The within-classes variance of the transformed data of each class $k$ is given +The within-class variance of the transformed data of each class $k$ is given by: - $$ - s_k^2 = \sum_{n \in C_k} (\hat{x}_n - \hat{m}_k)^2 + \hat{s}_k^2 = \sum_{n \in c_k} (\hat{x}_n - \hat{\mu}_k)^2 $$ -The total within-classes variance for the whole data set can be simply defined -as $s^2 = s_1^2 + s_2^2$. The Fisher criterion is therefore defined to be the -ratio of the between-classes distance to the within-classes variance and is +The total within-class variance for the whole data set is simply defined as +$\hat{s}^2 = \hat{s}_1^2 + \hat{s}_2^2$. The Fisher criterion is defined to +be the ratio of the between-class distance to the within-class variance and is given by: - $$ - J(w) = \frac{(\hat{m}_2 - \hat{m}_1)^2}{s^2} + F(w) = \frac{(\hat{\mu}_2 - \hat{\mu}_1)^2}{\hat{s}^2} $$ -Differentiating $J(w)$ with respect to $w$, it can be found that it is -maximized when: +The dependence on $w$ can be made explicit: +\begin{align*} + (\hat{\mu}_2 - \hat{\mu}_1)^2 &= (w^T \mu_2 - w^T \mu_1)^2 \\ + &= [w^T (\mu_2 - \mu_1)]^2 \\ + &= [w^T (\mu_2 - \mu_1)][w^T (\mu_2 - \mu_1)] \\ + &= [w^T (\mu_2 - \mu_1)][(\mu_2 - \mu_1)^T w] + = w^T M w +\end{align*} +where $M$ is the between-distance matrix. Similarly, as regards the denominator: +\begin{align*} + \hat{s}^2 &= \hat{s}_1^2 + \hat{s}_2^2 = \\ + &= \sum_{n \in c_1} (\hat{x}_n - \hat{\mu}_1)^2 + + \sum_{n \in c_2} (\hat{x}_n - \hat{\mu}_2)^2 + = w^T \Sigma_w w +\end{align*} + +where $\Sigma_w$ is the total within-class covariance matrix: +\begin{align*} + \Sigma_w &= \sum_{n \in c_1} (x_n − \mu_1)(x_n − \mu_1)^T + + \sum_{n \in c_2} (x_n − \mu_2)(x_n − \mu_2)^T \\ + &= \Sigma_1 + \Sigma_2 + = \begin{pmatrix} + \sigma_x^2 & \sigma_{xy} \\ + \sigma_{xy} & \sigma_y^2 + \end{pmatrix}_1 + + \begin{pmatrix} + \sigma_x^2 & \sigma_{xy} \\ + \sigma_{xy} & \sigma_y^2 + \end{pmatrix}_2 +\end{align*} + +Where $\Sigma_1$ and $\Sigma_2$ are the covariance matrix of the two samples. +The Fisher criterion can therefore be rewritten in the form: $$ - w = S_b^{-1} (m_2 - m_1) + F(w) = \frac{w^T M w}{w^T \Sigma_w w} $$ -where $S_b$ is the covariance matrix, given by: - +Differentiating with respect to $w$, it can be found that $F(w)$ is maximized +when: $$ - S_b = S_1 + S_2 + w = \Sigma_w^{-1} (\mu_2 - \mu_1) $$ -where $S_1$ and $S_2$ are the covariance matrix of the two classes, namely: - -$$ -\begin{pmatrix} -\sigma_x^2 & \sigma_{xy} \\ -\sigma_{xy} & \sigma_y^2 -\end{pmatrix} -$$ - -This is not truly a discriminant but rather a specific choice of direction for -projection of the data down to one dimension: the projected data can then be +This is not truly a discriminant but rather a specific choice of the direction +for projection of the data down to one dimension: the projected data can then be used to construct a discriminant by choosing a threshold for the classification. When implemented, the parameters given in @sec:sampling were used to compute -the covariance matrices $S_1$ and $S_2$ of the two classes and their sum $S$. -Then $S$, being a symmetrical and positive-definite matrix, was inverted with -the Cholesky method, already discussed in @sec:MLM. -Lastly, the matrix-vector product was computed with the `gsl_blas_dgemv()` -function provided by GSL. +the covariance matrices and their sum $\Sigma_w$. Then $\Sigma_w$, being a +symmetrical and positive-definite matrix, was inverted with the Cholesky method, +already discussed in @sec:MLM. 
Lastly, the matrix-vector product was computed
+with the `gsl_blas_dgemv()` function provided by GSL.
 
 ### The threshold
 
-The cut was fixed by the condition of conditional probability being the same
-for each class:
-
+The threshold $t_{\text{cut}}$ was fixed by requiring the conditional
+probability $P(c_k | t_{\text{cut}})$ to be the same for both classes $c_k$:
 $$
     t_{\text{cut}} = x \, | \hspace{20pt} \frac{P(c_1 | x)}{P(c_2 | x)} =
-    \frac{p(x | c_1) \, p(c_1)}{p(x | c_1) \, p(c_2)} = 1
+    \frac{P(x | c_1) \, P(c_1)}{P(x | c_2) \, P(c_2)} = 1
 $$
-where $p(x | c_k)$ is the probability for point $x$ along the Fisher projection
-line of belonging to the class $k$. If the classes are bivariate Gaussian, as
-in the present case, then $p(x | c_k)$ is simply given by its projected normal
-distribution $\mathscr{G} (\hat{μ}, \hat{S})$. With a bit of math, the solution
-is then:
-
+where $P(x | c_k)$ is the probability for point $x$ along the Fisher projection
+line of being sampled according to class $c_k$. If each class is a bivariate
+Gaussian, as in the present case, then $P(x | c_k)$ is simply given by its
+projected normal distribution with mean $\hat{\mu}_k = w^T \mu_k$ and variance
+$\hat{s}_k^2 = w^T \Sigma_k w$, where $\Sigma_k$ is the covariance matrix of
+the class. With a bit of math, the following solution can be found:
 $$
-    t = \frac{b}{a} + \sqrt{\left( \frac{b}{a} \right)^2 - \frac{c}{a}}
+    t_{\text{cut}} = \frac{b}{a}
+        + \sqrt{\left( \frac{b}{a} \right)^2 - \frac{c}{a}}
 $$
 
 where:
 
-  - $a = \hat{S}_1^2 - \hat{S}_2^2$
-  - $b = \hat{m}_2 \, \hat{S}_1^2 - \hat{M}_1 \, \hat{S}_2^2$
-  - $c = \hat{M}_2^2 \, \hat{S}_1^2 - \hat{M}_1^2 \, \hat{S}_2^2
-        - 2 \, \hat{S}_1^2 \, \hat{S}_2^2 \, \ln(\alpha)$
-  - $\alpha = p(c_1) / p(c_2)$
-
-The ratio of the prior probability $\alpha$ was computed as:
+  - $a = \hat{s}_1^2 - \hat{s}_2^2$
+  - $b = \hat{\mu}_2 \, \hat{s}_1^2 - \hat{\mu}_1 \, \hat{s}_2^2$
+  - $c = \hat{\mu}_2^2 \, \hat{s}_1^2 - \hat{\mu}_1^2 \, \hat{s}_2^2
+        - 2 \, \hat{s}_1^2 \, \hat{s}_2^2 \, \ln(\alpha)$
+  - $\alpha = P(c_1) / P(c_2)$
+
+The ratio of the prior probabilities $\alpha$ is simply given by:
 $$
     \alpha = \frac{N_s}{N_n}
 $$
 
 The projection of the points was accomplished by the use of the function
-`gsl_blas_ddot()`, which computed a dot product between two vectors, which in
-this case were the weight vector and the position of the point to be projected.
+`gsl_blas_ddot()`, which computes the scalar product of two vectors, in this
+case the weight vector and the position of the point to be projected.
+
+Results obtained for the same samples in @fig:points are shown in
+@fig:fisher_proj. The weight vector and the threshold were found to be:
+$$
+    w = (0.707, 0.707) \et
+    t_{\text{cut}} = 1.323
+$$
 
 <div id="fig:fisher_proj">
-![View from above of the samples.](images/7-fisher-plane.pdf){height=5.7cm} -![Gaussian of the samples on the projection - line.](images/7-fisher-proj.pdf){height=5.7cm} +![View of the samples in the plane.](images/7-fisher-plane.pdf) +![View of the samples projections onto the projection + line.](images/7-fisher-proj.pdf) -Aerial and lateral views of the projection direction, in blue, and the cut, in -red. +Aerial and lateral views of the samples. Projection line in blu and cut in red.
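The quoted numbers can be cross-checked directly against the true distribution parameters. The following is a minimal numpy sketch, not the C program (which works on the sampled estimates and uses the GSL Cholesky routines and `gsl_blas_dgemv()`): it computes the Fisher direction and then the threshold from the $a$, $b$, $c$ coefficients defined above. Small differences with respect to the quoted $t_{\text{cut}}$ are to be expected, both because of sample versus true moments and because of the convention adopted for $\alpha$.

```python
import numpy as np

# true parameters from the sampling section (class 1 = signal, class 2 = noise)
mu1, mu2 = np.array([0., 0.]), np.array([4., 4.])
sigma1 = np.array([[0.09, 0.045],      # σ_x = σ_y = 0.3, ρ = 0.5
                   [0.045, 0.09]])
sigma2 = np.array([[1.0, 0.4],         # σ_x = σ_y = 1,   ρ = 0.4
                   [0.4, 1.0]])

# Fisher direction: w = Σ_w⁻¹ (μ₂ - μ₁), then normalised
sigma_w = sigma1 + sigma2
w = np.linalg.solve(sigma_w, mu2 - mu1)   # plays the role of Cholesky + dgemv
w /= np.linalg.norm(w)                    # -> (0.707, 0.707)

# projected means and variances, then the threshold from a, b, c
mu1_p, mu2_p = w @ mu1, w @ mu2
s1_p, s2_p = w @ sigma1 @ w, w @ sigma2 @ w
alpha = 800 / 1000                        # N_s / N_n
a = s1_p - s2_p
b = mu2_p * s1_p - mu1_p * s2_p
c = mu2_p**2 * s1_p - mu1_p**2 * s2_p - 2 * s1_p * s2_p * np.log(alpha)
t_cut = b / a + np.sqrt((b / a)**2 - c / a)

print('w =', w, ' t_cut =', t_cut)
```

A point $x$ is then assigned to the signal class when its projection $w^T x$ (the quantity `gsl_blas_ddot()` computes in the C code) falls below $t_{\text{cut}}$.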
-Results obtained for the same sample in @fig:points are shown in -@fig:fisher_proj. The weight vector $w$ was found to be: - +Since the vector $w$ turned out to be parallel with the line joining the means +of the two classes (reminded to be $(0, 0)$ and $(4, 4)$), one can be mislead +and assume that the inverse of the total covariance matrix $\Sigma_w$ is +isotropic, namely proportional to the unit matrix. +That's not true. In this special sample, the vector joining the means turns out +to be an eigenvector of the covariance matrix $\Sigma_w^{-1}$. In fact: since +$\sigma_x = \sigma_y$ for both signal and noise: $$ - w = (0.707, 0.707) + \Sigma_1 = \begin{pmatrix} + \sigma_x^2 & \sigma_{xy} \\ + \sigma_{xy} & \sigma_x^2 + \end{pmatrix}_1 + \et + \Sigma_2 = \begin{pmatrix} + \sigma_x^2 & \sigma_{xy} \\ + \sigma_{xy} & \sigma_x^2 + \end{pmatrix}_2 $$ -and $t_{\text{cut}}$ is 1.323 far from the origin of the axes. Hence, as can be -seen, the vector $w$ turned out to be parallel to the line joining the means of -the two classes (reminded to be $(0, 0)$ and $(4, 4)$) which means that the -total covariance matrix $S$ is isotropic, proportional to the unit matrix. +$\Sigma_w$ takes the form: +$$ + \Sigma_w = \begin{pmatrix} + A & B \\ + B & A + \end{pmatrix} +$$ + +Which can be easily inverted by Gaussian elimination: +\begin{align*} + \begin{pmatrix} + A & B & \vline & 1 & 0 \\ + B & A & \vline & 0 & 1 \\ + \end{pmatrix} &\longrightarrow + \begin{pmatrix} + A - B & 0 & \vline & 1 - B & - B \\ + 0 & A - B & \vline & - B & 1 - B \\ + \end{pmatrix} \\ &\longrightarrow + \begin{pmatrix} + 1 & 0 & \vline & (1 - B)/(A - B) & - B/(A - B) \\ + 0 & 1 & \vline & - B/(A - B) & (1 - B)/(A - B) \\ + \end{pmatrix} +\end{align*} + +Hence: +$$ + \Sigma_w^{-1} = \begin{pmatrix} + \tilde{A} & \tilde{B} \\ + \tilde{B} & \tilde{A} + \end{pmatrix} +$$ + +Thus, $\Sigma_w$ and $\Sigma_w^{-1}$ share the same eigenvectors $v_1$ and +$v_2$: +$$ + v_1 = \begin{pmatrix} + 1 \\ + -1 + \end{pmatrix} \et + v_2 = \begin{pmatrix} + 1 \\ + 1 + \end{pmatrix} +$$ + +and the vector joining the means is clearly a multiple of $v_2$, causing $w$ to +be a multiple of it. ## Perceptron In machine learning, the perceptron is an algorithm for supervised learning of -linear binary classifiers. +linear binary classifiers. + Supervised learning is the machine learning task of inferring a function $f$ that maps an input $x$ to an output $f(x)$ based on a set of training -input-output pairs. Each example is a pair consisting of an input object and an -output value. The inferred function can be used for mapping new examples. The -algorithm will be generalized to correctly determine the class labels for unseen -instances. - -The aim is to determine the bias $b$ such that the threshold function $f(x)$: +input-output pairs, where each pair consists of an input object and an output +value. The inferred function can be used for mapping new examples: the algorithm +is generalized to correctly determine the class labels for unseen instances. +The aim of the perceptron algorithm is to determine the weight vector $w$ and +bias $b$ such that the so-called 'threshold function' $f(x)$ returns a binary +value: it is expected to return 1 for signal points and 0 for noise points: $$ - f(x) = x \cdot w + b \hspace{20pt} - \begin{cases} - \geqslant 0 \incase x \in \text{signal} \\ - < 0 \incase x \in \text{noise} - \end{cases} + f(x) = \theta(w^T \cdot x + b) $$ {#eq:perc} -The training was performed as follow. 
Initial values were set as $w = (0,0)$ and
-$b = 0$. From these, the perceptron starts to improve their estimations. The
-sample was passed point by point into a iterative procedure a grand total of
-$N_c$ calls: each time, the projection $w \cdot x$ of the point was computed
-and then the variable $\Delta$ was defined as:
-
+where $\theta$ is the Heaviside theta function.
+The training was performed using the generated sample as training set. From an
+initial guess for $w$ and $b$ (both set to zero in the code), the perceptron
+starts to improve its estimates. The training set is passed point by point into
+an iterative procedure a customizable number $N$ of times: for every point, the
+output of $f(x)$ is computed. Afterwards, the variable $\Delta$, which is
+defined as:
 $$
-    \Delta = r * (e - \theta (f(x))
+    \Delta = r [e - f(x)]
 $$
 
 where:
 
-  - $r$ is the learning rate of the perceptron: it is between 0 and 1. The
-    larger $r$, the more volatile the weight changes. In the code, it was set
-    $r = 0.8$;
-  - $e$ is the expected value, namely 0 if $x$ is noise and 1 if it is signal;
-  - $\theta$ is the Heaviside theta function;
-  - $o$ is the observed value of $f(x)$ defined in @eq:perc.
+  - $r \in [0, 1]$ is the learning rate of the perceptron: the larger $r$, the
+    more volatile the weight changes; in the code it was arbitrarily set to
+    $r = 0.8$;
+  - $e$ is the expected output value, namely 1 if $x$ is signal and 0 if it is
+    noise;
 
-Then $b$ and $w$ must be updated as:
-
+is used to update $b$ and $w$:
 $$
     b \to b + \Delta
     \et
-    w \to w + x \Delta
+    w \to w + \Delta x
 $$
 
-<div id="fig:percep_proj">
-![View from above of the samples.](images/7-percep-plane.pdf){height=5.7cm} -![Gaussian of the samples on the projection - line.](images/7-percep-proj.pdf){height=5.7cm} +To see how it works, consider the four possible situations: -Aerial and lateral views of the projection direction, in blue, and the cut, in -red. -
+ - $e = 1 \quad \wedge \quad f(x) = 1 \quad \dot \vee \quad e = 0 \quad \wedge + \quad f(x) = 0 \quad \Longrightarrow \quad \Delta = 0$ + the current estimations work properly: $b$ and $w$ do not need to be updated; + - $e = 1 \quad \wedge \quad f(x) = 0 \quad \Longrightarrow \quad + \Delta = 1$ + the current $b$ and $w$ underestimate the correct output: they must be + increased; + - $e = 0 \quad \wedge \quad f(x) = 1 \quad \Longrightarrow \quad + \Delta = -1$ + the current $b$ and $w$ overestimate the correct output: they must be + decreased. -It can be shown that this method converges to the coveted function. -As stated in the previous section, the weight vector must finally be normalized. +Whilst the $b$ updating is obvious, as regarsd $w$ the following consideration +may help clarify. Consider the case with $e = 0 \quad \wedge \quad f(x) = 1 +\quad \Longrightarrow \quad \Delta = -1$: +$$ + w^T \cdot x \to (w^T + \Delta x^T) \cdot x + = w^T \cdot x + \Delta |x|^2 + = w^T \cdot x - |x|^2 \leq w^T \cdot x +$$ -With $N_c = 5$, the values of $w$ and $t_{\text{cut}}$ level off up to the third +Similarly for the case with $e = 1$ and $f(x) = 0$. + +As far as convergence is concerned, the perceptron will never get to the state +with all the input points classified correctly if the training set is not +linearly separable, meaning that the signal cannot be separated from the noise +by a line in the plane. In this case, no approximate solutions will be gradually +approached. On the other hand, if the training set is linearly separable, it can +be shown that this method converges to the coveted function [@novikoff63]. +As in the previous section, once found, the weight vector is to be normalized. + +With $N = 5$ iterations, the values of $w$ and $t_{\text{cut}}$ level off up to the third digit. The following results were obtained: $$ @@ -287,7 +385,16 @@ where, once again, $t_{\text{cut}}$ is computed from the origin of the axes. In this case, the projection line does not lies along the mains of the two samples. Plots in @fig:percep_proj. -## Efficiency test +
+![View from above of the samples.](images/7-percep-plane.pdf){height=5.7cm} +![Gaussian of the samples on the projection + line.](images/7-percep-proj.pdf){height=5.7cm} + +Aerial and lateral views of the projection direction, in blue, and the cut, in +red. +
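The update rule is simple enough that the whole training can be sketched in a few lines. The following is a rough numpy mock-up, not the C implementation: it draws the two samples with the parameters of @sec:sampling, labels signal with $e = 1$ and noise with $e = 0$ as in the text, and runs $N = 5$ passes. The overall sign of the resulting $(w, b)$ pair depends on which class is labelled 1, so it may differ from the numbers quoted above; printing the pair after every pass shows the same kind of levelling-off that is recorded in `ex-7/iters/iters.txt` (whose companion script `ex-7/iters/iter.py`, as committed, plots an undefined variable `w` and calls `np.loadtxt()` without `unpack=True`).

```python
import numpy as np

rng = np.random.default_rng(3)

def sample(mu, sigma, rho, n):
    """bivariate Gaussian with equal standard deviations and correlation rho"""
    cov = [[sigma**2, rho * sigma**2],
           [rho * sigma**2, sigma**2]]
    return rng.multivariate_normal(mu, cov, size=n)

signal = sample([0, 0], 0.3, 0.5, 800)     # expected output e = 1
noise  = sample([4, 4], 1.0, 0.4, 1000)    # expected output e = 0

x = np.vstack([signal, noise])
e = np.concatenate([np.ones(len(signal)), np.zeros(len(noise))])

w, b, r = np.zeros(2), 0.0, 0.8            # null initial guess, learning rate

for n in range(5):                         # N passes over the training set
    for xi, ei in zip(x, e):
        f = float(w @ xi + b >= 0)         # f(x) = θ(wᵀx + b)
        delta = r * (ei - f)
        b += delta                         # b -> b + Δ
        w += delta * xi                    # w -> w + Δ x
    print(n + 1, w / np.linalg.norm(w), b) # one line per pass, as in iters.txt
```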
+ +## Efficiency test {#sec:7_results} A program was implemented to check the validity of the two classification methods. diff --git a/notes/todo b/notes/todo index e1c458c..3e0e592 100644 --- a/notes/todo +++ b/notes/todo @@ -1,3 +1,6 @@ - riscrivere il readme di: 1, 2, 6 - rileggere il 7 - completare il 2 +- riscrivere il 2 + +On the lambert W function, formula 4.19 Corless