ex-7: revised and typo-fixed

In addition, the folder ex-7/iters was created in order to plot the results
of the Perceptron method as a function of the iterations parameter.
This commit is contained in:
Giù Marcer 2020-05-24 12:01:36 +02:00 committed by rnhmjoj
parent 90ab77b676
commit 7ad45a829e
11 changed files with 332 additions and 162 deletions

View File

@@ -39,9 +39,9 @@ gsl_vector* normal_mean(struct par *p) {
  * between the two classes.
  * The projection vector w is given by
  *
- *     w = Sw⁻¹ (μ₂ - μ₁)
+ *     w = Σ_w⁻¹ (μ₂ - μ₁)
  *
- * where Sw = Σ₁ + Σ₂ is the so-called within-class
+ * where Σ_w = Σ₁ + Σ₂ is the so-called within-class
  * covariance matrix.
  */
 gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
@@ -54,10 +54,11 @@ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
   gsl_vector *mu2 = normal_mean(&c2->p);
   /* Compute the inverse of the within-class
-   * covariance Sw⁻¹.
+   * covariance Σ_w⁻¹.
    * Note: by definition Σ is symmetrical and
    * positive-definite, so Cholesky is appropriate.
    */
   gsl_matrix_add(cov1, cov2);
   gsl_linalg_cholesky_decomp(cov1);
   gsl_linalg_cholesky_invert(cov1);
@@ -67,7 +68,7 @@ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
   gsl_vector_memcpy(diff, mu2);
   gsl_vector_sub(diff, mu1);
-  /* Finally multiply diff by Sw.
+  /* Finally multiply diff by Σ_w.
    * This uses the rather low-level CBLAS
    * functions gsl_blas_dgemv:
    *

ex-7/iters/iter.py (new file)
View File

@@ -0,0 +1,25 @@
#!/usr/bin/env python

import numpy as np
import matplotlib.pyplot as plt

# perceptron parameters as a function of the iterations number
iter, w_x, w_y, b = np.loadtxt('ex-7/iters/iters.txt', unpack=True)

plt.figure(figsize=(5, 4))
plt.rcParams['font.size'] = 8

plt.subplot(211)
plt.title('weight vector', loc='right')
plt.plot(iter, w_x, color='#92182b', label='$w_x$')
plt.plot(iter, w_y, color='gray', label='$w_y$')
plt.ylabel('w')
plt.xlabel('N')
plt.legend()

plt.subplot(212)
plt.title('bias', loc='right')
plt.plot(iter, b, color='gray')
plt.ylabel('b')
plt.xlabel('N')

plt.tight_layout()
plt.show()

ex-7/iters/iters.txt (new file)
View File

@@ -0,0 +1,11 @@
#iters w_x w_y b
1 0.427 0.904 1.749
2 0.566 0.824 1.445
3 0.654 0.756 1.213
4 0.654 0.756 1.213
5 0.654 0.756 1.213
6 0.654 0.756 1.213
7 0.654 0.756 1.213
8 0.654 0.756 1.213
9 0.654 0.756 1.213
10 0.654 0.756 1.213

View File

@@ -95,7 +95,7 @@ int main(int argc, char **argv) {
    * dataset to get an approximate
    * solution in `iter` iterations.
    */
-  fputs("# Perceptron \n\n", stderr);
+  // fputs("# Perceptron \n\n", stderr);
   w = percep_train(signal, noise, opts.iter, &cut);
 }
 else {
@@ -107,20 +107,21 @@ int main(int argc, char **argv) {
   /* Print the results of the method
    * selected: weights and threshold.
    */
+  fprintf(stderr, "\n* i: %d\n", opts.iter);
   fprintf(stderr, "* w: [%.3f, %.3f]\n",
           gsl_vector_get(w, 0),
           gsl_vector_get(w, 1));
   fprintf(stderr, "* cut: %.3f\n", cut);
-  gsl_vector_fprintf(stdout, w, "%g");
-  printf("%f\n", cut);
+  // gsl_vector_fprintf(stdout, w, "%g");
+  // printf("%f\n", cut);

   /* Print data to stdout for plotting.
    * Note: we print the sizes to be able
    * to set apart the two matrices.
    */
-  printf("%ld %ld %d\n", opts.nsig, opts.nnoise, 2);
-  gsl_matrix_fprintf(stdout, signal->data, "%g");
-  gsl_matrix_fprintf(stdout, noise->data, "%g");
+  // printf("%ld %ld %d\n", opts.nsig, opts.nnoise, 2);
+  // gsl_matrix_fprintf(stdout, signal->data, "%g");
+  // gsl_matrix_fprintf(stdout, noise->data, "%g");

   // free memory
   gsl_rng_free(r);

View File

@@ -7,7 +7,6 @@ def line(x, y, **args):
     '''line between two points x,y'''
     plot([x[0], y[0]], [x[1], y[1]], **args)

-rcParams['font.size'] = 12
 w = loadtxt(sys.stdin, max_rows=2)
 v = array([[0, -1], [1, 0]]) @ w
 cut = float(input())
@@ -16,7 +15,11 @@ n, m, d = map(int, input().split())
 data = loadtxt(sys.stdin).reshape(n + m, d)
 signal, noise = data[:n].T, data[n:].T

-figure()
+plt.figure(figsize=(3, 3))
+rcParams['font.size'] = 8
+figure(figsize=(3, 3))
+rcParams['font.size'] = 8
 subplot(aspect='equal')
 scatter(*signal, edgecolor='xkcd:charcoal',
         c='xkcd:dark yellow', label='signal')
@@ -24,18 +27,22 @@ scatter(*noise, edgecolor='xkcd:charcoal',
         c='xkcd:pale purple', label='noise')
 line(-20*w, 20*w, c='xkcd:midnight blue', label='projection')
 line(w-10*v, w+10*v, c='xkcd:scarlet', label='cut')
+xlabel('x')
+ylabel('y')
 xlim(-1.5, 8)
 ylim(-1.5, 8)
 legend()
 tight_layout()
-savefig('notes/images/7-fisher-plane.pdf')

-figure()
+plt.figure(figsize=(3, 3))
+rcParams['font.size'] = 8
 sig_proj = np.dot(w, signal)
 noise_proj = np.dot(w, noise)
 hist(sig_proj, color='xkcd:dark yellow', label='signal')
 hist(noise_proj, color='xkcd:pale purple', label='noise')
-axvline(cut, c='xkcd:scarlet')
+axvline(cut, c='xkcd:scarlet', label='cut')
+xlabel('projection line')
 legend()
 tight_layout()
-savefig('notes/images/7-fisher-proj.pdf')
+show()

View File

@@ -140,17 +140,32 @@
 @book{hecht02,
   title={Optics},
+  author={Eugene Hecht},
   year={2002},
-  publisher={Pearson},
-  author={Eugene Hecht}
+  publisher={Pearson}
 }

 @article{lucy74,
   title={An iterative technique for the rectification of observed
          distributions},
-  author={Lucy, Leon B},
+  author={Lucy, Leon B.},
   journal={The astronomical journal},
   volume={79},
   pages={745},
   year={1974}
 }
+
+@book{bishop06,
+  title={Pattern Recognition and Machine Learning},
+  author={Bishop, Christopher M.},
+  year={2006},
+  pages={186--189},
+  publisher={Springer}
+}
+
+@techreport{novikoff63,
+  title={On convergence proofs for perceptrons},
+  author={Novikoff, Albert B.},
+  year={1963},
+  institution={Stanford Research Institute, Menlo Park, CA}
+}

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -2,9 +2,8 @@
 ## Generating points according to Gaussian distributions {#sec:sampling}

-The first task of exercise 7 is to generate two sets of 2D points $(x, y)$
-according to two bivariate Gaussian distributions with parameters:
+Two sets of 2D points $(x, y)$ - signal and noise - are to be generated
+according to two bivariate Gaussian distributions with parameters:
 $$
 \text{signal} \quad
 \begin{cases}
@@ -21,262 +20,361 @@ $$
 \end{cases}
 $$
-where $\mu$ stands for the mean, $\sigma_x$ and $\sigma_y$ are the standard
-deviations in $x$ and $y$ directions respectively and $\rho$ is the bivariate
-correlation, hence:
+where $\mu$ stands for the mean, $\sigma_x$ and $\sigma_y$ for the standard
+deviations in the $x$ and $y$ directions respectively and $\rho$ is the
+bivariate correlation, namely:
 $$
 \sigma_{xy} = \rho \sigma_x \sigma_y
 $$
 where $\sigma_{xy}$ is the covariance of $x$ and $y$.
-In the code, default settings are $N_s = 800$ points for the signal and $N_n =
-1000$ points for the noise but can be changed from the command-line. Both
-samples were handled as matrices of dimension $n$ x 2, where $n$ is the number
-of points in the sample. The library `gsl_matrix` provided by GSL was employed
-for this purpose and the function `gsl_ran_bivariate_gaussian()` was used for
-generating the points.
+In the code, the default settings are $N_s = 800$ points for the signal and
+$N_n = 1000$ points for the noise, but both can be customized from the command
+line. Each sample was handled as a matrix of dimension $n \times 2$, where $n$
+is the number of points in the sample. The library `gsl_matrix` provided by
+GSL was employed for this purpose and the function
+`gsl_ran_bivariate_gaussian()` was used for generating the points.
 An example of the two samples is shown in @fig:points.

-![Example of points sorted according to two Gaussian with
-the given parameters. Noise points in pink and signal points
-in yellow.](images/7-points.pdf){#fig:points}
+![Example of points sampled according to the two Gaussian distributions
+with the given parameters.](images/7-points.pdf){#fig:points}
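For reference, the sampling step boils down to a handful of GSL calls. The sketch below is only illustrative: the function name and the way the parameters are passed are made up (the exercise reads them from the command line). It fills an $n \times 2$ matrix with correlated Gaussian points, shifting them by the mean since `gsl_ran_bivariate_gaussian()` generates values centred in the origin.

```c
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_matrix.h>

/* Illustrative helper (not the exercise's API): draw n points from a
 * bivariate Gaussian with means (mux, muy), standard deviations
 * (sx, sy) and correlation rho, one point per matrix row. */
gsl_matrix *sample_points(gsl_rng *r, size_t n,
                          double mux, double muy,
                          double sx, double sy, double rho) {
  gsl_matrix *data = gsl_matrix_alloc(n, 2);
  for (size_t i = 0; i < n; i++) {
    double dx, dy;
    gsl_ran_bivariate_gaussian(r, sx, sy, rho, &dx, &dy);
    gsl_matrix_set(data, i, 0, mux + dx);   /* the GSL sampler is centred */
    gsl_matrix_set(data, i, 1, muy + dy);   /* in (0, 0): add the mean    */
  }
  return data;
}
```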

-Assuming not to know how the points were generated, a model of classification
-must then be implemented in order to assign each point to the right class
-(signal or noise) to which it 'most probably' belongs to. The point is how
-'most probably' can be interpreted and implemented.
+Assuming not to know how the points were generated, a classification model is
+then to be implemented in order to assign each point to the class (signal or
+noise) to which it 'most probably' belongs. The point is how 'most probably'
+can be interpreted and implemented.
+
+Here, the Fisher linear discriminant and the perceptron were implemented and
+are described in the following two sections. The results are compared in
+@sec:7_results.

 ## Fisher linear discriminant

 ### The projection direction

 The Fisher linear discriminant (FLD) is a linear classification model based on
 dimensionality reduction. It allows one to reduce this 2D classification
 problem to a one-dimensional decision surface.

-Consider the case of two classes (in this case the signal and the noise): the
-simplest representation of a linear discriminant is obtained by taking a linear
-function of a sampled 2D point $x$ so that:
+Consider the case of two classes (in this case signal and noise): the simplest
+representation of a linear discriminant is obtained by taking a linear function
+$\hat{x}$ of a sampled 2D point $x$ so that:
 $$
 \hat{x} = w^T x
 $$
-where $w$ is the so-called 'weight vector'. An input point $x$ is commonly
-assigned to the first class if $\hat{x} \geqslant w_{th}$ and to the second one
-otherwise, where $w_{th}$ is a threshold value somehow defined.
-In general, the projection onto one dimension leads to a considerable loss of
-information and classes that are well separated in the original 2D space may
-become strongly overlapping in one dimension. However, by adjusting the
-components of the weight vector, a projection that maximizes the classes
-separation can be selected.
+where $w$ is the so-called 'weight vector' and $w^T$ stands for its transpose.
+An input point $x$ is commonly assigned to the first class if
+$\hat{x} \geqslant w_{th}$ and to the second one otherwise, where $w_{th}$ is
+a threshold value defined in some way. In general, the projection onto one
+dimension leads to a considerable loss of information and classes that are
+well separated in the original 2D space may become strongly overlapping in one
+dimension. However, by adjusting the components of the weight vector, a
+projection that maximizes the class separation can be selected [@bishop06].

-To begin with, consider $N_1$ points of class $C_1$ and $N_2$ points of class
-$C_2$, so that the means $m_1$ and $m_2$ of the two classes are given by:
+To begin with, consider $N_1$ points of class $C_1$ and $N_2$ points of class
+$C_2$, so that the means $\mu_1$ and $\mu_2$ of the two classes are given by:
 $$
-m_1 = \frac{1}{N_1} \sum_{n \in C_1} x_n
+\mu_1 = \frac{1}{N_1} \sum_{n \in C_1} x_n
 \et
-m_2 = \frac{1}{N_2} \sum_{n \in C_2} x_n
+\mu_2 = \frac{1}{N_2} \sum_{n \in C_2} x_n
 $$
 The simplest measure of the separation of the classes is the separation of the
 projected class means. This suggests choosing $w$ so as to maximize:
 $$
-\hat{m}_2 - \hat{m}_1 = w^T (m_2 - m_1)
+\hat{\mu}_2 - \hat{\mu}_1 = w^T (\mu_2 - \mu_1)
 $$
 This expression can be made arbitrarily large simply by increasing the magnitude
 of $w$. To solve this problem, $w$ can be constrained to have unit length, so
 that $|w|^2 = 1$. Using a Lagrange multiplier to perform the constrained
-maximization, it can be found that $w \propto (m_2 - m_1)$.
+maximization, it can be found that $w \propto (\mu_2 - \mu_1)$, meaning that
+the line onto which the points must be projected is the one joining the class
+means.

-![The plot on the left shows samples from two classes along with the histograms
-resulting from projection onto the line joining the class means: note that
-there is considerable overlap in the projected space. The right plot shows the
-corresponding projection based on the Fisher linear discriminant, showing the
-greatly improved classes separation.](images/7-fisher.png){#fig:overlap}
-
-There is still a problem with this approach, however, as illustrated in
-@fig:overlap: the two classes are well separated in the original 2D space but
-have considerable overlap when projected onto the line joining their means.
+There is still a problem with this approach, however, as illustrated in
+@fig:overlap: the two classes are well separated in the original 2D space but
+have considerable overlap when projected onto the line joining their means,
+which maximizes the distance between their projections.
+
+![The plot on the left shows samples from two classes along with the
+histograms resulting from the projection onto the line joining the
+class means: note that there is considerable overlap in the projected
+space. The right plot shows the corresponding projection based on the
+Fisher linear discriminant, showing the greatly improved class
+separation. Figure from [@bishop06].](images/7-fisher.png){#fig:overlap}

 The idea to solve it is to maximize a function that will give a large separation
 between the projected class means while also giving a small variance within
 each class, thereby minimizing the class overlap.
-The within-classes variance of the transformed data of each class $k$ is given
-by:
+The within-class variance of the transformed data of each class $k$ is given
+by:
 $$
-s_k^2 = \sum_{n \in C_k} (\hat{x}_n - \hat{m}_k)^2
+\hat{s}_k^2 = \sum_{n \in c_k} (\hat{x}_n - \hat{\mu}_k)^2
 $$
-The total within-classes variance for the whole data set can be simply defined
-as $s^2 = s_1^2 + s_2^2$. The Fisher criterion is therefore defined to be the
-ratio of the between-classes distance to the within-classes variance and is
-given by:
+The total within-class variance for the whole data set is simply defined as
+$\hat{s}^2 = \hat{s}_1^2 + \hat{s}_2^2$. The Fisher criterion is defined to be
+the ratio of the between-class distance to the within-class variance and is
+given by:
 $$
-J(w) = \frac{(\hat{m}_2 - \hat{m}_1)^2}{s^2}
+F(w) = \frac{(\hat{\mu}_2 - \hat{\mu}_1)^2}{\hat{s}^2}
 $$
-Differentiating $J(w)$ with respect to $w$, it can be found that it is
-maximized when:
-$$
-  w = S_b^{-1} (m_2 - m_1)
-$$
-where $S_b$ is the covariance matrix, given by:
-$$
-  S_b = S_1 + S_2
-$$
-where $S_1$ and $S_2$ are the covariance matrix of the two classes, namely:
-$$
-  \begin{pmatrix}
-    \sigma_x^2 & \sigma_{xy} \\
-    \sigma_{xy} & \sigma_y^2
-  \end{pmatrix}
-$$
-This is not truly a discriminant but rather a specific choice of direction for
-projection of the data down to one dimension: the projected data can then be
+The dependence on $w$ can be made explicit:
+\begin{align*}
+  (\hat{\mu}_2 - \hat{\mu}_1)^2 &= (w^T \mu_2 - w^T \mu_1)^2 \\
+  &= [w^T (\mu_2 - \mu_1)]^2 \\
+  &= [w^T (\mu_2 - \mu_1)][w^T (\mu_2 - \mu_1)] \\
+  &= [w^T (\mu_2 - \mu_1)][(\mu_2 - \mu_1)^T w]
+   = w^T M w
+\end{align*}
+where $M$ is the between-class matrix. Similarly, as regards the denominator:
+\begin{align*}
+  \hat{s}^2 &= \hat{s}_1^2 + \hat{s}_2^2 \\
+  &= \sum_{n \in c_1} (\hat{x}_n - \hat{\mu}_1)^2
+   + \sum_{n \in c_2} (\hat{x}_n - \hat{\mu}_2)^2
+   = w^T \Sigma_w w
+\end{align*}
+where $\Sigma_w$ is the total within-class covariance matrix:
+\begin{align*}
+  \Sigma_w &= \sum_{n \in c_1} (x_n - \mu_1)(x_n - \mu_1)^T
+            + \sum_{n \in c_2} (x_n - \mu_2)(x_n - \mu_2)^T \\
+  &= \Sigma_1 + \Sigma_2
+   = \begin{pmatrix}
+       \sigma_x^2 & \sigma_{xy} \\
+       \sigma_{xy} & \sigma_y^2
+     \end{pmatrix}_1 +
+     \begin{pmatrix}
+       \sigma_x^2 & \sigma_{xy} \\
+       \sigma_{xy} & \sigma_y^2
+     \end{pmatrix}_2
+\end{align*}
+where $\Sigma_1$ and $\Sigma_2$ are the covariance matrices of the two samples.
+The Fisher criterion can therefore be rewritten in the form:
+$$
+  F(w) = \frac{w^T M w}{w^T \Sigma_w w}
+$$
+Differentiating with respect to $w$, it can be found that $F(w)$ is maximized
+when:
+$$
+  w = \Sigma_w^{-1} (\mu_2 - \mu_1)
+$$
+This is not truly a discriminant but rather a specific choice of the direction
+for projection of the data down to one dimension: the projected data can then be
 used to construct a discriminant by choosing a threshold for the
 classification.

-When implemented, the parameters given in @sec:sampling were used to compute
-the covariance matrices $S_1$ and $S_2$ of the two classes and their sum $S$.
-Then $S$, being a symmetrical and positive-definite matrix, was inverted with
-the Cholesky method, already discussed in @sec:MLM.
-Lastly, the matrix-vector product was computed with the `gsl_blas_dgemv()`
-function provided by GSL.
+When implemented, the parameters given in @sec:sampling were used to compute
+the covariance matrices and their sum $\Sigma_w$. Then $\Sigma_w$, being a
+symmetrical and positive-definite matrix, was inverted with the Cholesky
+method, already discussed in @sec:MLM. Lastly, the matrix-vector product was
+computed with the `gsl_blas_dgemv()` function provided by GSL.
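Condensed into a single function, these steps look roughly as follows. This is a sketch with illustrative names, assuming the means and covariance matrices have already been computed (the actual `fisher_proj()` shown in the diff above derives them from the distribution parameters):

```c
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_blas.h>
#include <gsl/gsl_linalg.h>

/* w = Σ_w⁻¹ (μ₂ - μ₁) for 2D samples; cov1 is overwritten. */
gsl_vector *fisher_direction(gsl_matrix *cov1, const gsl_matrix *cov2,
                             const gsl_vector *mu1, const gsl_vector *mu2) {
  /* Σ_w = Σ₁ + Σ₂, accumulated in cov1 */
  gsl_matrix_add(cov1, cov2);

  /* Σ_w is symmetric and positive-definite,
   * so it can be inverted by Cholesky decomposition */
  gsl_linalg_cholesky_decomp(cov1);
  gsl_linalg_cholesky_invert(cov1);

  /* diff = μ₂ - μ₁ */
  gsl_vector *diff = gsl_vector_alloc(2);
  gsl_vector_memcpy(diff, mu2);
  gsl_vector_sub(diff, mu1);

  /* w = Σ_w⁻¹ diff, computed with the BLAS matrix-vector product */
  gsl_vector *w = gsl_vector_alloc(2);
  gsl_blas_dgemv(CblasNoTrans, 1., cov1, diff, 0., w);

  gsl_vector_free(diff);
  return w;   /* normalisation, if needed, is left to the caller */
}
```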

 ### The threshold

-The cut was fixed by the condition of conditional probability being the same
-for each class:
+The threshold $t_{\text{cut}}$ was fixed by the condition of the conditional
+probability $P(c_k | t_{\text{cut}})$ being the same for both classes $c_k$:
 $$
 t_{\text{cut}} = x \, | \hspace{20pt}
 \frac{P(c_1 | x)}{P(c_2 | x)} =
-\frac{p(x | c_1) \, p(c_1)}{p(x | c_1) \, p(c_2)} = 1
+\frac{P(x | c_1) \, P(c_1)}{P(x | c_2) \, P(c_2)} = 1
 $$
-where $p(x | c_k)$ is the probability for point $x$ along the Fisher projection
-line of belonging to the class $k$. If the classes are bivariate Gaussian, as
-in the present case, then $p(x | c_k)$ is simply given by its projected normal
-distribution $\mathscr{G} (\hat{μ}, \hat{S})$. With a bit of math, the solution
-is then:
+where $P(x | c_k)$ is the probability for point $x$ along the Fisher
+projection line of being sampled according to the class $c_k$. If each class
+is a bivariate Gaussian, as in the present case, then $P(x | c_k)$ is simply
+given by its projected normal distribution with mean $\hat{\mu}_k = w^T \mu_k$
+and variance $\hat{s}_k^2 = w^T \Sigma_k w$, where $\Sigma_k$ is the
+covariance matrix of the class. With a bit of math, the following solution can
+be found:
 $$
-t = \frac{b}{a} + \sqrt{\left( \frac{b}{a} \right)^2 - \frac{c}{a}}
+t_{\text{cut}} = \frac{b}{a}
+  + \sqrt{\left( \frac{b}{a} \right)^2 - \frac{c}{a}}
 $$
 where:

-- $a = \hat{S}_1^2 - \hat{S}_2^2$
-- $b = \hat{m}_2 \, \hat{S}_1^2 - \hat{M}_1 \, \hat{S}_2^2$
-- $c = \hat{M}_2^2 \, \hat{S}_1^2 - \hat{M}_1^2 \, \hat{S}_2^2
-  - 2 \, \hat{S}_1^2 \, \hat{S}_2^2 \, \ln(\alpha)$
-- $\alpha = p(c_1) / p(c_2)$
+- $a = \hat{s}_1^2 - \hat{s}_2^2$
+- $b = \hat{\mu}_2 \, \hat{s}_1^2 - \hat{\mu}_1 \, \hat{s}_2^2$
+- $c = \hat{\mu}_2^2 \, \hat{s}_1^2 - \hat{\mu}_1^2 \, \hat{s}_2^2
+  - 2 \, \hat{s}_1^2 \, \hat{s}_2^2 \, \ln(\alpha)$
+- $\alpha = P(c_1) / P(c_2)$

-The ratio of the prior probability $\alpha$ was computed as:
+The ratio of the prior probabilities $\alpha$ is simply given by:
 $$
 \alpha = \frac{N_s}{N_n}
 $$

-The projection of the points was accomplished by the use of the function
-`gsl_blas_ddot()`, which computed a dot product between two vectors, which in
-this case were the weight vector and the position of the point to be projected.
+The projection of the points was accomplished with the function
+`gsl_blas_ddot()`, which computes the scalar product of two vectors, in this
+case the weight vector and the position of the point to be projected.
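With the projected means $\hat{\mu}_k$ and variances $\hat{s}_k^2$ at hand, the threshold is a one-liner. The helper below is a hypothetical transcription of the formula above, not the exercise's API, and it assumes $\hat{s}_1^2 \neq \hat{s}_2^2$:

```c
#include <math.h>

/* Illustrative helper: threshold along the projection line, given the
 * projected means and variances of the two classes and the ratio of
 * the priors alpha = N_s/N_n. Assumes var1 != var2. */
double fisher_cut(double mu1, double var1,   /* class 1: mean, variance */
                  double mu2, double var2,   /* class 2: mean, variance */
                  double alpha) {
  double a = var1 - var2;
  double b = mu2 * var1 - mu1 * var2;
  double c = mu2 * mu2 * var1 - mu1 * mu1 * var2
           - 2 * var1 * var2 * log(alpha);
  double q = b / a;
  return q + sqrt(q * q - c / a);   /* t_cut = b/a + sqrt((b/a)² - c/a) */
}
```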

-Results obtained for the same samples in @fig:points are shown in
-@fig:fisher_proj. The weight vector and the treshold were found to be:
-$$
-w = (0.707, 0.707) \et
-t_{\text{cut}} = 1.323
-$$
-
 <div id="fig:fisher_proj">
-![View from above of the samples.](images/7-fisher-plane.pdf){height=5.7cm}
-![Gaussian of the samples on the projection
-line.](images/7-fisher-proj.pdf){height=5.7cm}
+![View of the samples in the plane.](images/7-fisher-plane.pdf)
+![View of the projections of the samples onto the projection
+line.](images/7-fisher-proj.pdf)

-Aerial and lateral views of the projection direction, in blue, and the cut, in
-red.
+Aerial and lateral views of the samples. Projection line in blue and cut in
+red.
 </div>

-Results obtained for the same sample in @fig:points are shown in
-@fig:fisher_proj. The weight vector $w$ was found to be:
-$$
-w = (0.707, 0.707)
-$$
-and $t_{\text{cut}}$ is 1.323 far from the origin of the axes. Hence, as can be
-seen, the vector $w$ turned out to be parallel to the line joining the means of
-the two classes (reminded to be $(0, 0)$ and $(4, 4)$) which means that the
-total covariance matrix $S$ is isotropic, proportional to the unit matrix.
+Since the vector $w$ turned out to be parallel to the line joining the means
+of the two classes (namely $(0, 0)$ and $(4, 4)$), one can be misled into
+assuming that the inverse of the total covariance matrix $\Sigma_w$ is
+isotropic, namely proportional to the unit matrix. That is not the case: in
+this special sample, the vector joining the means simply turns out to be an
+eigenvector of the matrix $\Sigma_w^{-1}$. In fact, since
+$\sigma_x = \sigma_y$ for both signal and noise:
+$$
+\Sigma_1 = \begin{pmatrix}
+  \sigma_x^2 & \sigma_{xy} \\
+  \sigma_{xy} & \sigma_x^2
+\end{pmatrix}_1
+\et
+\Sigma_2 = \begin{pmatrix}
+  \sigma_x^2 & \sigma_{xy} \\
+  \sigma_{xy} & \sigma_x^2
+\end{pmatrix}_2
+$$
+$\Sigma_w$ takes the form:
+$$
+\Sigma_w = \begin{pmatrix}
+  A & B \\
+  B & A
+\end{pmatrix}
+$$
+which can be easily inverted: for a symmetric $2 \times 2$ matrix of this
+form, the inverse is
+$$
+\Sigma_w^{-1} = \frac{1}{A^2 - B^2}
+\begin{pmatrix}
+  A & -B \\
+  -B & A
+\end{pmatrix}
+= \begin{pmatrix}
+  \tilde{A} & \tilde{B} \\
+  \tilde{B} & \tilde{A}
+\end{pmatrix}
+$$
+Thus, $\Sigma_w$ and $\Sigma_w^{-1}$ share the same eigenvectors $v_1$ and
+$v_2$:
+$$
+v_1 = \begin{pmatrix}
+  1 \\
+  -1
+\end{pmatrix} \et
+v_2 = \begin{pmatrix}
+  1 \\
+  1
+\end{pmatrix}
+$$
+and the vector joining the means is clearly a multiple of $v_2$, causing $w$ to
+be a multiple of it.
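The claim can be checked numerically with GSL's symmetric eigensolver. The snippet below is a standalone illustration, not part of the exercise code, and the values of $A$ and $B$ are arbitrary:

```c
#include <stdio.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_eigen.h>

int main(void) {
  double A = 1.09, B = 0.72;   /* arbitrary example values */
  gsl_matrix *sw = gsl_matrix_alloc(2, 2);
  gsl_matrix_set(sw, 0, 0, A);  gsl_matrix_set(sw, 0, 1, B);
  gsl_matrix_set(sw, 1, 0, B);  gsl_matrix_set(sw, 1, 1, A);

  gsl_vector *eval = gsl_vector_alloc(2);
  gsl_matrix *evec = gsl_matrix_alloc(2, 2);
  gsl_eigen_symmv_workspace *ws = gsl_eigen_symmv_alloc(2);
  gsl_eigen_symmv(sw, eval, evec, ws);   /* note: sw is overwritten */

  /* the eigenvectors (columns of evec) are multiples
   * of (1, 1) and (1, -1) whatever A and B are */
  for (int i = 0; i < 2; i++)
    printf("eigenvalue %.3f, eigenvector (%.3f, %.3f)\n",
           gsl_vector_get(eval, i),
           gsl_matrix_get(evec, 0, i),
           gsl_matrix_get(evec, 1, i));

  gsl_eigen_symmv_free(ws);
  gsl_matrix_free(evec);
  gsl_vector_free(eval);
  gsl_matrix_free(sw);
  return 0;
}
```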

 ## Perceptron

 In machine learning, the perceptron is an algorithm for supervised learning of
 linear binary classifiers.
-Supervised learning is the machine learning task of inferring a function $f$
-that maps an input $x$ to an output $f(x)$ based on a set of training
-input-output pairs. Each example is a pair consisting of an input object and an
-output value. The inferred function can be used for mapping new examples. The
-algorithm will be generalized to correctly determine the class labels for unseen
-instances.
+Supervised learning is the machine learning task of inferring a function $f$
+that maps an input $x$ to an output $f(x)$ based on a set of training
+input-output pairs, where each pair consists of an input object and an output
+value. The inferred function can be used for mapping new examples: the
+algorithm is expected to correctly determine the class labels for unseen
+instances.

-The aim is to determine the bias $b$ such that the threshold function $f(x)$:
+The aim of the perceptron algorithm is to determine the weight vector $w$ and
+the bias $b$ such that the so-called 'threshold function' $f(x)$ returns a
+binary value: it is expected to return 1 for signal points and 0 for noise
+points:
 $$
-f(x) = x \cdot w + b \hspace{20pt}
-\begin{cases}
-  \geqslant 0 \incase x \in \text{signal} \\
-  < 0 \incase x \in \text{noise}
-\end{cases}
+f(x) = \theta(w^T \cdot x + b)
 $$ {#eq:perc}
+where $\theta$ is the Heaviside theta function.

-The training was performed as follow. Initial values were set as $w = (0,0)$ and
-$b = 0$. From these, the perceptron starts to improve their estimations. The
-sample was passed point by point into a iterative procedure a grand total of
-$N_c$ calls: each time, the projection $w \cdot x$ of the point was computed
-and then the variable $\Delta$ was defined as:
+The training was performed using the generated sample as the training set.
+From an initial guess for $w$ and $b$ (both set to zero in the code), the
+perceptron starts to improve its estimates. The training set is passed point
+by point into an iterative procedure a customizable number $N$ of times: for
+every point, the output of $f(x)$ is computed. Afterwards, the variable
+$\Delta$, defined as:
 $$
-\Delta = r * (e - \theta (f(x))
+\Delta = r [e - f(x)]
 $$
 where:

-- $r$ is the learning rate of the perceptron: it is between 0 and 1. The
-  larger $r$, the more volatile the weight changes. In the code, it was set
-  $r = 0.8$;
-- $e$ is the expected value, namely 0 if $x$ is noise and 1 if it is signal;
-- $\theta$ is the Heaviside theta function;
-- $o$ is the observed value of $f(x)$ defined in @eq:perc.
+- $r \in [0, 1]$ is the learning rate of the perceptron: the larger $r$, the
+  more volatile the weight changes. In the code it was arbitrarily set to
+  $r = 0.8$;
+- $e$ is the expected output value, namely 1 if $x$ is signal and 0 if it is
+  noise;

-Then $b$ and $w$ must be updated as:
+is used to update $b$ and $w$:
 $$
 b \to b + \Delta
 \et
-w \to w + x \Delta
+w \to w + \Delta x
 $$
<div id="fig:percep_proj"> To see how it works, consider the four possible situations:
![View from above of the samples.](images/7-percep-plane.pdf){height=5.7cm}
![Gaussian of the samples on the projection
line.](images/7-percep-proj.pdf){height=5.7cm}
Aerial and lateral views of the projection direction, in blue, and the cut, in - $e = 1 \quad \wedge \quad f(x) = 1 \quad \dot \vee \quad e = 0 \quad \wedge
red. \quad f(x) = 0 \quad \Longrightarrow \quad \Delta = 0$
</div> the current estimations work properly: $b$ and $w$ do not need to be updated;
- $e = 1 \quad \wedge \quad f(x) = 0 \quad \Longrightarrow \quad
\Delta = 1$
the current $b$ and $w$ underestimate the correct output: they must be
increased;
- $e = 0 \quad \wedge \quad f(x) = 1 \quad \Longrightarrow \quad
\Delta = -1$
the current $b$ and $w$ overestimate the correct output: they must be
decreased.
It can be shown that this method converges to the coveted function. Whilst the $b$ updating is obvious, as regarsd $w$ the following consideration
As stated in the previous section, the weight vector must finally be normalized. may help clarify. Consider the case with $e = 0 \quad \wedge \quad f(x) = 1
\quad \Longrightarrow \quad \Delta = -1$:
$$
w^T \cdot x \to (w^T + \Delta x^T) \cdot x
= w^T \cdot x + \Delta |x|^2
= w^T \cdot x - |x|^2 \leq w^T \cdot x
$$
+Similarly for the case with $e = 1$ and $f(x) = 0$.
+
+As far as convergence is concerned, the perceptron will never reach the state
+in which all the input points are classified correctly if the training set is
+not linearly separable, meaning that the signal cannot be separated from the
+noise by a line in the plane. In this case, no approximate solution will be
+gradually approached either. On the other hand, if the training set is
+linearly separable, it can be shown that this method converges to the desired
+function [@novikoff63].
+
+As in the previous section, once found, the weight vector is to be normalized.
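For concreteness, the whole procedure fits in a few lines of GSL code. The function below is only a sketch of the rule described above, not the exercise's `percep_train()`: the container layout (one point per row, labels 1 for signal and 0 for noise) and the final normalisation of both $w$ and $b$ are assumptions made for the example.

```c
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_blas.h>

/* Illustrative perceptron training: data holds one 2D point per row,
 * expected the labels e, r is the learning rate, N the number of passes. */
void perceptron_sketch(const gsl_matrix *data, const gsl_vector *expected,
                       double r, int N, gsl_vector *w, double *b) {
  gsl_vector_set_zero(w);                 /* initial guess: w = (0, 0) */
  *b = 0;                                 /* initial guess: b = 0      */
  for (int n = 0; n < N; n++)
    for (size_t i = 0; i < data->size1; i++) {
      gsl_vector_const_view row = gsl_matrix_const_row(data, i);
      const gsl_vector *x = &row.vector;
      double proj;
      gsl_blas_ddot(w, x, &proj);                   /* w^T x        */
      double f = (proj + *b >= 0) ? 1 : 0;          /* θ(w^T x + b) */
      double delta = r * (gsl_vector_get(expected, i) - f);
      *b += delta;                                  /* b → b + Δ    */
      gsl_blas_daxpy(delta, x, w);                  /* w → w + Δx   */
    }
  /* normalise: dividing both w and b by |w| leaves f(x) unchanged */
  double norm = gsl_blas_dnrm2(w);
  if (norm > 0) {
    gsl_vector_scale(w, 1 / norm);
    *b /= norm;
  }
}
```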
-With $N_c = 5$, the values of $w$ and $t_{\text{cut}}$ level off up to the third
+With $N = 5$ iterations, the values of $w$ and $t_{\text{cut}}$ level off up to the third
 digit. The following results were obtained:
 $$
@@ -287,7 +385,16 @@ where, once again, $t_{\text{cut}}$ is computed from the origin of the axes. In
 this case, the projection line does not lie along the means of the two
 samples. Plots are shown in @fig:percep_proj.

-## Efficiency test
+<div id="fig:percep_proj">
+![View from above of the samples.](images/7-percep-plane.pdf){height=5.7cm}
+![Gaussian of the samples on the projection
+line.](images/7-percep-proj.pdf){height=5.7cm}
+
+Aerial and lateral views of the projection direction, in blue, and the cut, in
+red.
+</div>
+
+## Efficiency test {#sec:7_results}

 A program was implemented to check the validity of the two
 classification methods.

View File

@@ -1,3 +1,6 @@
 - rewrite the readme for: 1, 2, 6
 - proofread 7
 - complete 2
+- rewrite 2
+
+On the Lambert W function: formula 4.19, Corless