ex-7: revised and typo-fixed
In addition, the folder ex-7/iters was created in order to plot the results of the Perceptron method as a function of the iterations parameter.
This commit is contained in:
parent
90ab77b676
commit
7ad45a829e
@ -39,9 +39,9 @@ gsl_vector* normal_mean(struct par *p) {
|
||||
* between the two classes.
|
||||
* The projection vector w is given by
|
||||
*
|
||||
* w = Sw⁻¹ (μ₂ - μ₁)
|
||||
* w = Σ_w⁻¹ (μ₂ - μ₁)
|
||||
*
|
||||
* where Sw = Σ₁ + Σ₂ is the so-called within-class
|
||||
* where Σ_w = Σ₁ + Σ₂ is the so-called within-class
|
||||
* covariance matrix.
|
||||
*/
|
||||
gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
|
||||
@ -54,10 +54,11 @@ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
|
||||
gsl_vector *mu2 = normal_mean(&c2->p);
|
||||
|
||||
/* Compute the inverse of the within-class
|
||||
* covariance Sw⁻¹.
|
||||
* covariance Σ_w⁻¹.
|
||||
* Note: by definition Σ is symmetrical and
|
||||
* positive-definite, so Cholesky is appropriate.
|
||||
*/
|
||||
|
||||
gsl_matrix_add(cov1, cov2);
|
||||
gsl_linalg_cholesky_decomp(cov1);
|
||||
gsl_linalg_cholesky_invert(cov1);
|
||||
@ -67,7 +68,7 @@ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
|
||||
gsl_vector_memcpy(diff, mu2);
|
||||
gsl_vector_sub(diff, mu1);
|
||||
|
||||
/* Finally multiply diff by Sw.
|
||||
/* Finally multiply diff by Σ_w.
|
||||
* This uses the rather low-level CBLAS
|
||||
* functions gsl_blas_dgemv:
|
||||
*
|
||||
|
25
ex-7/iters/iter.py
Normal file
25
ex-7/iters/iter.py
Normal file
@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
iter, w_x, w_y, b = np.loadtxt('ex-7/iters/iters.txt')
|
||||
|
||||
plt.figure(figsize=(5, 4))
|
||||
plt.rcParams['font.size'] = 8
|
||||
|
||||
plt.subplot(211)
|
||||
plt.title('weight vector', loc='right')
|
||||
plt.plot(iter, w, color='#92182b')
|
||||
plt.ylabel('w')
|
||||
plt.xlabel('N')
|
||||
|
||||
plt.subplot(212)
|
||||
plt.title('bias', loc='right')
|
||||
plt.plot(iter, b, color='gray')
|
||||
plt.ylabel('b')
|
||||
plt.xlabel('N')
|
||||
|
||||
plt.tight_layout()
|
||||
plt.show()
|
11
ex-7/iters/iters.txt
Normal file
11
ex-7/iters/iters.txt
Normal file
@ -0,0 +1,11 @@
|
||||
#iters w_x w_y b b
|
||||
1 0.427 0.904 1.749
|
||||
2 0.566 0.824 1.445
|
||||
3 0.654 0.756 1.213
|
||||
4 0.654 0.756 1.213
|
||||
5 0.654 0.756 1.213
|
||||
7 0.654 0.756 1.213
|
||||
6 0.654 0.756 1.213
|
||||
8 0.654 0.756 1.213
|
||||
9 0.654 0.756 1.213
|
||||
10 0.654 0.756 1.213
|
13
ex-7/main.c
13
ex-7/main.c
@ -95,7 +95,7 @@ int main(int argc, char **argv) {
|
||||
* dataset to get an approximate
|
||||
* solution in `iter` iterations.
|
||||
*/
|
||||
fputs("# Perceptron \n\n", stderr);
|
||||
// fputs("# Perceptron \n\n", stderr);
|
||||
w = percep_train(signal, noise, opts.iter, &cut);
|
||||
}
|
||||
else {
|
||||
@ -107,20 +107,21 @@ int main(int argc, char **argv) {
|
||||
/* Print the results of the method
|
||||
* selected: weights and threshold.
|
||||
*/
|
||||
fprintf(stderr, "\n* i: %d\n", opts.iter);
|
||||
fprintf(stderr, "* w: [%.3f, %.3f]\n",
|
||||
gsl_vector_get(w, 0),
|
||||
gsl_vector_get(w, 1));
|
||||
fprintf(stderr, "* cut: %.3f\n", cut);
|
||||
gsl_vector_fprintf(stdout, w, "%g");
|
||||
printf("%f\n", cut);
|
||||
// gsl_vector_fprintf(stdout, w, "%g");
|
||||
// printf("%f\n", cut);
|
||||
|
||||
/* Print data to stdout for plotting.
|
||||
* Note: we print the sizes to be able
|
||||
* to set apart the two matrices.
|
||||
*/
|
||||
printf("%ld %ld %d\n", opts.nsig, opts.nnoise, 2);
|
||||
gsl_matrix_fprintf(stdout, signal->data, "%g");
|
||||
gsl_matrix_fprintf(stdout, noise->data, "%g");
|
||||
// printf("%ld %ld %d\n", opts.nsig, opts.nnoise, 2);
|
||||
// gsl_matrix_fprintf(stdout, signal->data, "%g");
|
||||
// gsl_matrix_fprintf(stdout, noise->data, "%g");
|
||||
|
||||
// free memory
|
||||
gsl_rng_free(r);
|
||||
|
17
ex-7/plot.py
17
ex-7/plot.py
@ -7,7 +7,6 @@ def line(x, y, **args):
|
||||
'''line between two points x,y'''
|
||||
plot([x[0], y[0]], [x[1], y[1]], **args)
|
||||
|
||||
rcParams['font.size'] = 12
|
||||
w = loadtxt(sys.stdin, max_rows=2)
|
||||
v = array([[0, -1], [1, 0]]) @ w
|
||||
cut = float(input())
|
||||
@ -16,7 +15,11 @@ n, m, d = map(int, input().split())
|
||||
data = loadtxt(sys.stdin).reshape(n + m, d)
|
||||
signal, noise = data[:n].T, data[n:].T
|
||||
|
||||
plt.figure(figsize=(3, 3))
|
||||
rcParams['font.size'] = 8
|
||||
figure()
|
||||
figure(figsize=(3, 3))
|
||||
rcParams['font.size'] = 8
|
||||
subplot(aspect='equal')
|
||||
scatter(*signal, edgecolor='xkcd:charcoal',
|
||||
c='xkcd:dark yellow', label='signal')
|
||||
@ -24,18 +27,22 @@ scatter(*noise, edgecolor='xkcd:charcoal',
|
||||
c='xkcd:pale purple', label='noise')
|
||||
line(-20*w, 20*w, c='xkcd:midnight blue', label='projection')
|
||||
line(w-10*v, w+10*v, c='xkcd:scarlet', label='cut')
|
||||
xlabel('x')
|
||||
ylabel('y')
|
||||
xlim(-1.5, 8)
|
||||
ylim(-1.5, 8)
|
||||
legend()
|
||||
tight_layout()
|
||||
savefig('notes/images/7-fisher-plane.pdf')
|
||||
|
||||
figure()
|
||||
plt.figure(figsize=(3, 3))
|
||||
rcParams['font.size'] = 8
|
||||
sig_proj = np.dot(w, signal)
|
||||
noise_proj = np.dot(w, noise)
|
||||
hist(sig_proj, color='xkcd:dark yellow', label='signal')
|
||||
hist(noise_proj, color='xkcd:pale purple', label='noise')
|
||||
axvline(cut, c='xkcd:scarlet')
|
||||
axvline(cut, c='xkcd:scarlet', label='cut')
|
||||
xlabel('projection line')
|
||||
legend()
|
||||
tight_layout()
|
||||
|
||||
show()
|
||||
savefig('notes/images/7-fisher-proj.pdf')
|
||||
|
@ -140,17 +140,32 @@
|
||||
|
||||
@book{hecht02,
|
||||
title={Optics},
|
||||
author={Eugene Hecht},
|
||||
year={2002},
|
||||
publisher={Pearson},
|
||||
author={Eugene Hecht}
|
||||
publisher={Pearson}
|
||||
}
|
||||
|
||||
@article{lucy74,
|
||||
title={An iterative technique for the rectification of observed
|
||||
distributions},
|
||||
author={Lucy, Leon B},
|
||||
author={Lucy, Leon B.},
|
||||
journal={The astronomical journal},
|
||||
volume={79},
|
||||
pages={745},
|
||||
year={1974}
|
||||
}
|
||||
|
||||
@book{bishop06,
|
||||
title={Pattern Recognition and Machine Learning},
|
||||
author={Bishop, Christopher M.},
|
||||
year={2006},
|
||||
pages={186 -- 189},
|
||||
publisher={Springer}
|
||||
}
|
||||
|
||||
@techreport{novikoff63,
|
||||
title={On convergence proofs for perceptrons},
|
||||
author={Novikoff, Albert B},
|
||||
year={1963},
|
||||
institution={Stanford Researhc INST Menlo Park CA}
|
||||
}
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -2,9 +2,8 @@
|
||||
|
||||
## Generating points according to Gaussian distributions {#sec:sampling}
|
||||
|
||||
The first task of exercise 7 is to generate two sets of 2D points $(x, y)$
|
||||
according to two bivariate Gaussian distributions with parameters:
|
||||
|
||||
Two sets of 2D points $(x, y)$ - signal and noise - is to be generated according
|
||||
to two bivariate Gaussian distributions with parameters:
|
||||
$$
|
||||
\text{signal} \quad
|
||||
\begin{cases}
|
||||
@ -21,262 +20,361 @@ $$
|
||||
\end{cases}
|
||||
$$
|
||||
|
||||
where $\mu$ stands for the mean, $\sigma_x$ and $\sigma_y$ are the standard
|
||||
where $\mu$ stands for the mean, $\sigma_x$ and $\sigma_y$ for the standard
|
||||
deviations in $x$ and $y$ directions respectively and $\rho$ is the bivariate
|
||||
correlation, hence:
|
||||
|
||||
correlation, namely:
|
||||
$$
|
||||
\sigma_{xy} = \rho \sigma_x \sigma_y
|
||||
$$
|
||||
|
||||
where $\sigma_{xy}$ is the covariance of $x$ and $y$.
|
||||
In the code, default settings are $N_s = 800$ points for the signal and $N_n =
|
||||
1000$ points for the noise but can be changed from the command-line. Both
|
||||
samples were handled as matrices of dimension $n$ x 2, where $n$ is the number
|
||||
of points in the sample. The library `gsl_matrix` provided by GSL was employed
|
||||
for this purpose and the function `gsl_ran_bivariate_gaussian()` was used for
|
||||
generating the points.
|
||||
1000$ points for the noise but can be customized from the input command-line.
|
||||
Both samples were handled as matrices of dimension $n$ x 2, where $n$ is the
|
||||
number of points in the sample. The library `gsl_matrix` provided by GSL was
|
||||
employed for this purpose and the function `gsl_ran_bivariate_gaussian()` was
|
||||
used for generating the points.
|
||||
An example of the two samples is shown in @fig:points.
|
||||
|
||||
![Example of points sorted according to two Gaussian with
|
||||
the given parameters. Noise points in pink and signal points
|
||||
in yellow.](images/7-points.pdf){#fig:points}
|
||||
![Example of points sampled according to the two Gaussian distributions
|
||||
with the given parameters.](images/7-points.pdf){#fig:points}
|
||||
|
||||
Assuming not to know how the points were generated, a model of classification
|
||||
must then be implemented in order to assign each point to the right class
|
||||
is then to be implemented in order to assign each point to the right class
|
||||
(signal or noise) to which it 'most probably' belongs to. The point is how
|
||||
'most probably' can be interpreted and implemented.
|
||||
Here, the Fisher linear discriminant and the Perceptron were implemented and
|
||||
described in the following two sections. The results are compared in
|
||||
@sec:7_results.
|
||||
|
||||
|
||||
## Fisher linear discriminant
|
||||
|
||||
|
||||
### The projection direction
|
||||
|
||||
The Fisher linear discriminant (FLD) is a linear classification model based on
|
||||
dimensionality reduction. It allows to reduce this 2D classification problem
|
||||
into a one-dimensional decision surface.
|
||||
|
||||
Consider the case of two classes (in this case the signal and the noise): the
|
||||
simplest representation of a linear discriminant is obtained by taking a linear
|
||||
function of a sampled 2D point $x$ so that:
|
||||
|
||||
Consider the case of two classes (in this case signal and noise): the simplest
|
||||
representation of a linear discriminant is obtained by taking a linear function
|
||||
$\hat{x}$ of a sampled 2D point $x$ so that:
|
||||
$$
|
||||
\hat{x} = w^T x
|
||||
$$
|
||||
|
||||
where $w$ is the so-called 'weight vector'. An input point $x$ is commonly
|
||||
assigned to the first class if $\hat{x} \geqslant w_{th}$ and to the second one
|
||||
otherwise, where $w_{th}$ is a threshold value somehow defined.
|
||||
In general, the projection onto one dimension leads to a considerable loss of
|
||||
information and classes that are well separated in the original 2D space may
|
||||
become strongly overlapping in one dimension. However, by adjusting the
|
||||
components of the weight vector, a projection that maximizes the classes
|
||||
separation can be selected.
|
||||
where $w$ is the so-called 'weight vector' and $w^T$ stands for its transpose.
|
||||
An input point $x$ is commonly assigned to the first class if $\hat{x} \geqslant
|
||||
w_{th}$ and to the second one otherwise, where $w_{th}$ is a threshold value
|
||||
somehow defined. In general, the projection onto one dimension leads to a
|
||||
considerable loss of information and classes that are well separated in the
|
||||
original 2D space may become strongly overlapping in one dimension. However, by
|
||||
adjusting the components of the weight vector, a projection that maximizes the
|
||||
classes separation can be selected [@bishop06].
|
||||
To begin with, consider $N_1$ points of class $C_1$ and $N_2$ points of class
|
||||
$C_2$, so that the means $m_1$ and $m_2$ of the two classes are given by:
|
||||
|
||||
$C_2$, so that the means $\mu_1$ and $\mu_2$ of the two classes are given by:
|
||||
$$
|
||||
m_1 = \frac{1}{N_1} \sum_{n \in C_1} x_n
|
||||
\mu_1 = \frac{1}{N_1} \sum_{n \in C_1} x_n
|
||||
\et
|
||||
m_2 = \frac{1}{N_2} \sum_{n \in C_2} x_n
|
||||
\mu_2 = \frac{1}{N_2} \sum_{n \in C_2} x_n
|
||||
$$
|
||||
|
||||
The simplest measure of the separation of the classes is the separation of the
|
||||
projected class means. This suggests to choose $w$ so as to maximize:
|
||||
|
||||
$$
|
||||
\hat{m}_2 − \hat{m}_1 = w^T (m_2 − m_1)
|
||||
\hat{\mu}_2 − \hat{\mu}_1 = w^T (\mu_2 − \mu_1)
|
||||
$$
|
||||
|
||||
This expression can be made arbitrarily large simply by increasing the magnitude
|
||||
of $w$. To solve this problem, $w$ can be constrained to have unit length, so
|
||||
that $| w^2 | = 1$. Using a Lagrange multiplier to perform the constrained
|
||||
maximization, it can be found that $w \propto (m_2 − m_1)$.
|
||||
|
||||
![The plot on the left shows samples from two classes along with the histograms
|
||||
resulting from projection onto the line joining the class means: note that
|
||||
there is considerable overlap in the projected space. The right plot shows the
|
||||
corresponding projection based on the Fisher linear discriminant, showing the
|
||||
greatly improved classes separation.](images/7-fisher.png){#fig:overlap}
|
||||
|
||||
maximization, it can be found that $w \propto (\mu_2 − \mu_1)$, meaning that the
|
||||
line onto the points must be projected is the one joining the class means.
|
||||
There is still a problem with this approach, however, as illustrated in
|
||||
@fig:overlap: the two classes are well separated in the original 2D space but
|
||||
have considerable overlap when projected onto the line joining their means.
|
||||
have considerable overlap when projected onto the line joining their means
|
||||
which maximize their projections distance.
|
||||
|
||||
![The plot on the left shows samples from two classes along with the
|
||||
histograms resulting fromthe projection onto the line joining the
|
||||
class means: note that there is considerable overlap in the projected
|
||||
space. The right plot shows the corresponding projection based on the
|
||||
Fisher linear discriminant, showing the greatly improved classes
|
||||
separation. Fifure from [@bishop06]](images/7-fisher.png){#fig:overlap}
|
||||
|
||||
The idea to solve it is to maximize a function that will give a large separation
|
||||
between the projected classes means while also giving a small variance within
|
||||
each class, thereby minimizing the class overlap.
|
||||
The within-classes variance of the transformed data of each class $k$ is given
|
||||
The within-class variance of the transformed data of each class $k$ is given
|
||||
by:
|
||||
|
||||
$$
|
||||
s_k^2 = \sum_{n \in C_k} (\hat{x}_n - \hat{m}_k)^2
|
||||
\hat{s}_k^2 = \sum_{n \in c_k} (\hat{x}_n - \hat{\mu}_k)^2
|
||||
$$
|
||||
|
||||
The total within-classes variance for the whole data set can be simply defined
|
||||
as $s^2 = s_1^2 + s_2^2$. The Fisher criterion is therefore defined to be the
|
||||
ratio of the between-classes distance to the within-classes variance and is
|
||||
The total within-class variance for the whole data set is simply defined as
|
||||
$\hat{s}^2 = \hat{s}_1^2 + \hat{s}_2^2$. The Fisher criterion is defined to
|
||||
be the ratio of the between-class distance to the within-class variance and is
|
||||
given by:
|
||||
|
||||
$$
|
||||
J(w) = \frac{(\hat{m}_2 - \hat{m}_1)^2}{s^2}
|
||||
F(w) = \frac{(\hat{\mu}_2 - \hat{\mu}_1)^2}{\hat{s}^2}
|
||||
$$
|
||||
|
||||
Differentiating $J(w)$ with respect to $w$, it can be found that it is
|
||||
maximized when:
|
||||
The dependence on $w$ can be made explicit:
|
||||
\begin{align*}
|
||||
(\hat{\mu}_2 - \hat{\mu}_1)^2 &= (w^T \mu_2 - w^T \mu_1)^2 \\
|
||||
&= [w^T (\mu_2 - \mu_1)]^2 \\
|
||||
&= [w^T (\mu_2 - \mu_1)][w^T (\mu_2 - \mu_1)] \\
|
||||
&= [w^T (\mu_2 - \mu_1)][(\mu_2 - \mu_1)^T w]
|
||||
= w^T M w
|
||||
\end{align*}
|
||||
|
||||
where $M$ is the between-distance matrix. Similarly, as regards the denominator:
|
||||
\begin{align*}
|
||||
\hat{s}^2 &= \hat{s}_1^2 + \hat{s}_2^2 = \\
|
||||
&= \sum_{n \in c_1} (\hat{x}_n - \hat{\mu}_1)^2
|
||||
+ \sum_{n \in c_2} (\hat{x}_n - \hat{\mu}_2)^2
|
||||
= w^T \Sigma_w w
|
||||
\end{align*}
|
||||
|
||||
where $\Sigma_w$ is the total within-class covariance matrix:
|
||||
\begin{align*}
|
||||
\Sigma_w &= \sum_{n \in c_1} (x_n − \mu_1)(x_n − \mu_1)^T
|
||||
+ \sum_{n \in c_2} (x_n − \mu_2)(x_n − \mu_2)^T \\
|
||||
&= \Sigma_1 + \Sigma_2
|
||||
= \begin{pmatrix}
|
||||
\sigma_x^2 & \sigma_{xy} \\
|
||||
\sigma_{xy} & \sigma_y^2
|
||||
\end{pmatrix}_1 +
|
||||
\begin{pmatrix}
|
||||
\sigma_x^2 & \sigma_{xy} \\
|
||||
\sigma_{xy} & \sigma_y^2
|
||||
\end{pmatrix}_2
|
||||
\end{align*}
|
||||
|
||||
Where $\Sigma_1$ and $\Sigma_2$ are the covariance matrix of the two samples.
|
||||
The Fisher criterion can therefore be rewritten in the form:
|
||||
$$
|
||||
w = S_b^{-1} (m_2 - m_1)
|
||||
F(w) = \frac{w^T M w}{w^T \Sigma_w w}
|
||||
$$
|
||||
|
||||
where $S_b$ is the covariance matrix, given by:
|
||||
|
||||
Differentiating with respect to $w$, it can be found that $F(w)$ is maximized
|
||||
when:
|
||||
$$
|
||||
S_b = S_1 + S_2
|
||||
w = \Sigma_w^{-1} (\mu_2 - \mu_1)
|
||||
$$
|
||||
|
||||
where $S_1$ and $S_2$ are the covariance matrix of the two classes, namely:
|
||||
|
||||
$$
|
||||
\begin{pmatrix}
|
||||
\sigma_x^2 & \sigma_{xy} \\
|
||||
\sigma_{xy} & \sigma_y^2
|
||||
\end{pmatrix}
|
||||
$$
|
||||
|
||||
This is not truly a discriminant but rather a specific choice of direction for
|
||||
projection of the data down to one dimension: the projected data can then be
|
||||
This is not truly a discriminant but rather a specific choice of the direction
|
||||
for projection of the data down to one dimension: the projected data can then be
|
||||
used to construct a discriminant by choosing a threshold for the
|
||||
classification.
|
||||
|
||||
When implemented, the parameters given in @sec:sampling were used to compute
|
||||
the covariance matrices $S_1$ and $S_2$ of the two classes and their sum $S$.
|
||||
Then $S$, being a symmetrical and positive-definite matrix, was inverted with
|
||||
the Cholesky method, already discussed in @sec:MLM.
|
||||
Lastly, the matrix-vector product was computed with the `gsl_blas_dgemv()`
|
||||
function provided by GSL.
|
||||
the covariance matrices and their sum $\Sigma_w$. Then $\Sigma_w$, being a
|
||||
symmetrical and positive-definite matrix, was inverted with the Cholesky method,
|
||||
already discussed in @sec:MLM. Lastly, the matrix-vector product was computed
|
||||
with the `gsl_blas_dgemv()` function provided by GSL.
|
||||
|
||||
|
||||
### The threshold
|
||||
|
||||
The cut was fixed by the condition of conditional probability being the same
|
||||
for each class:
|
||||
|
||||
The threshold $t_{\text{cut}}$ was fixed by the condition of conditional
|
||||
probability $P(c_k | t_{\text{cut}})$ being the same for both classes $c_k$:
|
||||
$$
|
||||
t_{\text{cut}} = x \, | \hspace{20pt}
|
||||
\frac{P(c_1 | x)}{P(c_2 | x)} =
|
||||
\frac{p(x | c_1) \, p(c_1)}{p(x | c_1) \, p(c_2)} = 1
|
||||
\frac{P(x | c_1) \, P(c_1)}{P(x | c_1) \, P(c_2)} = 1
|
||||
$$
|
||||
|
||||
where $p(x | c_k)$ is the probability for point $x$ along the Fisher projection
|
||||
line of belonging to the class $k$. If the classes are bivariate Gaussian, as
|
||||
in the present case, then $p(x | c_k)$ is simply given by its projected normal
|
||||
distribution $\mathscr{G} (\hat{μ}, \hat{S})$. With a bit of math, the solution
|
||||
is then:
|
||||
|
||||
where $P(x | c_k)$ is the probability for point $x$ along the Fisher projection
|
||||
line of being sampled according to the class $k$. If each class is a bivariate
|
||||
Gaussian, as in the present case, then $P(x | c_k)$ is simply given by its
|
||||
projected normal distribution with mean $\hat{m} = w^T m$ and variance $\hat{s}
|
||||
= w^T S w$, being $S$ the covariance matrix of the class.
|
||||
With a bit of math, the following solution can be found:
|
||||
$$
|
||||
t = \frac{b}{a} + \sqrt{\left( \frac{b}{a} \right)^2 - \frac{c}{a}}
|
||||
t_{\text{cut}} = \frac{b}{a}
|
||||
+ \sqrt{\left( \frac{b}{a} \right)^2 - \frac{c}{a}}
|
||||
$$
|
||||
|
||||
where:
|
||||
|
||||
- $a = \hat{S}_1^2 - \hat{S}_2^2$
|
||||
- $b = \hat{m}_2 \, \hat{S}_1^2 - \hat{M}_1 \, \hat{S}_2^2$
|
||||
- $c = \hat{M}_2^2 \, \hat{S}_1^2 - \hat{M}_1^2 \, \hat{S}_2^2
|
||||
- 2 \, \hat{S}_1^2 \, \hat{S}_2^2 \, \ln(\alpha)$
|
||||
- $\alpha = p(c_1) / p(c_2)$
|
||||
|
||||
The ratio of the prior probability $\alpha$ was computed as:
|
||||
- $a = \hat{s}_1^2 - \hat{s}_2^2$
|
||||
- $b = \hat{\mu}_2 \, \hat{s}_1^2 - \hat{\mu}_1 \, \hat{s}_2^2$
|
||||
- $c = \hat{\mu}_2^2 \, \hat{s}_1^2 - \hat{\mu}_1^2 \, \hat{s}_2^2
|
||||
- 2 \, \hat{s}_1^2 \, \hat{s}_2^2 \, \ln(\alpha)$
|
||||
- $\alpha = P(c_1) / P(c_2)$
|
||||
|
||||
The ratio of the prior probabilities $\alpha$ is simply given by:
|
||||
$$
|
||||
\alpha = \frac{N_s}{N_n}
|
||||
$$
|
||||
|
||||
The projection of the points was accomplished by the use of the function
|
||||
`gsl_blas_ddot()`, which computed a dot product between two vectors, which in
|
||||
this case were the weight vector and the position of the point to be projected.
|
||||
`gsl_blas_ddot()`, which computes the element wise product between two vectors.
|
||||
|
||||
Results obtained for the same samples in @fig:points are shown in
|
||||
@fig:fisher_proj. The weight vector and the treshold were found to be:
|
||||
$$
|
||||
w = (0.707, 0.707) \et
|
||||
t_{\text{cut}} = 1.323
|
||||
$$
|
||||
|
||||
<div id="fig:fisher_proj">
|
||||
![View from above of the samples.](images/7-fisher-plane.pdf){height=5.7cm}
|
||||
![Gaussian of the samples on the projection
|
||||
line.](images/7-fisher-proj.pdf){height=5.7cm}
|
||||
![View of the samples in the plane.](images/7-fisher-plane.pdf)
|
||||
![View of the samples projections onto the projection
|
||||
line.](images/7-fisher-proj.pdf)
|
||||
|
||||
Aerial and lateral views of the projection direction, in blue, and the cut, in
|
||||
red.
|
||||
Aerial and lateral views of the samples. Projection line in blu and cut in red.
|
||||
</div>
|
||||
|
||||
Results obtained for the same sample in @fig:points are shown in
|
||||
@fig:fisher_proj. The weight vector $w$ was found to be:
|
||||
|
||||
Since the vector $w$ turned out to be parallel with the line joining the means
|
||||
of the two classes (reminded to be $(0, 0)$ and $(4, 4)$), one can be mislead
|
||||
and assume that the inverse of the total covariance matrix $\Sigma_w$ is
|
||||
isotropic, namely proportional to the unit matrix.
|
||||
That's not true. In this special sample, the vector joining the means turns out
|
||||
to be an eigenvector of the covariance matrix $\Sigma_w^{-1}$. In fact: since
|
||||
$\sigma_x = \sigma_y$ for both signal and noise:
|
||||
$$
|
||||
w = (0.707, 0.707)
|
||||
\Sigma_1 = \begin{pmatrix}
|
||||
\sigma_x^2 & \sigma_{xy} \\
|
||||
\sigma_{xy} & \sigma_x^2
|
||||
\end{pmatrix}_1
|
||||
\et
|
||||
\Sigma_2 = \begin{pmatrix}
|
||||
\sigma_x^2 & \sigma_{xy} \\
|
||||
\sigma_{xy} & \sigma_x^2
|
||||
\end{pmatrix}_2
|
||||
$$
|
||||
|
||||
and $t_{\text{cut}}$ is 1.323 far from the origin of the axes. Hence, as can be
|
||||
seen, the vector $w$ turned out to be parallel to the line joining the means of
|
||||
the two classes (reminded to be $(0, 0)$ and $(4, 4)$) which means that the
|
||||
total covariance matrix $S$ is isotropic, proportional to the unit matrix.
|
||||
$\Sigma_w$ takes the form:
|
||||
$$
|
||||
\Sigma_w = \begin{pmatrix}
|
||||
A & B \\
|
||||
B & A
|
||||
\end{pmatrix}
|
||||
$$
|
||||
|
||||
Which can be easily inverted by Gaussian elimination:
|
||||
\begin{align*}
|
||||
\begin{pmatrix}
|
||||
A & B & \vline & 1 & 0 \\
|
||||
B & A & \vline & 0 & 1 \\
|
||||
\end{pmatrix} &\longrightarrow
|
||||
\begin{pmatrix}
|
||||
A - B & 0 & \vline & 1 - B & - B \\
|
||||
0 & A - B & \vline & - B & 1 - B \\
|
||||
\end{pmatrix} \\ &\longrightarrow
|
||||
\begin{pmatrix}
|
||||
1 & 0 & \vline & (1 - B)/(A - B) & - B/(A - B) \\
|
||||
0 & 1 & \vline & - B/(A - B) & (1 - B)/(A - B) \\
|
||||
\end{pmatrix}
|
||||
\end{align*}
|
||||
|
||||
Hence:
|
||||
$$
|
||||
\Sigma_w^{-1} = \begin{pmatrix}
|
||||
\tilde{A} & \tilde{B} \\
|
||||
\tilde{B} & \tilde{A}
|
||||
\end{pmatrix}
|
||||
$$
|
||||
|
||||
Thus, $\Sigma_w$ and $\Sigma_w^{-1}$ share the same eigenvectors $v_1$ and
|
||||
$v_2$:
|
||||
$$
|
||||
v_1 = \begin{pmatrix}
|
||||
1 \\
|
||||
-1
|
||||
\end{pmatrix} \et
|
||||
v_2 = \begin{pmatrix}
|
||||
1 \\
|
||||
1
|
||||
\end{pmatrix}
|
||||
$$
|
||||
|
||||
and the vector joining the means is clearly a multiple of $v_2$, causing $w$ to
|
||||
be a multiple of it.
|
||||
|
||||
|
||||
## Perceptron
|
||||
|
||||
In machine learning, the perceptron is an algorithm for supervised learning of
|
||||
linear binary classifiers.
|
||||
|
||||
Supervised learning is the machine learning task of inferring a function $f$
|
||||
that maps an input $x$ to an output $f(x)$ based on a set of training
|
||||
input-output pairs. Each example is a pair consisting of an input object and an
|
||||
output value. The inferred function can be used for mapping new examples. The
|
||||
algorithm will be generalized to correctly determine the class labels for unseen
|
||||
instances.
|
||||
|
||||
The aim is to determine the bias $b$ such that the threshold function $f(x)$:
|
||||
input-output pairs, where each pair consists of an input object and an output
|
||||
value. The inferred function can be used for mapping new examples: the algorithm
|
||||
is generalized to correctly determine the class labels for unseen instances.
|
||||
|
||||
The aim of the perceptron algorithm is to determine the weight vector $w$ and
|
||||
bias $b$ such that the so-called 'threshold function' $f(x)$ returns a binary
|
||||
value: it is expected to return 1 for signal points and 0 for noise points:
|
||||
$$
|
||||
f(x) = x \cdot w + b \hspace{20pt}
|
||||
\begin{cases}
|
||||
\geqslant 0 \incase x \in \text{signal} \\
|
||||
< 0 \incase x \in \text{noise}
|
||||
\end{cases}
|
||||
f(x) = \theta(w^T \cdot x + b)
|
||||
$$ {#eq:perc}
|
||||
|
||||
The training was performed as follow. Initial values were set as $w = (0,0)$ and
|
||||
$b = 0$. From these, the perceptron starts to improve their estimations. The
|
||||
sample was passed point by point into a iterative procedure a grand total of
|
||||
$N_c$ calls: each time, the projection $w \cdot x$ of the point was computed
|
||||
and then the variable $\Delta$ was defined as:
|
||||
|
||||
where $\theta$ is the Heaviside theta function.
|
||||
The training was performed using the generated sample as training set. From an
|
||||
initial guess for $w$ and $b$ (which were set to be all null in the code), the
|
||||
perceptron starts to improve their estimations. The training set is passed point
|
||||
by point into a iterative procedure a customizable number $N$ of times: for
|
||||
every point, the output of $f(x)$ is computed. Afterwards, the variable
|
||||
$\Delta$, which is defined as:
|
||||
$$
|
||||
\Delta = r * (e - \theta (f(x))
|
||||
\Delta = r [e - f(x)]
|
||||
$$
|
||||
|
||||
where:
|
||||
|
||||
- $r$ is the learning rate of the perceptron: it is between 0 and 1. The
|
||||
larger $r$, the more volatile the weight changes. In the code, it was set
|
||||
$r = 0.8$;
|
||||
- $e$ is the expected value, namely 0 if $x$ is noise and 1 if it is signal;
|
||||
- $\theta$ is the Heaviside theta function;
|
||||
- $o$ is the observed value of $f(x)$ defined in @eq:perc.
|
||||
- $r \in [0, 1]$ is the learning rate of the perceptron: the larger $r$, the
|
||||
more volatile the weight changes. In the code it was arbitrarily set $r =
|
||||
0.8$;
|
||||
- $e$ is the expected output value, namely 1 if $x$ is signal and 0 if it is
|
||||
noise;
|
||||
|
||||
Then $b$ and $w$ must be updated as:
|
||||
is used to update $b$ and $w$:
|
||||
|
||||
$$
|
||||
b \to b + \Delta
|
||||
\et
|
||||
w \to w + x \Delta
|
||||
w \to w + \Delta x
|
||||
$$
|
||||
|
||||
<div id="fig:percep_proj">
|
||||
![View from above of the samples.](images/7-percep-plane.pdf){height=5.7cm}
|
||||
![Gaussian of the samples on the projection
|
||||
line.](images/7-percep-proj.pdf){height=5.7cm}
|
||||
To see how it works, consider the four possible situations:
|
||||
|
||||
Aerial and lateral views of the projection direction, in blue, and the cut, in
|
||||
red.
|
||||
</div>
|
||||
- $e = 1 \quad \wedge \quad f(x) = 1 \quad \dot \vee \quad e = 0 \quad \wedge
|
||||
\quad f(x) = 0 \quad \Longrightarrow \quad \Delta = 0$
|
||||
the current estimations work properly: $b$ and $w$ do not need to be updated;
|
||||
- $e = 1 \quad \wedge \quad f(x) = 0 \quad \Longrightarrow \quad
|
||||
\Delta = 1$
|
||||
the current $b$ and $w$ underestimate the correct output: they must be
|
||||
increased;
|
||||
- $e = 0 \quad \wedge \quad f(x) = 1 \quad \Longrightarrow \quad
|
||||
\Delta = -1$
|
||||
the current $b$ and $w$ overestimate the correct output: they must be
|
||||
decreased.
|
||||
|
||||
It can be shown that this method converges to the coveted function.
|
||||
As stated in the previous section, the weight vector must finally be normalized.
|
||||
Whilst the $b$ updating is obvious, as regarsd $w$ the following consideration
|
||||
may help clarify. Consider the case with $e = 0 \quad \wedge \quad f(x) = 1
|
||||
\quad \Longrightarrow \quad \Delta = -1$:
|
||||
$$
|
||||
w^T \cdot x \to (w^T + \Delta x^T) \cdot x
|
||||
= w^T \cdot x + \Delta |x|^2
|
||||
= w^T \cdot x - |x|^2 \leq w^T \cdot x
|
||||
$$
|
||||
|
||||
With $N_c = 5$, the values of $w$ and $t_{\text{cut}}$ level off up to the third
|
||||
Similarly for the case with $e = 1$ and $f(x) = 0$.
|
||||
|
||||
As far as convergence is concerned, the perceptron will never get to the state
|
||||
with all the input points classified correctly if the training set is not
|
||||
linearly separable, meaning that the signal cannot be separated from the noise
|
||||
by a line in the plane. In this case, no approximate solutions will be gradually
|
||||
approached. On the other hand, if the training set is linearly separable, it can
|
||||
be shown that this method converges to the coveted function [@novikoff63].
|
||||
As in the previous section, once found, the weight vector is to be normalized.
|
||||
|
||||
With $N = 5$ iterations, the values of $w$ and $t_{\text{cut}}$ level off up to the third
|
||||
digit. The following results were obtained:
|
||||
|
||||
$$
|
||||
@ -287,7 +385,16 @@ where, once again, $t_{\text{cut}}$ is computed from the origin of the axes. In
|
||||
this case, the projection line does not lies along the mains of the two
|
||||
samples. Plots in @fig:percep_proj.
|
||||
|
||||
## Efficiency test
|
||||
<div id="fig:percep_proj">
|
||||
![View from above of the samples.](images/7-percep-plane.pdf){height=5.7cm}
|
||||
![Gaussian of the samples on the projection
|
||||
line.](images/7-percep-proj.pdf){height=5.7cm}
|
||||
|
||||
Aerial and lateral views of the projection direction, in blue, and the cut, in
|
||||
red.
|
||||
</div>
|
||||
|
||||
## Efficiency test {#sec:7_results}
|
||||
|
||||
A program was implemented to check the validity of the two
|
||||
classification methods.
|
||||
|
@ -1,3 +1,6 @@
|
||||
- riscrivere il readme di: 1, 2, 6
|
||||
- rileggere il 7
|
||||
- completare il 2
|
||||
- riscrivere il 2
|
||||
|
||||
On the lambert W function, formula 4.19 Corless
|
||||
|
Loading…
Reference in New Issue
Block a user