ex-7: revised and typo-fixed
In addition, the folder ex-7/iters was created in order to plot the results of the Perceptron method as a function of the iterations parameter.
This commit is contained in:
parent
90ab77b676
commit
7ad45a829e
@ -39,9 +39,9 @@ gsl_vector* normal_mean(struct par *p) {
|
|||||||
* between the two classes.
|
* between the two classes.
|
||||||
* The projection vector w is given by
|
* The projection vector w is given by
|
||||||
*
|
*
|
||||||
* w = Sw⁻¹ (μ₂ - μ₁)
|
* w = Σ_w⁻¹ (μ₂ - μ₁)
|
||||||
*
|
*
|
||||||
* where Sw = Σ₁ + Σ₂ is the so-called within-class
|
* where Σ_w = Σ₁ + Σ₂ is the so-called within-class
|
||||||
* covariance matrix.
|
* covariance matrix.
|
||||||
*/
|
*/
|
||||||
gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
|
gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
|
||||||
@ -54,10 +54,11 @@ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
|
|||||||
gsl_vector *mu2 = normal_mean(&c2->p);
|
gsl_vector *mu2 = normal_mean(&c2->p);
|
||||||
|
|
||||||
/* Compute the inverse of the within-class
|
/* Compute the inverse of the within-class
|
||||||
* covariance Sw⁻¹.
|
* covariance Σ_w⁻¹.
|
||||||
* Note: by definition Σ is symmetrical and
|
* Note: by definition Σ is symmetrical and
|
||||||
* positive-definite, so Cholesky is appropriate.
|
* positive-definite, so Cholesky is appropriate.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
gsl_matrix_add(cov1, cov2);
|
gsl_matrix_add(cov1, cov2);
|
||||||
gsl_linalg_cholesky_decomp(cov1);
|
gsl_linalg_cholesky_decomp(cov1);
|
||||||
gsl_linalg_cholesky_invert(cov1);
|
gsl_linalg_cholesky_invert(cov1);
|
||||||
@ -67,7 +68,7 @@ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
|
|||||||
gsl_vector_memcpy(diff, mu2);
|
gsl_vector_memcpy(diff, mu2);
|
||||||
gsl_vector_sub(diff, mu1);
|
gsl_vector_sub(diff, mu1);
|
||||||
|
|
||||||
/* Finally multiply diff by Sw.
|
/* Finally multiply diff by Σ_w.
|
||||||
* This uses the rather low-level CBLAS
|
* This uses the rather low-level CBLAS
|
||||||
* functions gsl_blas_dgemv:
|
* functions gsl_blas_dgemv:
|
||||||
*
|
*
|
||||||
|
25
ex-7/iters/iter.py
Normal file
25
ex-7/iters/iter.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
|
iter, w_x, w_y, b = np.loadtxt('ex-7/iters/iters.txt')
|
||||||
|
|
||||||
|
plt.figure(figsize=(5, 4))
|
||||||
|
plt.rcParams['font.size'] = 8
|
||||||
|
|
||||||
|
plt.subplot(211)
|
||||||
|
plt.title('weight vector', loc='right')
|
||||||
|
plt.plot(iter, w, color='#92182b')
|
||||||
|
plt.ylabel('w')
|
||||||
|
plt.xlabel('N')
|
||||||
|
|
||||||
|
plt.subplot(212)
|
||||||
|
plt.title('bias', loc='right')
|
||||||
|
plt.plot(iter, b, color='gray')
|
||||||
|
plt.ylabel('b')
|
||||||
|
plt.xlabel('N')
|
||||||
|
|
||||||
|
plt.tight_layout()
|
||||||
|
plt.show()
|
11
ex-7/iters/iters.txt
Normal file
11
ex-7/iters/iters.txt
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
#iters w_x w_y b b
|
||||||
|
1 0.427 0.904 1.749
|
||||||
|
2 0.566 0.824 1.445
|
||||||
|
3 0.654 0.756 1.213
|
||||||
|
4 0.654 0.756 1.213
|
||||||
|
5 0.654 0.756 1.213
|
||||||
|
7 0.654 0.756 1.213
|
||||||
|
6 0.654 0.756 1.213
|
||||||
|
8 0.654 0.756 1.213
|
||||||
|
9 0.654 0.756 1.213
|
||||||
|
10 0.654 0.756 1.213
|
13
ex-7/main.c
13
ex-7/main.c
@ -95,7 +95,7 @@ int main(int argc, char **argv) {
|
|||||||
* dataset to get an approximate
|
* dataset to get an approximate
|
||||||
* solution in `iter` iterations.
|
* solution in `iter` iterations.
|
||||||
*/
|
*/
|
||||||
fputs("# Perceptron \n\n", stderr);
|
// fputs("# Perceptron \n\n", stderr);
|
||||||
w = percep_train(signal, noise, opts.iter, &cut);
|
w = percep_train(signal, noise, opts.iter, &cut);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@ -107,20 +107,21 @@ int main(int argc, char **argv) {
|
|||||||
/* Print the results of the method
|
/* Print the results of the method
|
||||||
* selected: weights and threshold.
|
* selected: weights and threshold.
|
||||||
*/
|
*/
|
||||||
|
fprintf(stderr, "\n* i: %d\n", opts.iter);
|
||||||
fprintf(stderr, "* w: [%.3f, %.3f]\n",
|
fprintf(stderr, "* w: [%.3f, %.3f]\n",
|
||||||
gsl_vector_get(w, 0),
|
gsl_vector_get(w, 0),
|
||||||
gsl_vector_get(w, 1));
|
gsl_vector_get(w, 1));
|
||||||
fprintf(stderr, "* cut: %.3f\n", cut);
|
fprintf(stderr, "* cut: %.3f\n", cut);
|
||||||
gsl_vector_fprintf(stdout, w, "%g");
|
// gsl_vector_fprintf(stdout, w, "%g");
|
||||||
printf("%f\n", cut);
|
// printf("%f\n", cut);
|
||||||
|
|
||||||
/* Print data to stdout for plotting.
|
/* Print data to stdout for plotting.
|
||||||
* Note: we print the sizes to be able
|
* Note: we print the sizes to be able
|
||||||
* to set apart the two matrices.
|
* to set apart the two matrices.
|
||||||
*/
|
*/
|
||||||
printf("%ld %ld %d\n", opts.nsig, opts.nnoise, 2);
|
// printf("%ld %ld %d\n", opts.nsig, opts.nnoise, 2);
|
||||||
gsl_matrix_fprintf(stdout, signal->data, "%g");
|
// gsl_matrix_fprintf(stdout, signal->data, "%g");
|
||||||
gsl_matrix_fprintf(stdout, noise->data, "%g");
|
// gsl_matrix_fprintf(stdout, noise->data, "%g");
|
||||||
|
|
||||||
// free memory
|
// free memory
|
||||||
gsl_rng_free(r);
|
gsl_rng_free(r);
|
||||||
|
17
ex-7/plot.py
17
ex-7/plot.py
@ -7,7 +7,6 @@ def line(x, y, **args):
|
|||||||
'''line between two points x,y'''
|
'''line between two points x,y'''
|
||||||
plot([x[0], y[0]], [x[1], y[1]], **args)
|
plot([x[0], y[0]], [x[1], y[1]], **args)
|
||||||
|
|
||||||
rcParams['font.size'] = 12
|
|
||||||
w = loadtxt(sys.stdin, max_rows=2)
|
w = loadtxt(sys.stdin, max_rows=2)
|
||||||
v = array([[0, -1], [1, 0]]) @ w
|
v = array([[0, -1], [1, 0]]) @ w
|
||||||
cut = float(input())
|
cut = float(input())
|
||||||
@ -16,7 +15,11 @@ n, m, d = map(int, input().split())
|
|||||||
data = loadtxt(sys.stdin).reshape(n + m, d)
|
data = loadtxt(sys.stdin).reshape(n + m, d)
|
||||||
signal, noise = data[:n].T, data[n:].T
|
signal, noise = data[:n].T, data[n:].T
|
||||||
|
|
||||||
|
plt.figure(figsize=(3, 3))
|
||||||
|
rcParams['font.size'] = 8
|
||||||
figure()
|
figure()
|
||||||
|
figure(figsize=(3, 3))
|
||||||
|
rcParams['font.size'] = 8
|
||||||
subplot(aspect='equal')
|
subplot(aspect='equal')
|
||||||
scatter(*signal, edgecolor='xkcd:charcoal',
|
scatter(*signal, edgecolor='xkcd:charcoal',
|
||||||
c='xkcd:dark yellow', label='signal')
|
c='xkcd:dark yellow', label='signal')
|
||||||
@ -24,18 +27,22 @@ scatter(*noise, edgecolor='xkcd:charcoal',
|
|||||||
c='xkcd:pale purple', label='noise')
|
c='xkcd:pale purple', label='noise')
|
||||||
line(-20*w, 20*w, c='xkcd:midnight blue', label='projection')
|
line(-20*w, 20*w, c='xkcd:midnight blue', label='projection')
|
||||||
line(w-10*v, w+10*v, c='xkcd:scarlet', label='cut')
|
line(w-10*v, w+10*v, c='xkcd:scarlet', label='cut')
|
||||||
|
xlabel('x')
|
||||||
|
ylabel('y')
|
||||||
xlim(-1.5, 8)
|
xlim(-1.5, 8)
|
||||||
ylim(-1.5, 8)
|
ylim(-1.5, 8)
|
||||||
legend()
|
legend()
|
||||||
tight_layout()
|
tight_layout()
|
||||||
|
savefig('notes/images/7-fisher-plane.pdf')
|
||||||
|
|
||||||
figure()
|
plt.figure(figsize=(3, 3))
|
||||||
|
rcParams['font.size'] = 8
|
||||||
sig_proj = np.dot(w, signal)
|
sig_proj = np.dot(w, signal)
|
||||||
noise_proj = np.dot(w, noise)
|
noise_proj = np.dot(w, noise)
|
||||||
hist(sig_proj, color='xkcd:dark yellow', label='signal')
|
hist(sig_proj, color='xkcd:dark yellow', label='signal')
|
||||||
hist(noise_proj, color='xkcd:pale purple', label='noise')
|
hist(noise_proj, color='xkcd:pale purple', label='noise')
|
||||||
axvline(cut, c='xkcd:scarlet')
|
axvline(cut, c='xkcd:scarlet', label='cut')
|
||||||
|
xlabel('projection line')
|
||||||
legend()
|
legend()
|
||||||
tight_layout()
|
tight_layout()
|
||||||
|
savefig('notes/images/7-fisher-proj.pdf')
|
||||||
show()
|
|
||||||
|
@ -140,17 +140,32 @@
|
|||||||
|
|
||||||
@book{hecht02,
|
@book{hecht02,
|
||||||
title={Optics},
|
title={Optics},
|
||||||
|
author={Eugene Hecht},
|
||||||
year={2002},
|
year={2002},
|
||||||
publisher={Pearson},
|
publisher={Pearson}
|
||||||
author={Eugene Hecht}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@article{lucy74,
|
@article{lucy74,
|
||||||
title={An iterative technique for the rectification of observed
|
title={An iterative technique for the rectification of observed
|
||||||
distributions},
|
distributions},
|
||||||
author={Lucy, Leon B},
|
author={Lucy, Leon B.},
|
||||||
journal={The astronomical journal},
|
journal={The astronomical journal},
|
||||||
volume={79},
|
volume={79},
|
||||||
pages={745},
|
pages={745},
|
||||||
year={1974}
|
year={1974}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@book{bishop06,
|
||||||
|
title={Pattern Recognition and Machine Learning},
|
||||||
|
author={Bishop, Christopher M.},
|
||||||
|
year={2006},
|
||||||
|
pages={186 -- 189},
|
||||||
|
publisher={Springer}
|
||||||
|
}
|
||||||
|
|
||||||
|
@techreport{novikoff63,
|
||||||
|
title={On convergence proofs for perceptrons},
|
||||||
|
author={Novikoff, Albert B},
|
||||||
|
year={1963},
|
||||||
|
institution={Stanford Researhc INST Menlo Park CA}
|
||||||
|
}
|
||||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -2,9 +2,8 @@
|
|||||||
|
|
||||||
## Generating points according to Gaussian distributions {#sec:sampling}
|
## Generating points according to Gaussian distributions {#sec:sampling}
|
||||||
|
|
||||||
The first task of exercise 7 is to generate two sets of 2D points $(x, y)$
|
Two sets of 2D points $(x, y)$ - signal and noise - is to be generated according
|
||||||
according to two bivariate Gaussian distributions with parameters:
|
to two bivariate Gaussian distributions with parameters:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\text{signal} \quad
|
\text{signal} \quad
|
||||||
\begin{cases}
|
\begin{cases}
|
||||||
@ -21,262 +20,361 @@ $$
|
|||||||
\end{cases}
|
\end{cases}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
where $\mu$ stands for the mean, $\sigma_x$ and $\sigma_y$ are the standard
|
where $\mu$ stands for the mean, $\sigma_x$ and $\sigma_y$ for the standard
|
||||||
deviations in $x$ and $y$ directions respectively and $\rho$ is the bivariate
|
deviations in $x$ and $y$ directions respectively and $\rho$ is the bivariate
|
||||||
correlation, hence:
|
correlation, namely:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\sigma_{xy} = \rho \sigma_x \sigma_y
|
\sigma_{xy} = \rho \sigma_x \sigma_y
|
||||||
$$
|
$$
|
||||||
|
|
||||||
where $\sigma_{xy}$ is the covariance of $x$ and $y$.
|
where $\sigma_{xy}$ is the covariance of $x$ and $y$.
|
||||||
In the code, default settings are $N_s = 800$ points for the signal and $N_n =
|
In the code, default settings are $N_s = 800$ points for the signal and $N_n =
|
||||||
1000$ points for the noise but can be changed from the command-line. Both
|
1000$ points for the noise but can be customized from the input command-line.
|
||||||
samples were handled as matrices of dimension $n$ x 2, where $n$ is the number
|
Both samples were handled as matrices of dimension $n$ x 2, where $n$ is the
|
||||||
of points in the sample. The library `gsl_matrix` provided by GSL was employed
|
number of points in the sample. The library `gsl_matrix` provided by GSL was
|
||||||
for this purpose and the function `gsl_ran_bivariate_gaussian()` was used for
|
employed for this purpose and the function `gsl_ran_bivariate_gaussian()` was
|
||||||
generating the points.
|
used for generating the points.
|
||||||
An example of the two samples is shown in @fig:points.
|
An example of the two samples is shown in @fig:points.
|
||||||
|
|
||||||
![Example of points sorted according to two Gaussian with
|
![Example of points sampled according to the two Gaussian distributions
|
||||||
the given parameters. Noise points in pink and signal points
|
with the given parameters.](images/7-points.pdf){#fig:points}
|
||||||
in yellow.](images/7-points.pdf){#fig:points}
|
|
||||||
|
|
||||||
Assuming not to know how the points were generated, a model of classification
|
Assuming not to know how the points were generated, a model of classification
|
||||||
must then be implemented in order to assign each point to the right class
|
is then to be implemented in order to assign each point to the right class
|
||||||
(signal or noise) to which it 'most probably' belongs to. The point is how
|
(signal or noise) to which it 'most probably' belongs to. The point is how
|
||||||
'most probably' can be interpreted and implemented.
|
'most probably' can be interpreted and implemented.
|
||||||
|
Here, the Fisher linear discriminant and the Perceptron were implemented and
|
||||||
|
described in the following two sections. The results are compared in
|
||||||
|
@sec:7_results.
|
||||||
|
|
||||||
|
|
||||||
## Fisher linear discriminant
|
## Fisher linear discriminant
|
||||||
|
|
||||||
|
|
||||||
### The projection direction
|
### The projection direction
|
||||||
|
|
||||||
The Fisher linear discriminant (FLD) is a linear classification model based on
|
The Fisher linear discriminant (FLD) is a linear classification model based on
|
||||||
dimensionality reduction. It allows to reduce this 2D classification problem
|
dimensionality reduction. It allows to reduce this 2D classification problem
|
||||||
into a one-dimensional decision surface.
|
into a one-dimensional decision surface.
|
||||||
|
|
||||||
Consider the case of two classes (in this case the signal and the noise): the
|
Consider the case of two classes (in this case signal and noise): the simplest
|
||||||
simplest representation of a linear discriminant is obtained by taking a linear
|
representation of a linear discriminant is obtained by taking a linear function
|
||||||
function of a sampled 2D point $x$ so that:
|
$\hat{x}$ of a sampled 2D point $x$ so that:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\hat{x} = w^T x
|
\hat{x} = w^T x
|
||||||
$$
|
$$
|
||||||
|
|
||||||
where $w$ is the so-called 'weight vector'. An input point $x$ is commonly
|
where $w$ is the so-called 'weight vector' and $w^T$ stands for its transpose.
|
||||||
assigned to the first class if $\hat{x} \geqslant w_{th}$ and to the second one
|
An input point $x$ is commonly assigned to the first class if $\hat{x} \geqslant
|
||||||
otherwise, where $w_{th}$ is a threshold value somehow defined.
|
w_{th}$ and to the second one otherwise, where $w_{th}$ is a threshold value
|
||||||
In general, the projection onto one dimension leads to a considerable loss of
|
somehow defined. In general, the projection onto one dimension leads to a
|
||||||
information and classes that are well separated in the original 2D space may
|
considerable loss of information and classes that are well separated in the
|
||||||
become strongly overlapping in one dimension. However, by adjusting the
|
original 2D space may become strongly overlapping in one dimension. However, by
|
||||||
components of the weight vector, a projection that maximizes the classes
|
adjusting the components of the weight vector, a projection that maximizes the
|
||||||
separation can be selected.
|
classes separation can be selected [@bishop06].
|
||||||
To begin with, consider $N_1$ points of class $C_1$ and $N_2$ points of class
|
To begin with, consider $N_1$ points of class $C_1$ and $N_2$ points of class
|
||||||
$C_2$, so that the means $m_1$ and $m_2$ of the two classes are given by:
|
$C_2$, so that the means $\mu_1$ and $\mu_2$ of the two classes are given by:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
m_1 = \frac{1}{N_1} \sum_{n \in C_1} x_n
|
\mu_1 = \frac{1}{N_1} \sum_{n \in C_1} x_n
|
||||||
\et
|
\et
|
||||||
m_2 = \frac{1}{N_2} \sum_{n \in C_2} x_n
|
\mu_2 = \frac{1}{N_2} \sum_{n \in C_2} x_n
|
||||||
$$
|
$$
|
||||||
|
|
||||||
The simplest measure of the separation of the classes is the separation of the
|
The simplest measure of the separation of the classes is the separation of the
|
||||||
projected class means. This suggests to choose $w$ so as to maximize:
|
projected class means. This suggests to choose $w$ so as to maximize:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\hat{m}_2 − \hat{m}_1 = w^T (m_2 − m_1)
|
\hat{\mu}_2 − \hat{\mu}_1 = w^T (\mu_2 − \mu_1)
|
||||||
$$
|
$$
|
||||||
|
|
||||||
This expression can be made arbitrarily large simply by increasing the magnitude
|
This expression can be made arbitrarily large simply by increasing the magnitude
|
||||||
of $w$. To solve this problem, $w$ can be constrained to have unit length, so
|
of $w$. To solve this problem, $w$ can be constrained to have unit length, so
|
||||||
that $| w^2 | = 1$. Using a Lagrange multiplier to perform the constrained
|
that $| w^2 | = 1$. Using a Lagrange multiplier to perform the constrained
|
||||||
maximization, it can be found that $w \propto (m_2 − m_1)$.
|
maximization, it can be found that $w \propto (\mu_2 − \mu_1)$, meaning that the
|
||||||
|
line onto the points must be projected is the one joining the class means.
|
||||||
![The plot on the left shows samples from two classes along with the histograms
|
|
||||||
resulting from projection onto the line joining the class means: note that
|
|
||||||
there is considerable overlap in the projected space. The right plot shows the
|
|
||||||
corresponding projection based on the Fisher linear discriminant, showing the
|
|
||||||
greatly improved classes separation.](images/7-fisher.png){#fig:overlap}
|
|
||||||
|
|
||||||
There is still a problem with this approach, however, as illustrated in
|
There is still a problem with this approach, however, as illustrated in
|
||||||
@fig:overlap: the two classes are well separated in the original 2D space but
|
@fig:overlap: the two classes are well separated in the original 2D space but
|
||||||
have considerable overlap when projected onto the line joining their means.
|
have considerable overlap when projected onto the line joining their means
|
||||||
|
which maximize their projections distance.
|
||||||
|
|
||||||
|
![The plot on the left shows samples from two classes along with the
|
||||||
|
histograms resulting fromthe projection onto the line joining the
|
||||||
|
class means: note that there is considerable overlap in the projected
|
||||||
|
space. The right plot shows the corresponding projection based on the
|
||||||
|
Fisher linear discriminant, showing the greatly improved classes
|
||||||
|
separation. Fifure from [@bishop06]](images/7-fisher.png){#fig:overlap}
|
||||||
|
|
||||||
The idea to solve it is to maximize a function that will give a large separation
|
The idea to solve it is to maximize a function that will give a large separation
|
||||||
between the projected classes means while also giving a small variance within
|
between the projected classes means while also giving a small variance within
|
||||||
each class, thereby minimizing the class overlap.
|
each class, thereby minimizing the class overlap.
|
||||||
The within-classes variance of the transformed data of each class $k$ is given
|
The within-class variance of the transformed data of each class $k$ is given
|
||||||
by:
|
by:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
s_k^2 = \sum_{n \in C_k} (\hat{x}_n - \hat{m}_k)^2
|
\hat{s}_k^2 = \sum_{n \in c_k} (\hat{x}_n - \hat{\mu}_k)^2
|
||||||
$$
|
$$
|
||||||
|
|
||||||
The total within-classes variance for the whole data set can be simply defined
|
The total within-class variance for the whole data set is simply defined as
|
||||||
as $s^2 = s_1^2 + s_2^2$. The Fisher criterion is therefore defined to be the
|
$\hat{s}^2 = \hat{s}_1^2 + \hat{s}_2^2$. The Fisher criterion is defined to
|
||||||
ratio of the between-classes distance to the within-classes variance and is
|
be the ratio of the between-class distance to the within-class variance and is
|
||||||
given by:
|
given by:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
J(w) = \frac{(\hat{m}_2 - \hat{m}_1)^2}{s^2}
|
F(w) = \frac{(\hat{\mu}_2 - \hat{\mu}_1)^2}{\hat{s}^2}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
Differentiating $J(w)$ with respect to $w$, it can be found that it is
|
The dependence on $w$ can be made explicit:
|
||||||
maximized when:
|
\begin{align*}
|
||||||
|
(\hat{\mu}_2 - \hat{\mu}_1)^2 &= (w^T \mu_2 - w^T \mu_1)^2 \\
|
||||||
|
&= [w^T (\mu_2 - \mu_1)]^2 \\
|
||||||
|
&= [w^T (\mu_2 - \mu_1)][w^T (\mu_2 - \mu_1)] \\
|
||||||
|
&= [w^T (\mu_2 - \mu_1)][(\mu_2 - \mu_1)^T w]
|
||||||
|
= w^T M w
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
where $M$ is the between-distance matrix. Similarly, as regards the denominator:
|
||||||
|
\begin{align*}
|
||||||
|
\hat{s}^2 &= \hat{s}_1^2 + \hat{s}_2^2 = \\
|
||||||
|
&= \sum_{n \in c_1} (\hat{x}_n - \hat{\mu}_1)^2
|
||||||
|
+ \sum_{n \in c_2} (\hat{x}_n - \hat{\mu}_2)^2
|
||||||
|
= w^T \Sigma_w w
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
where $\Sigma_w$ is the total within-class covariance matrix:
|
||||||
|
\begin{align*}
|
||||||
|
\Sigma_w &= \sum_{n \in c_1} (x_n − \mu_1)(x_n − \mu_1)^T
|
||||||
|
+ \sum_{n \in c_2} (x_n − \mu_2)(x_n − \mu_2)^T \\
|
||||||
|
&= \Sigma_1 + \Sigma_2
|
||||||
|
= \begin{pmatrix}
|
||||||
|
\sigma_x^2 & \sigma_{xy} \\
|
||||||
|
\sigma_{xy} & \sigma_y^2
|
||||||
|
\end{pmatrix}_1 +
|
||||||
|
\begin{pmatrix}
|
||||||
|
\sigma_x^2 & \sigma_{xy} \\
|
||||||
|
\sigma_{xy} & \sigma_y^2
|
||||||
|
\end{pmatrix}_2
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
Where $\Sigma_1$ and $\Sigma_2$ are the covariance matrix of the two samples.
|
||||||
|
The Fisher criterion can therefore be rewritten in the form:
|
||||||
$$
|
$$
|
||||||
w = S_b^{-1} (m_2 - m_1)
|
F(w) = \frac{w^T M w}{w^T \Sigma_w w}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
where $S_b$ is the covariance matrix, given by:
|
Differentiating with respect to $w$, it can be found that $F(w)$ is maximized
|
||||||
|
when:
|
||||||
$$
|
$$
|
||||||
S_b = S_1 + S_2
|
w = \Sigma_w^{-1} (\mu_2 - \mu_1)
|
||||||
$$
|
$$
|
||||||
|
|
||||||
where $S_1$ and $S_2$ are the covariance matrix of the two classes, namely:
|
This is not truly a discriminant but rather a specific choice of the direction
|
||||||
|
for projection of the data down to one dimension: the projected data can then be
|
||||||
$$
|
|
||||||
\begin{pmatrix}
|
|
||||||
\sigma_x^2 & \sigma_{xy} \\
|
|
||||||
\sigma_{xy} & \sigma_y^2
|
|
||||||
\end{pmatrix}
|
|
||||||
$$
|
|
||||||
|
|
||||||
This is not truly a discriminant but rather a specific choice of direction for
|
|
||||||
projection of the data down to one dimension: the projected data can then be
|
|
||||||
used to construct a discriminant by choosing a threshold for the
|
used to construct a discriminant by choosing a threshold for the
|
||||||
classification.
|
classification.
|
||||||
|
|
||||||
When implemented, the parameters given in @sec:sampling were used to compute
|
When implemented, the parameters given in @sec:sampling were used to compute
|
||||||
the covariance matrices $S_1$ and $S_2$ of the two classes and their sum $S$.
|
the covariance matrices and their sum $\Sigma_w$. Then $\Sigma_w$, being a
|
||||||
Then $S$, being a symmetrical and positive-definite matrix, was inverted with
|
symmetrical and positive-definite matrix, was inverted with the Cholesky method,
|
||||||
the Cholesky method, already discussed in @sec:MLM.
|
already discussed in @sec:MLM. Lastly, the matrix-vector product was computed
|
||||||
Lastly, the matrix-vector product was computed with the `gsl_blas_dgemv()`
|
with the `gsl_blas_dgemv()` function provided by GSL.
|
||||||
function provided by GSL.
|
|
||||||
|
|
||||||
|
|
||||||
### The threshold
|
### The threshold
|
||||||
|
|
||||||
The cut was fixed by the condition of conditional probability being the same
|
The threshold $t_{\text{cut}}$ was fixed by the condition of conditional
|
||||||
for each class:
|
probability $P(c_k | t_{\text{cut}})$ being the same for both classes $c_k$:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
t_{\text{cut}} = x \, | \hspace{20pt}
|
t_{\text{cut}} = x \, | \hspace{20pt}
|
||||||
\frac{P(c_1 | x)}{P(c_2 | x)} =
|
\frac{P(c_1 | x)}{P(c_2 | x)} =
|
||||||
\frac{p(x | c_1) \, p(c_1)}{p(x | c_1) \, p(c_2)} = 1
|
\frac{P(x | c_1) \, P(c_1)}{P(x | c_1) \, P(c_2)} = 1
|
||||||
$$
|
$$
|
||||||
|
|
||||||
where $p(x | c_k)$ is the probability for point $x$ along the Fisher projection
|
where $P(x | c_k)$ is the probability for point $x$ along the Fisher projection
|
||||||
line of belonging to the class $k$. If the classes are bivariate Gaussian, as
|
line of being sampled according to the class $k$. If each class is a bivariate
|
||||||
in the present case, then $p(x | c_k)$ is simply given by its projected normal
|
Gaussian, as in the present case, then $P(x | c_k)$ is simply given by its
|
||||||
distribution $\mathscr{G} (\hat{μ}, \hat{S})$. With a bit of math, the solution
|
projected normal distribution with mean $\hat{m} = w^T m$ and variance $\hat{s}
|
||||||
is then:
|
= w^T S w$, being $S$ the covariance matrix of the class.
|
||||||
|
With a bit of math, the following solution can be found:
|
||||||
$$
|
$$
|
||||||
t = \frac{b}{a} + \sqrt{\left( \frac{b}{a} \right)^2 - \frac{c}{a}}
|
t_{\text{cut}} = \frac{b}{a}
|
||||||
|
+ \sqrt{\left( \frac{b}{a} \right)^2 - \frac{c}{a}}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
where:
|
where:
|
||||||
|
|
||||||
- $a = \hat{S}_1^2 - \hat{S}_2^2$
|
- $a = \hat{s}_1^2 - \hat{s}_2^2$
|
||||||
- $b = \hat{m}_2 \, \hat{S}_1^2 - \hat{M}_1 \, \hat{S}_2^2$
|
- $b = \hat{\mu}_2 \, \hat{s}_1^2 - \hat{\mu}_1 \, \hat{s}_2^2$
|
||||||
- $c = \hat{M}_2^2 \, \hat{S}_1^2 - \hat{M}_1^2 \, \hat{S}_2^2
|
- $c = \hat{\mu}_2^2 \, \hat{s}_1^2 - \hat{\mu}_1^2 \, \hat{s}_2^2
|
||||||
- 2 \, \hat{S}_1^2 \, \hat{S}_2^2 \, \ln(\alpha)$
|
- 2 \, \hat{s}_1^2 \, \hat{s}_2^2 \, \ln(\alpha)$
|
||||||
- $\alpha = p(c_1) / p(c_2)$
|
- $\alpha = P(c_1) / P(c_2)$
|
||||||
|
|
||||||
The ratio of the prior probability $\alpha$ was computed as:
|
|
||||||
|
|
||||||
|
The ratio of the prior probabilities $\alpha$ is simply given by:
|
||||||
$$
|
$$
|
||||||
\alpha = \frac{N_s}{N_n}
|
\alpha = \frac{N_s}{N_n}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
The projection of the points was accomplished by the use of the function
|
The projection of the points was accomplished by the use of the function
|
||||||
`gsl_blas_ddot()`, which computed a dot product between two vectors, which in
|
`gsl_blas_ddot()`, which computes the element wise product between two vectors.
|
||||||
this case were the weight vector and the position of the point to be projected.
|
|
||||||
|
Results obtained for the same samples in @fig:points are shown in
|
||||||
|
@fig:fisher_proj. The weight vector and the treshold were found to be:
|
||||||
|
$$
|
||||||
|
w = (0.707, 0.707) \et
|
||||||
|
t_{\text{cut}} = 1.323
|
||||||
|
$$
|
||||||
|
|
||||||
<div id="fig:fisher_proj">
|
<div id="fig:fisher_proj">
|
||||||
![View from above of the samples.](images/7-fisher-plane.pdf){height=5.7cm}
|
![View of the samples in the plane.](images/7-fisher-plane.pdf)
|
||||||
![Gaussian of the samples on the projection
|
![View of the samples projections onto the projection
|
||||||
line.](images/7-fisher-proj.pdf){height=5.7cm}
|
line.](images/7-fisher-proj.pdf)
|
||||||
|
|
||||||
Aerial and lateral views of the projection direction, in blue, and the cut, in
|
Aerial and lateral views of the samples. Projection line in blu and cut in red.
|
||||||
red.
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
Results obtained for the same sample in @fig:points are shown in
|
Since the vector $w$ turned out to be parallel with the line joining the means
|
||||||
@fig:fisher_proj. The weight vector $w$ was found to be:
|
of the two classes (reminded to be $(0, 0)$ and $(4, 4)$), one can be mislead
|
||||||
|
and assume that the inverse of the total covariance matrix $\Sigma_w$ is
|
||||||
|
isotropic, namely proportional to the unit matrix.
|
||||||
|
That's not true. In this special sample, the vector joining the means turns out
|
||||||
|
to be an eigenvector of the covariance matrix $\Sigma_w^{-1}$. In fact: since
|
||||||
|
$\sigma_x = \sigma_y$ for both signal and noise:
|
||||||
$$
|
$$
|
||||||
w = (0.707, 0.707)
|
\Sigma_1 = \begin{pmatrix}
|
||||||
|
\sigma_x^2 & \sigma_{xy} \\
|
||||||
|
\sigma_{xy} & \sigma_x^2
|
||||||
|
\end{pmatrix}_1
|
||||||
|
\et
|
||||||
|
\Sigma_2 = \begin{pmatrix}
|
||||||
|
\sigma_x^2 & \sigma_{xy} \\
|
||||||
|
\sigma_{xy} & \sigma_x^2
|
||||||
|
\end{pmatrix}_2
|
||||||
$$
|
$$
|
||||||
|
|
||||||
and $t_{\text{cut}}$ is 1.323 far from the origin of the axes. Hence, as can be
|
$\Sigma_w$ takes the form:
|
||||||
seen, the vector $w$ turned out to be parallel to the line joining the means of
|
$$
|
||||||
the two classes (reminded to be $(0, 0)$ and $(4, 4)$) which means that the
|
\Sigma_w = \begin{pmatrix}
|
||||||
total covariance matrix $S$ is isotropic, proportional to the unit matrix.
|
A & B \\
|
||||||
|
B & A
|
||||||
|
\end{pmatrix}
|
||||||
|
$$
|
||||||
|
|
||||||
|
Which can be easily inverted by Gaussian elimination:
|
||||||
|
\begin{align*}
|
||||||
|
\begin{pmatrix}
|
||||||
|
A & B & \vline & 1 & 0 \\
|
||||||
|
B & A & \vline & 0 & 1 \\
|
||||||
|
\end{pmatrix} &\longrightarrow
|
||||||
|
\begin{pmatrix}
|
||||||
|
A - B & 0 & \vline & 1 - B & - B \\
|
||||||
|
0 & A - B & \vline & - B & 1 - B \\
|
||||||
|
\end{pmatrix} \\ &\longrightarrow
|
||||||
|
\begin{pmatrix}
|
||||||
|
1 & 0 & \vline & (1 - B)/(A - B) & - B/(A - B) \\
|
||||||
|
0 & 1 & \vline & - B/(A - B) & (1 - B)/(A - B) \\
|
||||||
|
\end{pmatrix}
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
Hence:
|
||||||
|
$$
|
||||||
|
\Sigma_w^{-1} = \begin{pmatrix}
|
||||||
|
\tilde{A} & \tilde{B} \\
|
||||||
|
\tilde{B} & \tilde{A}
|
||||||
|
\end{pmatrix}
|
||||||
|
$$
|
||||||
|
|
||||||
|
Thus, $\Sigma_w$ and $\Sigma_w^{-1}$ share the same eigenvectors $v_1$ and
|
||||||
|
$v_2$:
|
||||||
|
$$
|
||||||
|
v_1 = \begin{pmatrix}
|
||||||
|
1 \\
|
||||||
|
-1
|
||||||
|
\end{pmatrix} \et
|
||||||
|
v_2 = \begin{pmatrix}
|
||||||
|
1 \\
|
||||||
|
1
|
||||||
|
\end{pmatrix}
|
||||||
|
$$
|
||||||
|
|
||||||
|
and the vector joining the means is clearly a multiple of $v_2$, causing $w$ to
|
||||||
|
be a multiple of it.
|
||||||
|
|
||||||
|
|
||||||
## Perceptron
|
## Perceptron
|
||||||
|
|
||||||
In machine learning, the perceptron is an algorithm for supervised learning of
|
In machine learning, the perceptron is an algorithm for supervised learning of
|
||||||
linear binary classifiers.
|
linear binary classifiers.
|
||||||
|
|
||||||
Supervised learning is the machine learning task of inferring a function $f$
|
Supervised learning is the machine learning task of inferring a function $f$
|
||||||
that maps an input $x$ to an output $f(x)$ based on a set of training
|
that maps an input $x$ to an output $f(x)$ based on a set of training
|
||||||
input-output pairs. Each example is a pair consisting of an input object and an
|
input-output pairs, where each pair consists of an input object and an output
|
||||||
output value. The inferred function can be used for mapping new examples. The
|
value. The inferred function can be used for mapping new examples: the algorithm
|
||||||
algorithm will be generalized to correctly determine the class labels for unseen
|
is generalized to correctly determine the class labels for unseen instances.
|
||||||
instances.
|
|
||||||
|
|
||||||
The aim is to determine the bias $b$ such that the threshold function $f(x)$:
|
|
||||||
|
|
||||||
|
The aim of the perceptron algorithm is to determine the weight vector $w$ and
|
||||||
|
bias $b$ such that the so-called 'threshold function' $f(x)$ returns a binary
|
||||||
|
value: it is expected to return 1 for signal points and 0 for noise points:
|
||||||
$$
|
$$
|
||||||
f(x) = x \cdot w + b \hspace{20pt}
|
f(x) = \theta(w^T \cdot x + b)
|
||||||
\begin{cases}
|
|
||||||
\geqslant 0 \incase x \in \text{signal} \\
|
|
||||||
< 0 \incase x \in \text{noise}
|
|
||||||
\end{cases}
|
|
||||||
$$ {#eq:perc}
|
$$ {#eq:perc}
|
||||||
|
|
||||||
The training was performed as follow. Initial values were set as $w = (0,0)$ and
|
where $\theta$ is the Heaviside theta function.
|
||||||
$b = 0$. From these, the perceptron starts to improve their estimations. The
|
The training was performed using the generated sample as training set. From an
|
||||||
sample was passed point by point into a iterative procedure a grand total of
|
initial guess for $w$ and $b$ (which were set to be all null in the code), the
|
||||||
$N_c$ calls: each time, the projection $w \cdot x$ of the point was computed
|
perceptron starts to improve their estimations. The training set is passed point
|
||||||
and then the variable $\Delta$ was defined as:
|
by point into a iterative procedure a customizable number $N$ of times: for
|
||||||
|
every point, the output of $f(x)$ is computed. Afterwards, the variable
|
||||||
|
$\Delta$, which is defined as:
|
||||||
$$
|
$$
|
||||||
\Delta = r * (e - \theta (f(x))
|
\Delta = r [e - f(x)]
|
||||||
$$
|
$$
|
||||||
|
|
||||||
where:
|
where:
|
||||||
|
|
||||||
- $r$ is the learning rate of the perceptron: it is between 0 and 1. The
|
- $r \in [0, 1]$ is the learning rate of the perceptron: the larger $r$, the
|
||||||
larger $r$, the more volatile the weight changes. In the code, it was set
|
more volatile the weight changes. In the code it was arbitrarily set $r =
|
||||||
$r = 0.8$;
|
0.8$;
|
||||||
- $e$ is the expected value, namely 0 if $x$ is noise and 1 if it is signal;
|
- $e$ is the expected output value, namely 1 if $x$ is signal and 0 if it is
|
||||||
- $\theta$ is the Heaviside theta function;
|
noise;
|
||||||
- $o$ is the observed value of $f(x)$ defined in @eq:perc.
|
|
||||||
|
|
||||||
Then $b$ and $w$ must be updated as:
|
is used to update $b$ and $w$:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
b \to b + \Delta
|
b \to b + \Delta
|
||||||
\et
|
\et
|
||||||
w \to w + x \Delta
|
w \to w + \Delta x
|
||||||
$$
|
$$
|
||||||
|
|
||||||
<div id="fig:percep_proj">
|
To see how it works, consider the four possible situations:
|
||||||
![View from above of the samples.](images/7-percep-plane.pdf){height=5.7cm}
|
|
||||||
![Gaussian of the samples on the projection
|
|
||||||
line.](images/7-percep-proj.pdf){height=5.7cm}
|
|
||||||
|
|
||||||
Aerial and lateral views of the projection direction, in blue, and the cut, in
|
- $e = 1 \quad \wedge \quad f(x) = 1 \quad \dot \vee \quad e = 0 \quad \wedge
|
||||||
red.
|
\quad f(x) = 0 \quad \Longrightarrow \quad \Delta = 0$
|
||||||
</div>
|
the current estimations work properly: $b$ and $w$ do not need to be updated;
|
||||||
|
- $e = 1 \quad \wedge \quad f(x) = 0 \quad \Longrightarrow \quad
|
||||||
|
\Delta = 1$
|
||||||
|
the current $b$ and $w$ underestimate the correct output: they must be
|
||||||
|
increased;
|
||||||
|
- $e = 0 \quad \wedge \quad f(x) = 1 \quad \Longrightarrow \quad
|
||||||
|
\Delta = -1$
|
||||||
|
the current $b$ and $w$ overestimate the correct output: they must be
|
||||||
|
decreased.
|
||||||
|
|
||||||
It can be shown that this method converges to the coveted function.
|
Whilst the $b$ updating is obvious, as regarsd $w$ the following consideration
|
||||||
As stated in the previous section, the weight vector must finally be normalized.
|
may help clarify. Consider the case with $e = 0 \quad \wedge \quad f(x) = 1
|
||||||
|
\quad \Longrightarrow \quad \Delta = -1$:
|
||||||
|
$$
|
||||||
|
w^T \cdot x \to (w^T + \Delta x^T) \cdot x
|
||||||
|
= w^T \cdot x + \Delta |x|^2
|
||||||
|
= w^T \cdot x - |x|^2 \leq w^T \cdot x
|
||||||
|
$$
|
||||||
|
|
||||||
With $N_c = 5$, the values of $w$ and $t_{\text{cut}}$ level off up to the third
|
Similarly for the case with $e = 1$ and $f(x) = 0$.
|
||||||
|
|
||||||
|
As far as convergence is concerned, the perceptron will never get to the state
|
||||||
|
with all the input points classified correctly if the training set is not
|
||||||
|
linearly separable, meaning that the signal cannot be separated from the noise
|
||||||
|
by a line in the plane. In this case, no approximate solutions will be gradually
|
||||||
|
approached. On the other hand, if the training set is linearly separable, it can
|
||||||
|
be shown that this method converges to the coveted function [@novikoff63].
|
||||||
|
As in the previous section, once found, the weight vector is to be normalized.
|
||||||
|
|
||||||
|
With $N = 5$ iterations, the values of $w$ and $t_{\text{cut}}$ level off up to the third
|
||||||
digit. The following results were obtained:
|
digit. The following results were obtained:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
@ -287,7 +385,16 @@ where, once again, $t_{\text{cut}}$ is computed from the origin of the axes. In
|
|||||||
this case, the projection line does not lies along the mains of the two
|
this case, the projection line does not lies along the mains of the two
|
||||||
samples. Plots in @fig:percep_proj.
|
samples. Plots in @fig:percep_proj.
|
||||||
|
|
||||||
## Efficiency test
|
<div id="fig:percep_proj">
|
||||||
|
![View from above of the samples.](images/7-percep-plane.pdf){height=5.7cm}
|
||||||
|
![Gaussian of the samples on the projection
|
||||||
|
line.](images/7-percep-proj.pdf){height=5.7cm}
|
||||||
|
|
||||||
|
Aerial and lateral views of the projection direction, in blue, and the cut, in
|
||||||
|
red.
|
||||||
|
</div>
|
||||||
|
|
||||||
|
## Efficiency test {#sec:7_results}
|
||||||
|
|
||||||
A program was implemented to check the validity of the two
|
A program was implemented to check the validity of the two
|
||||||
classification methods.
|
classification methods.
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
- riscrivere il readme di: 1, 2, 6
|
- riscrivere il readme di: 1, 2, 6
|
||||||
- rileggere il 7
|
- rileggere il 7
|
||||||
- completare il 2
|
- completare il 2
|
||||||
|
- riscrivere il 2
|
||||||
|
|
||||||
|
On the lambert W function, formula 4.19 Corless
|
||||||
|
Loading…
Reference in New Issue
Block a user