ex-7: revised and typo-fixed

In addition, the folder ex-7/iters was created in order to plot the results of the Perceptron method as a function of the iterations parameter.
2020-05-24 12:01:36 +02:00 · 2020-05-24 12:01:36 +02:00 · 7ad45a829e
commit 7ad45a829e
parent 90ab77b676
11 changed files with 332 additions and 162 deletions
--- a/ex-7/fisher.c
+++ b/ex-7/fisher.c
@ -39,9 +39,9 @@ gsl_vector* normal_mean(struct par *p) {
 * between the two classes.
 * The projection vector w is given by
 *
- *   w = Sw⁻¹ (μ₂ - μ₁)
+ *   w = Σ_w⁻¹ (μ₂ - μ₁)
 * 
- * where Sw = Σ₁ + Σ₂ is the so-called within-class
+ * where Σ_w = Σ₁ + Σ₂ is the so-called within-class
 * covariance matrix.
 */
 gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
@ -54,10 +54,11 @@ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
  gsl_vector *mu2 = normal_mean(&c2->p);

  /* Compute the inverse of the within-class
-   * covariance Sw⁻¹.
+   * covariance Σ_w⁻¹.
   * Note: by definition Σ is symmetrical and
   * positive-definite, so Cholesky is appropriate.
   */
+
  gsl_matrix_add(cov1, cov2);
  gsl_linalg_cholesky_decomp(cov1);
  gsl_linalg_cholesky_invert(cov1);
@ -67,7 +68,7 @@ gsl_vector* fisher_proj(sample_t *c1, sample_t *c2) {
  gsl_vector_memcpy(diff, mu2);
  gsl_vector_sub(diff, mu1);

-  /* Finally multiply diff by Sw.
+  /* Finally multiply diff by Σ_w.
   * This uses the rather low-level CBLAS
   * functions gsl_blas_dgemv:
   *
--- a/ex-7/iters/iter.py
+++ b/ex-7/iters/iter.py
@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+iter, w_x, w_y, b = np.loadtxt('ex-7/iters/iters.txt')
+
+plt.figure(figsize=(5, 4))
+plt.rcParams['font.size'] = 8
+
+plt.subplot(211)
+plt.title('weight vector', loc='right')
+plt.plot(iter, w, color='#92182b')
+plt.ylabel('w')
+plt.xlabel('N')
+
+plt.subplot(212)
+plt.title('bias', loc='right')
+plt.plot(iter, b, color='gray')
+plt.ylabel('b')
+plt.xlabel('N')
+
+plt.tight_layout()
+plt.show()
--- a/ex-7/iters/iters.txt
+++ b/ex-7/iters/iters.txt
@ -0,0 +1,11 @@
+#iters  w_x   w_y b  b
+ 1      0.427 0.904 1.749
+ 2      0.566 0.824 1.445
+ 3      0.654 0.756 1.213
+ 4      0.654 0.756 1.213
+ 5      0.654 0.756 1.213
+ 7      0.654 0.756 1.213
+ 6      0.654 0.756 1.213
+ 8      0.654 0.756 1.213
+ 9      0.654 0.756 1.213
+10      0.654 0.756 1.213
--- a/ex-7/main.c
+++ b/ex-7/main.c
@ -95,7 +95,7 @@ int main(int argc, char **argv) {
     * dataset to get an approximate
     * solution in `iter` iterations.
     */
-    fputs("# Perceptron \n\n", stderr);
+  //  fputs("# Perceptron \n\n", stderr);
    w = percep_train(signal, noise, opts.iter, &cut);
  }
  else {
@ -107,20 +107,21 @@ int main(int argc, char **argv) {
  /* Print the results of the method
   * selected: weights and threshold.
   */
+  fprintf(stderr, "\n* i: %d\n", opts.iter);
  fprintf(stderr, "* w: [%.3f, %.3f]\n",
      gsl_vector_get(w, 0),
      gsl_vector_get(w, 1));
  fprintf(stderr, "* cut: %.3f\n", cut);
-  gsl_vector_fprintf(stdout, w, "%g");
-  printf("%f\n", cut);
+//  gsl_vector_fprintf(stdout, w, "%g");
+//  printf("%f\n", cut);

  /* Print data to stdout for plotting.
   * Note: we print the sizes to be able
   * to set apart the two matrices.
   */
-  printf("%ld %ld %d\n", opts.nsig, opts.nnoise, 2);
-  gsl_matrix_fprintf(stdout, signal->data, "%g");
-  gsl_matrix_fprintf(stdout, noise->data,  "%g");
+//  printf("%ld %ld %d\n", opts.nsig, opts.nnoise, 2);
+//  gsl_matrix_fprintf(stdout, signal->data, "%g");
+//  gsl_matrix_fprintf(stdout, noise->data,  "%g");

  // free memory
  gsl_rng_free(r);
--- a/ex-7/plot.py
+++ b/ex-7/plot.py
@ -7,7 +7,6 @@ def line(x, y, **args):
    '''line between two points x,y'''
    plot([x[0], y[0]], [x[1], y[1]], **args)

-rcParams['font.size'] = 12
 w = loadtxt(sys.stdin, max_rows=2)
 v = array([[0, -1], [1, 0]]) @ w
 cut = float(input())
@ -16,7 +15,11 @@ n, m, d = map(int, input().split())
 data  = loadtxt(sys.stdin).reshape(n + m, d)
 signal, noise = data[:n].T, data[n:].T

+plt.figure(figsize=(3, 3))
+rcParams['font.size'] = 8
 figure()
+figure(figsize=(3, 3))
+rcParams['font.size'] = 8
 subplot(aspect='equal')
 scatter(*signal, edgecolor='xkcd:charcoal',
        c='xkcd:dark yellow', label='signal')
@ -24,18 +27,22 @@ scatter(*noise,  edgecolor='xkcd:charcoal',
        c='xkcd:pale purple', label='noise')
 line(-20*w, 20*w, c='xkcd:midnight blue', label='projection')
 line(w-10*v, w+10*v, c='xkcd:scarlet', label='cut')
+xlabel('x')
+ylabel('y')
 xlim(-1.5, 8)
 ylim(-1.5, 8)
 legend()
 tight_layout()
+savefig('notes/images/7-fisher-plane.pdf')

-figure()
+plt.figure(figsize=(3, 3))
+rcParams['font.size'] = 8
 sig_proj   = np.dot(w, signal)
 noise_proj = np.dot(w, noise)
 hist(sig_proj,   color='xkcd:dark yellow', label='signal')
 hist(noise_proj, color='xkcd:pale purple', label='noise')
-axvline(cut, c='xkcd:scarlet')
+axvline(cut, c='xkcd:scarlet', label='cut')
+xlabel('projection line')
 legend()
 tight_layout()
-
-show()
+savefig('notes/images/7-fisher-proj.pdf')
--- a/notes/docs/bibliography.bib
+++ b/notes/docs/bibliography.bib
@ -140,17 +140,32 @@

@book{hecht02,
  title={Optics},
+  author={Eugene Hecht},
  year={2002},
-  publisher={Pearson},
-  author={Eugene Hecht}
+  publisher={Pearson}
 }

@article{lucy74,
  title={An iterative technique for the rectification of observed
         distributions},
-  author={Lucy, Leon B},
+  author={Lucy, Leon B.},
  journal={The astronomical journal},
  volume={79},
  pages={745},
  year={1974}
 }
+
+@book{bishop06,
+  title={Pattern Recognition and Machine Learning},
+  author={Bishop, Christopher M.},
+  year={2006},
+  pages={186 -- 189},
+  publisher={Springer}
+}
+
+@techreport{novikoff63,
+  title={On convergence proofs for perceptrons},
+  author={Novikoff, Albert B},
+  year={1963},
+  institution={Stanford Researhc INST Menlo Park CA}
+}
--- a/notes/images/7-fisher-plane.pdf
+++ b/notes/images/7-fisher-plane.pdf
--- a/notes/images/7-fisher-proj.pdf
+++ b/notes/images/7-fisher-proj.pdf
--- a/notes/images/7-points.pdf
+++ b/notes/images/7-points.pdf
--- a/notes/sections/7.md
+++ b/notes/sections/7.md
@ -2,9 +2,8 @@

 ## Generating points according to Gaussian distributions {#sec:sampling}

-The first task of exercise 7 is to generate two sets of 2D points $(x, y)$
-according to two bivariate Gaussian distributions with parameters:
-
+Two sets of 2D points $(x, y)$ - signal and noise - is to be generated according
+to two bivariate Gaussian distributions with parameters:
 $$
 \text{signal} \quad
 \begin{cases}
@ -21,262 +20,361 @@ $$
 \end{cases}
 $$

-where $\mu$ stands for the mean, $\sigma_x$ and $\sigma_y$ are the standard
+where $\mu$ stands for the mean, $\sigma_x$ and $\sigma_y$ for the standard
 deviations in $x$ and $y$ directions respectively and $\rho$ is the bivariate
-correlation, hence:
-
+correlation, namely:
 $$
  \sigma_{xy} = \rho \sigma_x \sigma_y
 $$

 where $\sigma_{xy}$ is the covariance of $x$ and $y$.  
 In the code, default settings are $N_s = 800$ points for the signal and $N_n =
-1000$ points for the noise but can be changed from the command-line. Both
-samples were handled as matrices of dimension $n$ x 2, where $n$ is the number
-of points in the sample. The library `gsl_matrix` provided by GSL was employed
-for this purpose and the function `gsl_ran_bivariate_gaussian()` was used for
-generating the points.  
+1000$ points for the noise but can be customized from the input command-line.
+Both samples were handled as matrices of dimension $n$ x 2, where $n$ is the
+number of points in the sample. The library `gsl_matrix` provided by GSL was
+employed for this purpose and the function `gsl_ran_bivariate_gaussian()` was
+used for generating the points.  
 An example of the two samples is shown in @fig:points.

-![Example of points sorted according to two Gaussian with
-the given parameters. Noise points in pink and signal points
-in yellow.](images/7-points.pdf){#fig:points}
+![Example of points sampled according to the two Gaussian distributions
+with the given parameters.](images/7-points.pdf){#fig:points}

 Assuming not to know how the points were generated, a model of classification
-must then be implemented in order to assign each point to the right class
+is then to be implemented in order to assign each point to the right class
 (signal or noise) to which it 'most probably' belongs to. The point is how
-'most probably' can be interpreted and implemented.
+'most probably' can be interpreted and implemented.  
+Here, the Fisher linear discriminant and the Perceptron were implemented and
+described in the following two sections. The results are compared in
+@sec:7_results.
+

 ## Fisher linear discriminant

+
 ### The projection direction

 The Fisher linear discriminant (FLD) is a linear classification model based on
 dimensionality reduction. It allows to reduce this 2D classification problem
 into a one-dimensional decision surface.

-Consider the case of two classes (in this case the signal and the noise): the
-simplest representation of a linear discriminant is obtained by taking a linear
-function of a sampled 2D point $x$ so that:
-
+Consider the case of two classes (in this case signal and noise): the simplest
+representation of a linear discriminant is obtained by taking a linear function
+$\hat{x}$ of a sampled 2D point $x$ so that:
 $$
  \hat{x} = w^T x
 $$

-where $w$ is the so-called 'weight vector'. An input point $x$ is commonly
-assigned to the first class if $\hat{x} \geqslant w_{th}$ and to the second one
-otherwise, where $w_{th}$ is a threshold value somehow defined.  
-In general, the projection onto one dimension leads to a considerable loss of
-information and classes that are well separated in the original 2D space may
-become strongly overlapping in one dimension. However, by adjusting the
-components of the weight vector, a projection that maximizes the classes
-separation can be selected.  
+where $w$ is the so-called 'weight vector' and $w^T$ stands for its transpose.
+An input point $x$ is commonly assigned to the first class if $\hat{x} \geqslant
+w_{th}$ and to the second one otherwise, where $w_{th}$ is a threshold value
+somehow defined.   In general, the projection onto one dimension leads to a
+considerable loss of information and classes that are well separated in the
+original 2D space may become strongly overlapping in one dimension. However, by
+adjusting the components of the weight vector, a projection that maximizes the
+classes separation can be selected [@bishop06].  
 To begin with, consider $N_1$ points of class $C_1$ and $N_2$ points of class
-$C_2$, so that the means $m_1$ and $m_2$ of the two classes are given by:
-
+$C_2$, so that the means $\mu_1$ and $\mu_2$ of the two classes are given by:
 $$
-  m_1 = \frac{1}{N_1} \sum_{n \in C_1} x_n
+  \mu_1 = \frac{1}{N_1} \sum_{n \in C_1} x_n
  \et
-  m_2 = \frac{1}{N_2} \sum_{n \in C_2} x_n
+  \mu_2 = \frac{1}{N_2} \sum_{n \in C_2} x_n
 $$

 The simplest measure of the separation of the classes is the separation of the
 projected class means. This suggests to choose $w$ so as to maximize:
-
 $$
-  \hat{m}_2 − \hat{m}_1 = w^T (m_2 − m_1)
+  \hat{\mu}_2 − \hat{\mu}_1 = w^T (\mu_2 − \mu_1)
 $$

 This expression can be made arbitrarily large simply by increasing the magnitude
 of $w$. To solve this problem, $w$ can be constrained to have unit length, so
 that $| w^2 | = 1$. Using a Lagrange multiplier to perform the constrained
-maximization, it can be found that $w \propto (m_2 − m_1)$.
-
-![The plot on the left shows samples from two classes along with the histograms
-resulting from projection onto the line joining the class means: note that
-there is considerable overlap in the projected space. The right plot shows the
-corresponding projection based on the Fisher linear discriminant, showing the
-greatly improved classes separation.](images/7-fisher.png){#fig:overlap}
-
+maximization, it can be found that $w \propto (\mu_2 − \mu_1)$, meaning that the
+line onto the points must be projected is the one joining the class means.  
 There is still a problem with this approach, however, as illustrated in
@fig:overlap: the two classes are well separated in the original 2D space but
-have considerable overlap when projected onto the line joining their means.  
+have considerable overlap when projected onto the line joining their means
+which maximize their projections distance.
+
+![The plot on the left shows samples from two classes along with the
+histograms resulting fromthe  projection onto the line joining the
+class means: note that there is considerable overlap in the projected
+space. The right plot shows the corresponding projection based on the
+Fisher linear discriminant, showing the greatly improved classes
+separation. Fifure from [@bishop06]](images/7-fisher.png){#fig:overlap}
+
 The idea to solve it is to maximize a function that will give a large separation
 between the projected classes means while also giving a small variance within
 each class, thereby minimizing the class overlap.  
-The within-classes variance of the transformed data of each class $k$ is given
+The within-class variance of the transformed data of each class $k$ is given
 by:
-
 $$
-  s_k^2 = \sum_{n \in C_k} (\hat{x}_n - \hat{m}_k)^2
+  \hat{s}_k^2 = \sum_{n \in c_k} (\hat{x}_n - \hat{\mu}_k)^2
 $$

-The total within-classes variance for the whole data set can be simply defined
-as $s^2 = s_1^2 + s_2^2$. The Fisher criterion is therefore defined to be the
-ratio of the between-classes distance to the within-classes variance and is
+The total within-class variance for the whole data set is simply defined as
+$\hat{s}^2 = \hat{s}_1^2 + \hat{s}_2^2$. The Fisher criterion is defined to
+be the ratio of the between-class distance to the within-class variance and is
 given by:
-
 $$
-  J(w) = \frac{(\hat{m}_2 - \hat{m}_1)^2}{s^2}
+  F(w) = \frac{(\hat{\mu}_2 - \hat{\mu}_1)^2}{\hat{s}^2}
 $$

-Differentiating $J(w)$ with respect to $w$, it can be found that it is
-maximized when:
+The dependence on $w$ can be made explicit:
+\begin{align*}
+  (\hat{\mu}_2 - \hat{\mu}_1)^2 &= (w^T \mu_2 -  w^T \mu_1)^2             \\
+                            &= [w^T (\mu_2 - \mu_1)]^2                    \\
+                            &= [w^T (\mu_2 - \mu_1)][w^T (\mu_2 - \mu_1)] \\
+                            &= [w^T (\mu_2 - \mu_1)][(\mu_2 - \mu_1)^T w]
+                             = w^T M w
+\end{align*}

+where $M$ is the between-distance matrix. Similarly, as regards the denominator:
+\begin{align*}
+  \hat{s}^2 &= \hat{s}_1^2 + \hat{s}_2^2 = \\
+            &= \sum_{n \in c_1} (\hat{x}_n - \hat{\mu}_1)^2
+             + \sum_{n \in c_2} (\hat{x}_n - \hat{\mu}_2)^2
+             = w^T \Sigma_w w
+\end{align*}
+
+where $\Sigma_w$ is the total within-class covariance matrix:
+\begin{align*}
+  \Sigma_w &= \sum_{n \in c_1} (x_n − \mu_1)(x_n − \mu_1)^T
+            + \sum_{n \in c_2} (x_n − \mu_2)(x_n − \mu_2)^T \\
+           &= \Sigma_1 + \Sigma_2
+            = \begin{pmatrix}
+              \sigma_x^2  & \sigma_{xy} \\
+              \sigma_{xy} & \sigma_y^2
+              \end{pmatrix}_1 +
+              \begin{pmatrix}
+              \sigma_x^2  & \sigma_{xy} \\
+              \sigma_{xy} & \sigma_y^2
+              \end{pmatrix}_2
+\end{align*}
+
+Where $\Sigma_1$ and $\Sigma_2$ are the covariance matrix of the two samples.
+The Fisher criterion can therefore be rewritten in the form:
 $$
-  w = S_b^{-1} (m_2 - m_1)
+  F(w) = \frac{w^T M w}{w^T \Sigma_w w}
 $$

-where $S_b$ is the covariance matrix, given by:
-
+Differentiating with respect to $w$, it can be found that $F(w)$ is maximized
+when:
 $$
-  S_b = S_1 + S_2
+  w = \Sigma_w^{-1} (\mu_2 - \mu_1)
 $$

-where $S_1$ and $S_2$ are the covariance matrix of the two classes, namely:
-
-$$
-\begin{pmatrix}
-\sigma_x^2  & \sigma_{xy} \\
-\sigma_{xy} & \sigma_y^2
-\end{pmatrix}
-$$
-
-This is not truly a discriminant but rather a specific choice of direction for
-projection of the data down to one dimension: the projected data can then be
+This is not truly a discriminant but rather a specific choice of the direction
+for projection of the data down to one dimension: the projected data can then be
 used to construct a discriminant by choosing a threshold for the
 classification.

 When implemented, the parameters given in @sec:sampling were used to compute
-the covariance matrices $S_1$ and $S_2$ of the two classes and their sum $S$.
-Then $S$, being a symmetrical and positive-definite matrix, was inverted with
-the Cholesky method, already discussed in @sec:MLM.
-Lastly, the matrix-vector product was computed with the `gsl_blas_dgemv()`
-function provided by GSL.
+the covariance matrices and their sum $\Sigma_w$. Then $\Sigma_w$, being a
+symmetrical and positive-definite matrix, was inverted with the Cholesky method,
+already discussed in @sec:MLM. Lastly, the matrix-vector product was computed
+with the `gsl_blas_dgemv()` function provided by GSL.


 ### The threshold

-The cut was fixed by the condition of conditional probability being the same
-for each class:
-
+The threshold $t_{\text{cut}}$ was fixed by the condition of conditional
+probability $P(c_k | t_{\text{cut}})$ being the same for both classes $c_k$:
 $$
  t_{\text{cut}} = x \, | \hspace{20pt}
 \frac{P(c_1 | x)}{P(c_2 | x)} = 
- \frac{p(x | c_1) \, p(c_1)}{p(x | c_1) \, p(c_2)} = 1
+ \frac{P(x | c_1) \, P(c_1)}{P(x | c_1) \, P(c_2)} = 1
 $$

-where $p(x | c_k)$ is the probability for point $x$ along the Fisher projection
-line of belonging to the class $k$. If the classes are bivariate Gaussian, as
-in the present case, then $p(x | c_k)$ is simply given by its projected normal
-distribution $\mathscr{G} (\hat{μ}, \hat{S})$. With a bit of math, the solution
-is then:
-
+where $P(x | c_k)$ is the probability for point $x$ along the Fisher projection
+line of being sampled according to the class $k$. If each class is a bivariate
+Gaussian, as in the present case, then $P(x | c_k)$ is simply given by its
+projected normal distribution with mean $\hat{m} = w^T m$ and variance $\hat{s}
+= w^T S w$, being $S$ the covariance matrix of the class.  
+With a bit of math, the following solution can be found:
 $$
-  t = \frac{b}{a} + \sqrt{\left( \frac{b}{a} \right)^2 - \frac{c}{a}}
+  t_{\text{cut}} = \frac{b}{a}
+                 + \sqrt{\left( \frac{b}{a} \right)^2 - \frac{c}{a}}
 $$

 where:

-  - $a = \hat{S}_1^2 - \hat{S}_2^2$
-  - $b = \hat{m}_2 \, \hat{S}_1^2 - \hat{M}_1 \, \hat{S}_2^2$
-  - $c = \hat{M}_2^2 \, \hat{S}_1^2 - \hat{M}_1^2 \, \hat{S}_2^2
-       - 2 \, \hat{S}_1^2 \, \hat{S}_2^2 \, \ln(\alpha)$
-  - $\alpha = p(c_1) / p(c_2)$
-
-The ratio of the prior probability $\alpha$ was computed as:
+  - $a = \hat{s}_1^2 - \hat{s}_2^2$
+  - $b = \hat{\mu}_2 \, \hat{s}_1^2 - \hat{\mu}_1 \, \hat{s}_2^2$
+  - $c = \hat{\mu}_2^2 \, \hat{s}_1^2 - \hat{\mu}_1^2 \, \hat{s}_2^2
+       - 2 \, \hat{s}_1^2 \, \hat{s}_2^2 \, \ln(\alpha)$
+  - $\alpha = P(c_1) / P(c_2)$

+The ratio of the prior probabilities $\alpha$ is simply given by:
 $$
  \alpha = \frac{N_s}{N_n}
 $$

 The projection of the points was accomplished by the use of the function
-`gsl_blas_ddot()`, which computed a dot product between two vectors, which in
-this case were the weight vector and the position of the point to be projected.
+`gsl_blas_ddot()`, which computes the element wise product between two vectors.
+
+Results obtained for the same samples in @fig:points are shown in
+@fig:fisher_proj. The weight vector and the treshold were found to be:
+$$
+  w = (0.707, 0.707) \et
+  t_{\text{cut}} = 1.323
+$$

 <div id="fig:fisher_proj">
-![View from above of the samples.](images/7-fisher-plane.pdf){height=5.7cm}
-![Gaussian of the samples on the projection
-  line.](images/7-fisher-proj.pdf){height=5.7cm}
+![View of the samples in the plane.](images/7-fisher-plane.pdf)
+![View of the samples projections onto the projection
+  line.](images/7-fisher-proj.pdf)

-Aerial and lateral views of the projection direction, in blue, and the cut, in
-red.
+Aerial and lateral views of the samples. Projection line in blu and cut in red.
 </div>

-Results obtained for the same sample in @fig:points are shown in
-@fig:fisher_proj. The weight vector $w$ was found to be:
-
+Since the vector $w$ turned out to be parallel with the line joining the means
+of the two classes (reminded to be $(0, 0)$ and $(4, 4)$), one can be mislead
+and assume that the inverse of the total covariance matrix $\Sigma_w$ is
+isotropic, namely proportional to the unit matrix.  
+That's not true. In this special sample, the vector joining the means turns out
+to be an eigenvector of the covariance matrix $\Sigma_w^{-1}$. In fact: since
+$\sigma_x = \sigma_y$ for both signal and noise:
 $$
-  w = (0.707, 0.707)
+  \Sigma_1 = \begin{pmatrix}
+             \sigma_x^2  & \sigma_{xy} \\
+             \sigma_{xy} & \sigma_x^2
+             \end{pmatrix}_1
+  \et
+  \Sigma_2 = \begin{pmatrix}
+             \sigma_x^2  & \sigma_{xy} \\
+             \sigma_{xy} & \sigma_x^2
+             \end{pmatrix}_2
 $$

-and $t_{\text{cut}}$ is 1.323 far from the origin of the axes. Hence, as can be
-seen, the vector $w$ turned out to be parallel to the line joining the means of
-the two classes (reminded to be $(0, 0)$ and $(4, 4)$) which means that the 
-total covariance matrix $S$ is isotropic, proportional to the unit matrix.
+$\Sigma_w$ takes the form:
+$$
+  \Sigma_w = \begin{pmatrix}
+             A & B \\
+             B & A
+             \end{pmatrix}
+$$
+
+Which can be easily inverted by Gaussian elimination:
+\begin{align*}
+  \begin{pmatrix}
+  A & B & \vline & 1 & 0 \\
+  B & A & \vline & 0 & 1 \\
+  \end{pmatrix} &\longrightarrow
+  \begin{pmatrix}
+  A - B & 0     & \vline & 1 - B & - B   \\
+  0     & A - B & \vline & - B   & 1 - B \\
+  \end{pmatrix}  \\ &\longrightarrow
+  \begin{pmatrix}
+  1 & 0 & \vline & (1 - B)/(A - B) &     - B/(A - B) \\
+  0 & 1 & \vline &    -  B/(A - B) & (1 - B)/(A - B) \\
+  \end{pmatrix}
+\end{align*}
+
+Hence:
+$$
+  \Sigma_w^{-1} = \begin{pmatrix}
+                   \tilde{A} & \tilde{B} \\
+                   \tilde{B} & \tilde{A}
+                   \end{pmatrix}
+$$
+
+Thus, $\Sigma_w$ and $\Sigma_w^{-1}$ share the same eigenvectors $v_1$ and
+$v_2$:
+$$
+  v_1 = \begin{pmatrix}
+        1 \\
+        -1
+        \end{pmatrix} \et
+  v_2 = \begin{pmatrix}
+        1 \\
+        1
+        \end{pmatrix}
+$$
+
+and the vector joining the means is clearly a multiple of $v_2$, causing $w$ to
+be a multiple of it.


 ## Perceptron

 In machine learning, the perceptron is an algorithm for supervised learning of
-linear binary classifiers.  
+linear binary classifiers.
+
 Supervised learning is the machine learning task of inferring a function $f$
 that maps an input $x$ to an output $f(x)$ based on a set of training
-input-output pairs. Each example is a pair consisting of an input object and an
-output value. The inferred function can be used for mapping new examples. The
-algorithm will be generalized to correctly determine the class labels for unseen
-instances.
-
-The aim is to determine the bias $b$ such that the threshold function $f(x)$:
+input-output pairs, where each pair consists of an input object and an output
+value. The inferred function can be used for mapping new examples: the algorithm
+is generalized to correctly determine the class labels for unseen instances.

+The aim of the perceptron algorithm is to determine the weight vector $w$ and
+bias $b$ such that the so-called 'threshold function' $f(x)$ returns a binary
+value: it is expected to return 1 for signal points and 0 for noise points:
 $$
-  f(x) = x \cdot w + b \hspace{20pt}
-  \begin{cases}
-    \geqslant 0 \incase x \in \text{signal} \\
-    < 0         \incase x \in \text{noise}
-  \end{cases}
+  f(x) = \theta(w^T \cdot x + b)
 $$ {#eq:perc}

-The training was performed as follow. Initial values were set as $w = (0,0)$ and 
-$b = 0$. From  these, the perceptron starts to improve their estimations. The
-sample was passed point by point into a iterative procedure a grand total of
-$N_c$ calls: each time, the projection $w \cdot x$ of the point was computed
-and then the variable $\Delta$ was defined as:
-
+where $\theta$ is the Heaviside theta function.  
+The training was performed using the generated sample as training set. From an
+initial guess for $w$ and $b$ (which were set to be all null in the code), the
+perceptron starts to improve their estimations. The training set is passed point
+by point into a iterative procedure a customizable number $N$ of times: for
+every point, the output of $f(x)$ is computed. Afterwards, the variable
+$\Delta$, which is defined as:
 $$
-  \Delta = r * (e - \theta (f(x))
+  \Delta = r [e - f(x)]
 $$

 where:

-  - $r$ is the learning rate of the perceptron: it is between 0 and 1. The
-    larger $r$, the more volatile the weight changes. In the code, it was set
-    $r = 0.8$;
-  - $e$ is the expected value, namely 0 if $x$ is noise and 1 if it is signal;
-  - $\theta$ is the Heaviside theta function;
-  - $o$ is the observed value of $f(x)$ defined in @eq:perc.
+  - $r \in [0, 1]$ is the learning rate of the perceptron: the larger $r$, the
+    more volatile the weight changes. In the code it was arbitrarily set $r =
+    0.8$;
+  - $e$ is the expected output value, namely 1 if $x$ is signal and 0 if it is
+    noise;

-Then $b$ and $w$ must be updated as:
+is used to update $b$ and $w$:

 $$
  b \to b + \Delta
  \et
-  w \to w + x \Delta
+  w \to w + \Delta x
 $$

-<div id="fig:percep_proj">
-![View from above of the samples.](images/7-percep-plane.pdf){height=5.7cm}
-![Gaussian of the samples on the projection
-  line.](images/7-percep-proj.pdf){height=5.7cm}
+To see how it works, consider the four possible situations:

-Aerial and lateral views of the projection direction, in blue, and the cut, in
-red.
-</div>
+  - $e = 1 \quad \wedge \quad f(x) = 1 \quad \dot \vee \quad e = 0 \quad \wedge
+    \quad f(x) = 0  \quad \Longrightarrow \quad \Delta = 0$  
+    the current estimations work properly: $b$ and $w$ do not need to be updated;
+  - $e = 1 \quad \wedge \quad f(x) = 0 \quad \Longrightarrow \quad
+    \Delta = 1$  
+    the current $b$ and $w$ underestimate the correct output: they must be
+    increased;
+  - $e = 0 \quad \wedge \quad f(x) = 1 \quad \Longrightarrow \quad
+    \Delta = -1$  
+    the current $b$ and $w$ overestimate the correct output: they must be
+    decreased.

-It can be shown that this method converges to the coveted function.  
-As stated in the previous section, the weight vector must finally be normalized.
+Whilst the $b$ updating is obvious, as regarsd $w$ the following consideration
+may help clarify. Consider the case with $e = 0 \quad \wedge \quad f(x) = 1
+\quad \Longrightarrow \quad \Delta = -1$:
+$$
+  w^T \cdot x \to (w^T + \Delta x^T) \cdot x
+              = w^T \cdot x + \Delta |x|^2
+              = w^T \cdot x - |x|^2 \leq w^T \cdot x
+$$

-With $N_c = 5$, the values of $w$ and $t_{\text{cut}}$ level off up to the third
+Similarly for the case with $e = 1$ and $f(x) = 0$.
+
+As far as convergence is concerned, the perceptron will never get to the state
+with all the input points classified correctly if the training set is not
+linearly separable, meaning that the signal cannot be separated from the noise
+by a line in the plane. In this case, no approximate solutions will be gradually
+approached. On the other hand, if the training set is linearly separable, it can
+be shown that this method converges to the coveted function [@novikoff63].  
+As in the previous section, once found, the weight vector is to be normalized.
+
+With $N = 5$ iterations, the values of $w$ and $t_{\text{cut}}$ level off up to the third
 digit. The following results were obtained:

 $$
@ -287,7 +385,16 @@ where, once again, $t_{\text{cut}}$ is computed from the origin of the axes. In
 this case, the projection line does not lies along the mains of the two
 samples. Plots in @fig:percep_proj.

-## Efficiency test
+<div id="fig:percep_proj">
+![View from above of the samples.](images/7-percep-plane.pdf){height=5.7cm}
+![Gaussian of the samples on the projection
+  line.](images/7-percep-proj.pdf){height=5.7cm}
+
+Aerial and lateral views of the projection direction, in blue, and the cut, in
+red.
+</div>
+
+## Efficiency test {#sec:7_results}

 A program was implemented to check the validity of the two
 classification methods.  
--- a/notes/todo
+++ b/notes/todo
@ -1,3 +1,6 @@
 - riscrivere il readme di: 1, 2, 6
 - rileggere il 7
 - completare il 2
+- riscrivere il 2
+
+On the lambert W function, formula 4.19 Corless