From e792b4c421b1a6d0b183562a743f62c3bb333535 Mon Sep 17 00:00:00 2001
From: unknown
Date: Fri, 2 Mar 2012 01:42:21 -0800
Subject: Algorithms

---
 algorithm.tex  | 24 +++++++++++++++++-------
 references.bib | 18 +++++++++++++++++-
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/algorithm.tex b/algorithm.tex
index c9130cc..df2fdf9 100644
--- a/algorithm.tex
+++ b/algorithm.tex
@@ -7,23 +7,33 @@ Provide a guideline for the rest of the section.
 
 \subsection{Mixture of Gaussians}
 \label{sec:mixture of Gaussians}
 
-The nearest-neighbor (NN) classifier worked well on the dead body dataset. It is a well-know fact that the decision boundary of the NN classifier is piecewise linear [cite]. A mixture of Gaussians [cite] is a generative model that can be used for NN classification. In our domain, each person is represented by a single Gaussian, which is centered in the person's mean profile. All Gaussians have the same covariance matrix, which encodes the variance and covariance of attributes around the mean profiles. In particular, our probabilistic model is given by:
+The nearest-neighbor (NN) classifier worked well on the dead body dataset. It is a well-known fact that the decision boundary of the NN classifier is piecewise linear \cite{bishop06pattern}. A mixture of Gaussians \cite{bishop06pattern} is a generative probabilistic model that can be used for NN classification. In our domain, each person is represented by a single Gaussian, which is centered at the person's mean profile. All Gaussians share the same covariance matrix, which encodes the variance and covariance of attributes around the mean profiles. In particular, our probabilistic model is given by:
 \begin{align}
-  P(\bx, y) = \cN(\bx | \bu_y, \Sigma) P(y),
+  P(\bx, y) = \cN(\bx | \bar{\bx}_y, \Sigma) P(y),
   \label{eq:mixture of Gaussians}
 \end{align}
-where $P(y)$ is the probability that the person y appears in front of the camera and $\cN(\bx | \bu_y, \Sigma)$ is the probability that the profile $\bx$ belongs to the person $y$. The probability is modeled as a normal distribution, which is centered in the mean profile of the person $y$ and has a covariance matrix $\Sigma$. Since all Gaussians in the mixture have the same covariance matrix, all decision boundaries, $P(\bx, y_1) = P(\bx, y_2)$ for some $y_1$ and $y_2$, are linear.
+where $P(y)$ is the probability that the person $y$ appears in front of the camera and $\cN(\bx | \bar{\bx}_y, \Sigma)$ is the probability that the profile $\bx$ belongs to the person $y$. The probability $\cN(\bx | \bar{\bx}_y, \Sigma)$ is a normal distribution, which is centered at the mean profile of the person $y$ and has a covariance matrix $\Sigma$. Since all Gaussians in the mixture share the same covariance matrix, all decision boundaries, $P(\bx, y_1) = P(\bx, y_2)$ for any $y_1$ and $y_2$, are linear.
 
-The parameters of our model can be easily learned using maximum-likelihood (ML) estimation [cite]. The probability $P(y)$ is the fraction of time when the person $y$ appears in training set. The mean profile $\bu_y$ is estimated as the mean of all profiles corresponding to the person $y$. The covariance matrix $\Sigma$ is estimated as $\Sigma = \sum_y P(y) \Sigma_y$, where $\Sigma_y$ is the covariance matrix of profiles for the person $y$.
+The parameters of our model are learned using maximum-likelihood (ML) estimation \cite{bishop06pattern}. The probability $P(y)$ is the fraction of time that the person $y$ appears in the training set. The mean profile $\bar{\bx}_y$ is estimated as the mean of all profiles corresponding to the person $y$. The covariance matrix $\Sigma$ is estimated as the weighted sum $\Sigma = \sum_y P(y) \Sigma_y$, where $\Sigma_y$ is the covariance matrix of the profiles of the person $y$.
 
-The inference in our model can be done efficiently. In particular, note that:
+Inference in our model can be performed in closed form. In particular, the predicted label is given by $\hat{y} = \arg\max_y P(y | \bx)$, where:
 \begin{align}
   P(y | \bx) = \frac{P(\bx | y) P(y)}{\sum_y P(\bx | y) P(y)} =
-  \frac{\cN(\bx | \bu_y, \Sigma) P(y)}{\sum_y \cN(\bx | \bu_y, \Sigma) P(y)}.
+  \frac{\cN(\bx | \bar{\bx}_y, \Sigma) P(y)}{\sum_y \cN(\bx | \bar{\bx}_y, \Sigma) P(y)}.
   \label{eq:inference}
 \end{align}
+The prediction $\hat{y}$ is accepted only when the classifier is sufficiently confident, that is, when $P(\hat{y} | \bx) \! > \! h$, where $h \in (0, 1)$ is a threshold that controls the precision and recall of the classifier. In practice, the higher the value of $h$, the lower the recall and the higher the precision.
 
 \subsection{Sequential hypothesis testing}
-\label{sec:sequential hypothesis testing}
+\label{sec:SHT}
+
+The mixture of Gaussians model can be extended to temporal inference through sequential hypothesis testing. Sequential hypothesis testing \cite{wald47sequential} is an established statistical framework in which a subject is tested sequentially for membership in one of many classes. The probability that a sequence of observations $\bx^{(1)}, \dots, \bx^{(t)}$ belongs to the class $y$ at time $t$ is given by:
+\begin{align}
+  P(y | \bx^{(1)}, \dots, \bx^{(t)}) =
+  \frac{\prod_{i = 1}^t \cN(\bx^{(i)} | \bar{\bx}_y, \Sigma) P(y)}
+  {\sum_y \prod_{i = 1}^t \cN(\bx^{(i)} | \bar{\bx}_y, \Sigma) P(y)}.
+  \label{eq:SHT}
+\end{align}
+The prediction $\hat{y} = \arg\max_y P(y | \bx^{(1)}, \dots, \bx^{(t)})$ is accepted when $P(\hat{y} | \bx^{(1)}, \dots, \bx^{(t)}) > h$, where the threshold $h \in (0, 1)$ again controls the precision and recall of the predictor.
 
diff --git a/references.bib b/references.bib
index 0f41363..07472fe 100644
--- a/references.bib
+++ b/references.bib
@@ -1,4 +1,4 @@
- 
+
 #@string{PROC = "Proc. "}
 
 @string{PROC = "Proceedings of the "}
@@ -390,3 +390,19 @@ pages = "331--342"
   year={2011},
   organization={IEEE}
 }
+
+@book{ bishop06pattern,
+  author    = "Christopher Bishop",
+  title     = "Pattern Recognition and Machine Learning",
+  publisher = "Springer",
+  address   = "New York, NY",
+  year      = "2006"
+}
+
+@book{ wald47sequential,
+  author    = "Abraham Wald",
+  title     = "Sequential Analysis",
+  publisher = "John Wiley and Sons",
+  address   = "New York, NY",
+  year      = "1947"
+}
-- 
cgit v1.2.3-70-g09d2
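
For concreteness, the classifier added in algorithm.tex can be realized in a few lines of NumPy. The sketch below implements the ML estimation (priors, mean profiles, shared covariance) and the thresholded closed-form inference of eq:inference; the data layout (profiles as rows of an array X, integer identities in y) and the names fit, predict, and h are illustrative assumptions, not part of the paper's code.

import numpy as np

def fit(X, y):
    """ML estimation: priors P(y), mean profiles, and a shared covariance."""
    labels = np.unique(y)
    priors = np.array([np.mean(y == k) for k in labels])        # P(y)
    means = np.array([X[y == k].mean(axis=0) for k in labels])  # mean profiles
    cov = np.zeros((X.shape[1], X.shape[1]))
    for p, k, m in zip(priors, labels, means):
        D = X[y == k] - m
        cov += p * (D.T @ D) / len(D)   # Sigma = sum_y P(y) Sigma_y
    return labels, priors, means, cov

def predict(x, labels, priors, means, cov, h=0.9):
    """Closed-form posterior P(y | x); accept only if it exceeds h."""
    inv = np.linalg.inv(cov)
    D = x - means                       # one row per person
    # The Gaussian normalizing constant is shared by all classes and
    # cancels in the posterior, so only the quadratic form is needed.
    log_post = np.log(priors) - 0.5 * np.einsum('ij,jk,ik->i', D, inv, D)
    post = np.exp(log_post - log_post.max())
    post /= post.sum()                  # normalize over identities
    best = int(np.argmax(post))
    return labels[best] if post[best] > h else None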
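
Under the same assumptions, the sequential test of eq:SHT only accumulates per-frame log-likelihoods on top of the log-prior until some identity clears the threshold; a sketch (frames is assumed to be an iterable of profiles x^(1), ..., x^(t)):

def predict_sequence(frames, labels, priors, means, cov, h=0.9):
    """Sequential hypothesis test: fold in one profile per frame and stop
    as soon as the posterior of some identity exceeds h."""
    inv = np.linalg.inv(cov)
    log_post = np.log(priors)           # start from the prior P(y)
    for x in frames:
        D = x - means
        log_post -= 0.5 * np.einsum('ij,jk,ik->i', D, inv, D)
        post = np.exp(log_post - log_post.max())
        post /= post.sum()
        best = int(np.argmax(post))
        if post[best] > h:              # accept as soon as confident
            return labels[best]
    return None                         # never confident enough

In both variants, raising h trades recall for precision, matching the role of the threshold described in the text.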