summaryrefslogtreecommitdiffstats
path: root/general.tex
diff options
context:
space:
mode:
Diffstat (limited to 'general.tex')
-rw-r--r--general.tex15
1 files changed, 2 insertions, 13 deletions
diff --git a/general.tex b/general.tex
index 11c471f..4f9aaad 100644
--- a/general.tex
+++ b/general.tex
@@ -64,19 +64,8 @@ Selecting experiments that maximize the information gain in the Bayesian setup l
where $h\in \mathcal{H}$ for some subset $\mathcal{H}$ of all possible mappings $h:\Omega\to\reals$, called the \emph{hypothesis space}, and $\varepsilon_i$ are random variables in $\reals$, not necessarily identically distributed, that are independent \emph{conditioned on $h$}. This model is quite broad, and captures many learning tasks, such as:
\begin{enumerate}
\item\textbf{Generalized Linear Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of linear maps $\{h(x) = \T{\beta}x \text{ s.t. } \beta\in \reals^d\}$, and $\varepsilon_i$ are independent zero-mean normal variables, where $\expt{\varepsilon_i^2}=\sigma_i$.
-\item\textbf{Logistic Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of maps $\{h(x) = \frac{e^{\T{\beta} x}}{1+e^{\T{\beta} x}} \text{ s.t. } \beta\in\reals^d\}$, and $\varepsilon_i$ are independent conditioned on $h$ such that
-\begin{displaymath}
-\varepsilon_i=\begin{cases}
-1- h(x),& \text{w.~prob.}\;h(x)\\
--h(x),& \text{w.~prob.}\;1-h(x)
-\end{cases}\end{displaymath}
-
-\item\textbf{Learning Binary Functions with Bernoulli Noise.} $\Omega = \{0,1\}^d$, and $\mathcal{H}$ is some subset of $2^{\Omega}$, and
-\begin{displaymath}
-\varepsilon_i =\begin{cases}
-0, &\text{w.~prob.}\;p\\
-\bar{h}(x)-h(x), &\text{w.~prob.}\;1-p
-\end{cases}\end{displaymath}
+\item\textbf{Logistic Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of maps $\{h(x) = \frac{e^{\T{\beta} x}}{1+e^{\T{\beta} x}} \text{ s.t. } \beta\in\reals^d\}$, and $\varepsilon_i$ are independent conditioned on $h$ such that $$\varepsilon_i=\begin{cases} 1- h(x_i),& \text{w.~prob.}~h(x_i)\\-h(x_i),&\text{w.~prob.}~1-h(x_i)\end{cases}$$
+\item\textbf{Learning Binary Functions with Bernoulli Noise.} $\Omega = \{0,1\}^d$, and $\mathcal{H}$ is some subset of $2^{\Omega\times\{0,1\}}$, and $$\varepsilon_i =\begin{cases}0, &\text{w.~prob.}~p\\\bar{h}(x_i)-h(x_i), \text{w.~prob.}~1-p\end{cases}$$
\end{enumerate}
In this setup, assume that the experimenter has a prior distribution on the hypothesis $h\in \mathcal{H}$. Then, the information gain objective can be written again as the mutual information between $\beta$ and $y_S$.