diff options
Diffstat (limited to 'general.tex')
| -rw-r--r-- | general.tex | 15 |
1 files changed, 2 insertions, 13 deletions
diff --git a/general.tex b/general.tex index 11c471f..4f9aaad 100644 --- a/general.tex +++ b/general.tex @@ -64,19 +64,8 @@ Selecting experiments that maximize the information gain in the Bayesian setup l where $h\in \mathcal{H}$ for some subset $\mathcal{H}$ of all possible mappings $h:\Omega\to\reals$, called the \emph{hypothesis space}, and $\varepsilon_i$ are random variables in $\reals$, not necessarily identically distributed, that are independent \emph{conditioned on $h$}. This model is quite broad, and captures many learning tasks, such as: \begin{enumerate} \item\textbf{Generalized Linear Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of linear maps $\{h(x) = \T{\beta}x \text{ s.t. } \beta\in \reals^d\}$, and $\varepsilon_i$ are independent zero-mean normal variables, where $\expt{\varepsilon_i^2}=\sigma_i$. -\item\textbf{Logistic Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of maps $\{h(x) = \frac{e^{\T{\beta} x}}{1+e^{\T{\beta} x}} \text{ s.t. } \beta\in\reals^d\}$, and $\varepsilon_i$ are independent conditioned on $h$ such that -\begin{displaymath} -\varepsilon_i=\begin{cases} -1- h(x),& \text{w.~prob.}\;h(x)\\ --h(x),& \text{w.~prob.}\;1-h(x) -\end{cases}\end{displaymath} - -\item\textbf{Learning Binary Functions with Bernoulli Noise.} $\Omega = \{0,1\}^d$, and $\mathcal{H}$ is some subset of $2^{\Omega}$, and -\begin{displaymath} -\varepsilon_i =\begin{cases} -0, &\text{w.~prob.}\;p\\ -\bar{h}(x)-h(x), &\text{w.~prob.}\;1-p -\end{cases}\end{displaymath} +\item\textbf{Logistic Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of maps $\{h(x) = \frac{e^{\T{\beta} x}}{1+e^{\T{\beta} x}} \text{ s.t. } \beta\in\reals^d\}$, and $\varepsilon_i$ are independent conditioned on $h$ such that $$\varepsilon_i=\begin{cases} 1- h(x_i),& \text{w.~prob.}~h(x_i)\\-h(x_i),&\text{w.~prob.}~1-h(x_i)\end{cases}$$ +\item\textbf{Learning Binary Functions with Bernoulli Noise.} $\Omega = \{0,1\}^d$, and $\mathcal{H}$ is some subset of $2^{\Omega\times\{0,1\}}$, and $$\varepsilon_i =\begin{cases}0, &\text{w.~prob.}~p\\\bar{h}(x_i)-h(x_i), \text{w.~prob.}~1-p\end{cases}$$ \end{enumerate} In this setup, assume that the experimenter has a prior distribution on the hypothesis $h\in \mathcal{H}$. Then, the information gain objective can be written again as the mutual information between $\beta$ and $y_S$. |
