diff options
Diffstat (limited to 'general.tex')
| -rw-r--r-- | general.tex | 4 |
1 files changed, 2 insertions, 2 deletions
diff --git a/general.tex b/general.tex index 5932c5f..12f88e8 100644 --- a/general.tex +++ b/general.tex @@ -74,8 +74,8 @@ Selecting experiments that maximize the information gain in the Bayesian setup l where $h\in \mathcal{H}$ for some subset $\mathcal{H}$ of all possible mappings $h:\Omega\to\reals$, called the \emph{hypothesis space}, and $\varepsilon_i$ are random variables in $\reals$, not necessarily identically distributed, that are independent \emph{conditioned on $h$}. This model is quite broad, and captures many learning tasks, such as: \begin{enumerate} \item\textbf{Generalized Linear Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of linear maps $\{h(x) = \T{\beta}x \text{ s.t. } \beta\in \reals^d\}$, and $\varepsilon_i$ are independent zero-mean normal variables, where $\expt{\varepsilon_i^2}=\sigma_i$. -\item\textbf{Logistic Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of maps $\{h(x) = \frac{e^{\T{\beta} x}}{1+e^{\T{\beta} x}} \text{ s.t. } \beta\in\reals^d\}$, and $\varepsilon_i$ are independent conditioned on $h$ such that $$\varepsilon_i=\begin{cases} 1- h(x),& \text{w.~prob.}~h(x)\\-h(x),&\text{w.~prob}1-h(x)\end{cases}$$ -\item\textbf{Learning Binary Functions with Bernoulli Noise.} $\Omega = \{0,1\}^d$, and $\mathcal{H}$ is some subset of $2^{\Omega\times\{0,1\}}$, and $$\varepsilon_i =\begin{cases}0, &\text{w.~prob.}p\\\bar{h}(x)-h(x), \text{w.~prob.}1-p\end{cases}$$ +\item\textbf{Logistic Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of maps $\{h(x) = \frac{e^{\T{\beta} x}}{1+e^{\T{\beta} x}} \text{ s.t. 
} \beta\in\reals^d\}$, and $\varepsilon_i$ are independent conditioned on $h$ such that $$\varepsilon_i=\begin{cases} 1- h(x_i),& \text{w.~prob.}~h(x_i)\\-h(x_i),&\text{w.~prob.}~1-h(x_i)\end{cases}$$ +\item\textbf{Learning Binary Functions with Bernoulli Noise.} $\Omega = \{0,1\}^d$, and $\mathcal{H}$ is some subset of $2^{\Omega\times\{0,1\}}$, and $$\varepsilon_i =\begin{cases}0, &\text{w.~prob.}~p\\\bar{h}(x_i)-h(x_i), &\text{w.~prob.}~1-p\end{cases}$$ \end{enumerate} In this setup, assume that the experimenter has a prior distribution on the hypothesis $h\in \mathcal{H}$. Then, the information gain objective can be written again as the mutual information between $h$ and $Y_S$. |
