 definitions.tex |  3
 general.tex     | 53
 notes.bib       | 20
 3 files changed, 58 insertions(+), 18 deletions(-)
diff --git a/definitions.tex b/definitions.tex
index 526f544..de551ca 100644
--- a/definitions.tex
+++ b/definitions.tex
@@ -5,6 +5,7 @@
\newtheorem{example}{Example}
\newtheorem{prop}{Proposition}
\newtheorem{theorem}{Theorem}
+\newtheorem{corollary}{Corollary}
%\newcommand*{\defeq}{\stackrel{\text{def}}{=}}
\newcommand*{\defeq}{\equiv}
\newcommand{\var}{\mathop{\mathrm{Var}}}
@@ -26,4 +27,4 @@
\newcommand{\E}{{\tt E}}
\newcommand{\id}{\mathbbm{1}}
\newcommand{\junk}[1]{}
-\newcommand{\edp}{{\tt EDP}}
\ No newline at end of file
+\newcommand{\edp}{{\tt EDP}}
diff --git a/general.tex b/general.tex
index 3815c83..49adab2 100644
--- a/general.tex
+++ b/general.tex
@@ -5,11 +5,13 @@ has a natural interpretation in this context, further motivating its selection
as our objective. Moreover, we extend Theorem~\ref{thm:main} to a more general
Bayesian setting.
-In the Bayesian setting, it is assumed that the experimenter has a prior distribution on $\beta$: in particular, $\beta$ has a multivariate normal prior with zero mean and covariance $\sigma^2R\in \reals^{d^2}$ (where $\sigma^2$ is the noise variance).
+In the Bayesian setting, it is assumed that the experimenter has a prior
+distribution on $\beta$: in particular, $\beta$ has a multivariate normal prior
+with zero mean and covariance $\sigma^2R^{-1}\in \reals^{d\times d}$ (where $\sigma^2$ is the noise variance).
The experimenter estimates $\beta$ through \emph{maximum a posteriori estimation}: \emph{i.e.}, finding the parameter which maximizes the posterior distribution of $\beta$ given the observations $y_S$. Under the linearity assumption \eqref{model} and the Gaussian prior on $\beta$, maximum a posteriori estimation leads to the following minimization \cite{hastie}:
\begin{displaymath}
\hat{\beta} = \argmin_{\beta\in\reals^d} \sum_i (y_i - \T{\beta}x_i)^2
- + \sum_i \norm{R\beta}_2^2
+ + \T{\beta}R\beta
\end{displaymath}
This optimization, commonly known as \emph{ridge regression}, includes an additional penalty term compared to the least squares estimation \eqref{leastsquares}.
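For concreteness, a standard computation (not specific to our mechanism) gives the minimizer in closed form: writing $X$ for the matrix whose rows are the $\T{x_i}$ and $y$ for the vector of observations, setting the gradient of the objective to zero yields
\begin{displaymath}
\hat{\beta} = (\T{X}X + R)^{-1}\T{X}y.
\end{displaymath}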
@@ -47,7 +49,7 @@ Theorem~\ref{thm:main}:
\begin{theorem}
There exists a truthful, individually rational and budget feasible mechanism for the objective
function $\tilde{V}$ given by \eqref{eq:normalized}. Furthermore, for any $\varepsilon
- > 0$, in time $O(\text{poly}(|\mathcal{N}|, d, \log\log \varepsilon^{-1}))$,
+ > 0$, in time $O(\text{poly}(n, d, \log\log \varepsilon^{-1}))$,
the algorithm computes a set $S^*$ such that:
\begin{displaymath}
OPT \leq
@@ -57,11 +59,18 @@ where $\mu$ is the smallest eigenvalue of $R$.
\end{theorem}
\subsection{Beyond Linear Models}
-Selecting experiments that maximize the information gain in the Bayesian setup leads to a natural generalization to other learning examples beyond linear regression. In particular, in the more general PAC learning setup \cite{...}, the features $x_i$, $i\in \mathcal{N}$ take values in some general set $\Omega$, called the \emph{query space}, and measurements $y_i\in\reals$ are given by
+Selecting experiments that maximize the information gain in the Bayesian setup
+leads to a natural generalization to other learning examples beyond linear
+regression. In particular, in the more general PAC learning setup \cite{valiant}, the features $x_i$, $i\in \mathcal{N}$ take values in some general set $\Omega$, called the \emph{query space}, and measurements $y_i\in\reals$ are given by
\begin{equation}\label{eq:hypothesis-model}
y_i = h(x_i) + \varepsilon_i
\end{equation}
-where $h\in \mathcal{H}$ for some subset $\mathcal{H}$ of all possible mappings $h:\Omega\to\reals$, called the \emph{hypothesis space}, and $\varepsilon_i$ are random variables in $\reals$, not necessarily identically distributed, that are independent \emph{conditioned on $h$}. This model is quite broad, and captures many learning tasks, such as:
+where $h\in \mathcal{H}$ for some subset $\mathcal{H}$ of all possible mappings
+$h:\Omega\to\reals$, called the \emph{hypothesis space}, and $\varepsilon_i$
+are random variables in $\reals$, not necessarily identically distributed, that
+are independent \emph{conditioned on $h$}. This model is quite broad, and
+captures many learning tasks, including support vector machines (SVM) and
+linear discriminant analysis (LDA), as well as the following:
\begin{enumerate}
\item\textbf{Generalized Linear Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of linear maps $\{h(x) = \T{\beta}x \text{ s.t. } \beta\in \reals^d\}$, and $\varepsilon_i$ are independent zero-mean normal variables, where $\expt{\varepsilon_i^2}=\sigma_i^2$.
\item\textbf{Logistic Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of maps $\{h(x) = \frac{e^{\T{\beta} x}}{1+e^{\T{\beta} x}} \text{ s.t. } \beta\in\reals^d\}$, and $\varepsilon_i$ are independent conditioned on $h$ such that $$\varepsilon_i=\begin{cases} 1- h(x_i),& \text{w.~prob.}\;h(x_i)\\-h(x_i),&\text{w.~prob.}\;1-h(x_i)\end{cases}$$
@@ -77,27 +86,37 @@ This is a monotone set function, and it clearly satisfies $V(\emptyset)=0$. Thou
The value function given by the information gain \eqref{general} is submodular.
\end{lemma}
\begin{proof}
-The lemma is proved in a slightly different context in \cite{krause2005near}; we
-repeat the proof here for the sake of completeness. Using the chain rule for
-the conditional entropy we get:
+Using the chain rule for the conditional entropy we get:
\begin{equation}\label{eq:chain-rule}
V(S) = H(y_S) - H(y_S \mid \beta)
= H(y_S) - \sum_{i\in S} H(y_i \mid \beta)
\end{equation}
where the second equality comes from the independence of the $y_i$'s
conditioned on $\beta$. Recall that the joint entropy of a set of random
-variables is a submodular function. Thus, our value function is written in
-\eqref{eq:chain-rule} as the sum of a submodular function and a modular function.
+variables is a submodular function. Thus, the value function is written in
+\eqref{eq:chain-rule} as the sum of a submodular function and a modular function.
\end{proof}
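As a sanity check (a standard Gaussian computation, under the additional assumption that the $\varepsilon_i\sim N(0,\sigma^2)$ are i.i.d.), in the linear-Gaussian setting above with prior covariance $\sigma^2R^{-1}$, the vector $y_S$ is Gaussian with covariance $\sigma^2(X_S R^{-1}\T{X_S} + I)$, where the rows of $X_S$ are the $\T{x_i}$, $i\in S$; hence
\begin{displaymath}
V(S) = H(y_S) - \sum_{i\in S} H(y_i \mid \beta)
= \frac{1}{2}\log\det\big(I + X_S R^{-1}\T{X_S}\big)
= \frac{1}{2}\log\det\big(I + R^{-1}\T{X_S}X_S\big),
\end{displaymath}
where the last equality is Sylvester's determinant identity.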
-This lemma that implies that learning an \emph{arbitrary hypothesis, under an
+This lemma implies that learning an \emph{arbitrary hypothesis, under an
arbitrary prior} when noise is conditionally independent leads to a submodular
-value function. Hence the universally truthful 7.91-approximate mechanism by
-Chen \emph{et al.} \cite{chen} immediately applies in the value query model.
-Moreover, in cases where maximizing \eqref{general} can be done in polynomial
-time in the full-information setup, the truthful $8.34$-appoximate mechanism of
-Chen \emph{et al.}~applies again. However that, in many scenarios covered by
-this model (inlcuding the last two examples above), even computing the entropy
+value function. Hence, we can apply the results of Chen \emph{et al.}~\cite{chen}
+to obtain the following corollary:
+\begin{corollary}
+ For Bayesian experimental design with the objective given by the
+ information gain \eqref{general}, there exists a randomized, budget
+ feasible, individually rational, and universally truthful mechanism. This
+  mechanism runs in time $O\big(\text{poly}(n,d)\big)$ and achieves an
+  approximation ratio of $7.91$.
+
+ In cases where maximizing \eqref{general} can be done in polynomial time in
+ the full-information setup, there exists a randomized, budget feasible,
+ individually rational, and truthful mechanism for Bayesian experimental
+  design. This mechanism runs in time $O\big(\text{poly}(n,d)\big)$ and
+  achieves an approximation ratio of $8.34$.
+\end{corollary}
+
+Note however that, in many scenarios covered by
+this model (including the last two examples above), even computing the entropy
of $y_S$ for a given set $S$ might be a hard task---\emph{i.e.}, the value query model may
not apply. Hence, identifying learning tasks in the above class for which
truthful or universally truthful constant approximation mechanisms exist, or
diff --git a/notes.bib b/notes.bib
index 4b7f14c..78e7656 100644
--- a/notes.bib
+++ b/notes.bib
@@ -618,4 +618,24 @@ number = "2011/005",
year = "2011"
}
+@inproceedings{valiant,
+ author = {Leslie G. Valiant},
+ title = {A Theory of the Learnable},
+ booktitle = {STOC},
+ year = {1984},
+  pages = {436--445},
+ ee = {http://doi.acm.org/10.1145/800057.808710},
+ crossref = {DBLP:conf/stoc/STOC16},
+ bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+@proceedings{DBLP:conf/stoc/STOC16,
+ editor = {Richard A. DeMillo},
+ title = {Proceedings of the 16th Annual ACM Symposium on Theory of
+ Computing, April 30 - May 2, 1984, Washington, DC, USA},
+ booktitle = {STOC},
+ publisher = {ACM},
+ year = {1984},
+ bibsource = {DBLP, http://dblp.uni-trier.de}
+}