-rw-r--r--  general.tex  55
-rw-r--r--  problem.tex   4
2 files changed, 28 insertions, 31 deletions
diff --git a/general.tex b/general.tex
index 5f80c58..87475a4 100644
--- a/general.tex
+++ b/general.tex
@@ -11,8 +11,8 @@ matrix $R$, the value function takes the general form given by \eqref{dcrit}.
 Applying the mechanism described in Algorithm~\ref{mechanism} and adapting the
-analysis of the approximation ratio, we get the following result which extends
-Theorem~\ref{thm:main}:
+analysis of the approximation ratio \emph{mutatis mutandis}, we get the following result which extends
+Theorem~\ref{thm:main}.
 \begin{theorem}
 There exists a truthful, individually rational and budget feasible mechanism for the objective
@@ -30,8 +30,8 @@ where $\mu$ is the smallest eigenvalue of $R$.
 In the non-bayesian setting, \emph{i.e.} when the experimenter has no prior
 distribution on the model, the covariance matrix $R$ is the zero matrix. In this case,
-the ridge regression estimation proceedure \eqref{ridge} reduces to simple least squares (\emph{i.e.}, linear regression),
-and the $D$-optimality reduces to the entropy of $\hat{\beta}$, given by:
+the ridge regression estimation procedure \eqref{ridge} reduces to simple least squares (\emph{i.e.}, linear regression),
+and the $D$-optimality criterion reduces to the entropy of $\hat{\beta}$, given by:
 \begin{equation}\label{eq:d-optimal}
 V(S) = \log\det(X_S^TX_S)
 \end{equation}
@@ -58,37 +58,36 @@ value function $V(S) = \det{\T{X_S}X_S}$.
 \subsection{Beyond Linear Models}
 Selecting experiments that maximize the information gain in the Bayesian setup
 leads to a natural generalization to other learning examples beyond linear
-regression. In particular, in the more general PAC learning setup \cite{valiant}, the features $x_i$, $i\in \mathcal{N}$ take values in some general set $\Omega$, called the \emph{query space}. Measurements $y_i\in\reals$ are given by
+regression. In particular, consider the following variant of the standard PAC learning setup \cite{valiant}: assume that the features $x_i$, $i\in \mathcal{N}$ take values in some generic set $\Omega$, called the \emph{query space}.
+Measurements $y_i\in\reals$ are given by
 \begin{equation}\label{eq:hypothesis-model}
 y_i = h(x_i) + \varepsilon_i
 \end{equation}
 where $h\in \mathcal{H}$ for some subset $\mathcal{H}$ of all possible mappings
-$h:\Omega\to\reals$, called the \emph{hypothesis space}, and $\varepsilon_i$
+$h:\Omega\to\reals$, called the \emph{hypothesis space}. As before, we assume that the experimenter has a prior distribution on the
+hypothesis $h\in \mathcal{H}$; we also assume that $\varepsilon_i$
 are random variables in $\reals$, not necessarily identically distributed, that
-are independent \emph{conditioned on $h$}. This model is quite broad, and
-captures many learning tasks, such as support vector machines (SVM) and linear
-discriminant analysis (LDA); we give a few concrete examples below:
+are independent \emph{conditioned on $h$}. As before, the features $x_i$ are public, and the goal of the experimenter is to (a) retrieve measurements $y_i$ and (b) estimate $h$ as accurately as possible.
+
+
+This model is quite broad, and
+captures many classic machine learning tasks; we give a few concrete examples below:
 \begin{enumerate}
-\item\textbf{Generalized Linear Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of linear maps $\{h(x) = \T{\beta}x \text{ s.t. } \beta\in \reals^d\}$, and $\varepsilon_i$ are independent zero-mean normal variables, where $\expt{\varepsilon_i^2}=\sigma_i$.
-\item\textbf{Logistic Regression.} $\Omega=\reals^d$, $\mathcal{H}$ is the set of maps $\{h(x) = \frac{e^{\T{\beta} x}}{1+e^{\T{\beta} x}} \text{ s.t. } \beta\in\reals^d\}$, and $\varepsilon_i$ are independent conditioned on $h$ such that $$\varepsilon_i=\begin{cases} 1- h(x_i),& \text{w.~prob.}\;h(x_i)\\-h(x_i),&\text{w.~prob.}\;1-h(x_i)\end{cases}$$
-\item\textbf{Learning Binary Functions with Bernoulli Noise.} $\Omega = \{0,1\}^d$, and $\mathcal{H}$ is some subset of $2^{\Omega}$, and $$\varepsilon_i =\begin{cases}0, &\text{w.~prob.}\;p\\\bar{h}(x_i)-h(x_i), &\text{w.~prob.}\;1-p\end{cases}$$
+\item\textbf{Generalized Linear Regression.} In this case, $\Omega=\reals^d$, $\mathcal{H}$ is the set of linear maps $\{h(x) = \T{\beta}x \text{ s.t. } \beta\in \reals^d\}$, and $\varepsilon_i$ are independent zero-mean variables (not necessarily identically distributed).
+\item\textbf{Learning Binary Functions with Bernoulli Noise.} When learning a binary function under noise, the experimenter wishes to determine a binary function $h$ by testing its output on differrent inputs; however, the output may be corrupted with probability $p$. Formally, $\Omega = \{0,1\}^d$, $\mathcal{H}$ is some subset of binary functions $h:\Omega\to\{0,1\}$, and $$\varepsilon_i =\begin{cases}0, &\text{w.~prob.}\;1-p\\\bar{h}(x_i)-h(x_i), &\text{w.~prob.}\;p\end{cases}$$
+\item\textbf{Logistic Regression.} Logistic regression aims to learn a hyperplane separating $+1$--labeled values from $-1$--labeled values; again, values can be corrupted, and the probability that a label is flipped drops with the distance from the hyperplane. Formally, $\Omega=\reals^d$, $\mathcal{H}$ is the set of maps $\{h(x) = \mathop{\mathrm{sign}}(\beta^T x) \text{ for some } \beta\in\reals^d\}$, and $\varepsilon_i$ are independent conditioned on $\beta$ such that $$\varepsilon_i=\begin{cases} -2\cdot\id_{\beta^Tx>0}, & \text{w.~prob.} \frac{1}{1+e^{\beta^Tx}}\;\\ +2\cdot\id_{\beta^Tx<0},&\text{w.~prob.}\frac{e^{\beta^Tx}}{1+e^{\beta^Tx}}\;\end{cases}$$
 \end{enumerate}
-In this setup, assuming that the experimenter has a prior distribution on the
-hypothesis $h\in \mathcal{H}$, the information gain objective can be written
-again as the mutual information between $h$ and $y_S$.
+We can again define the information gain as an objective to maximize:
 \begin{align}\label{general}
 V(S) = \entropy(h) -\entropy(h\mid y_S),\quad S\subseteq\mathcal{N}
 \end{align}
 This is a monotone set function, and it clearly satisfies $V(\emptyset)=0$.
-Though, in general, mutual information is not a submodular function, this
-specific setup leads to a submodular formulation.
-
-\begin{lemma}
+In general, the information gain is not a submodular function. However, when the errors $\epsilon_i$ are independent conditioned on $h$, the following lemma holds: \begin{lemma}
 The value function given by the information gain \eqref{general} is submodular.
 \end{lemma}
 \begin{proof}
-Using the chain rule for the conditional entropy we get:
+A more general statement for graphical models is shown in~\cite{krause2005near}; in short, using the chain rule for the conditional entropy we get:
 \begin{equation}\label{eq:chain-rule}
 V(S) = H(y_S) - H(y_S \mid h) = H(y_S) - \sum_{i\in S} H(y_i \mid h)
@@ -101,20 +100,18 @@ variables is a submodular function. Thus, the value function is written in
 This lemma implies that learning an \emph{arbitrary hypothesis, under an
 arbitrary prior} when noise is conditionally independent leads to a submodular
-value function. Hence, we can apply the previously known results to get
+value function. Hence, we can apply the previously known results by \citeN{singer-mechanisms} and \citeN{chen} to get
 the following corollary:
 \begin{corollary}
-For Bayesian experimental design with the objective given by the
-information gain \eqref{general}, there exists a randomized, budget
-feasible, individually rational, and universally truthful mechanism. This
-mechanism has a complexity $O\big(\text{poly}(n,d)\big)$ and an
-approximation ratio of $7.91$.
+For Bayesian experimental design with an objective given by the
+information gain \eqref{general}, there exists a randomized, polynomial-time, budget
+feasible, individually rational, and universally truthful mechanism with
+a $7.91$ approximation ratio, in expectation.
 In cases where maximizing \eqref{general} can be done in polynomial time in
-the full-information setup, there exists a randomized, budget feasible,
+the full-information setup, there exists a deterministic, polynomial-time, budget feasible,
 individually rational, and truthful mechanism for Bayesian experimental
-design. This mechanism has a complexity $O\big(\text{poly}(n,d)\big)$ and
-an approximation ratio of $8.34$.
+design with an $8.34$ approximation ratio.
 \end{corollary}
 Note however that, in many scenarios covered by this model (including the last
diff --git a/problem.tex b/problem.tex
index 536481a..d505280 100644
--- a/problem.tex
+++ b/problem.tex
@@ -77,8 +77,8 @@ prior covariance is the identity matrix, \emph{i.e.}, $R=I_d\in \reals^{d\times
 d}.$ Intuitively, this corresponds to the simplest prior, in which no
 direction of $\reals^d$ is a priori favored; equivalently, it also corresponds
 to the case where ridge regression estimation \eqref{ridge} performed by $\E$ has
-a penalty term $\norm{\beta}_2^2$. A generalization of our results to general
-matrices $R$ can be found in Section~\ref{sec:ext}.
+a penalty term $\norm{\beta}_2^2$. A generalization of our results to arbitrary
+covariance matrices $R$ can be found in Section~\ref{sec:ext}.
 %Note that \eqref{dcrit} is a submodular set function, \emph{i.e.},
 %$V(S)+V(T)\geq V(S\cup T)+V(S\cap T)$ for all $S,T\subseteq \mathcal{N}$; it is also monotone, \emph{i.e.}, $V(S)\leq V(T)$ for all $S\subset T$.
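The central technical claim in the general.tex hunks above is that the information gain \eqref{general} is monotone, vanishes on the empty set, and is submodular, which is what lets the corollary invoke the mechanisms of \citeN{singer-mechanisms} and \citeN{chen}. The minimal Python sketch below illustrates this numerically; it assumes the Bayesian linear-regression instance with identity prior covariance R = I_d and unit-variance Gaussian noise, for which the information gain has the standard closed form V(S) = (1/2) log det(I_d + X_S^T X_S) (the constants and the exact expression in \eqref{dcrit} may differ), and it checks monotonicity and diminishing marginal gains on randomly drawn nested sets.

import numpy as np

rng = np.random.default_rng(0)
n, d = 8, 3
X = rng.normal(size=(n, d))  # rows are the public feature vectors x_1, ..., x_n

def V(S):
    # Information gain of S for beta ~ N(0, I_d) and unit-variance noise:
    # V(S) = 1/2 * log det(I_d + X_S^T X_S), with V(emptyset) = 0.
    XS = X[sorted(S)]
    return 0.5 * np.linalg.slogdet(np.eye(d) + XS.T @ XS)[1]

for _ in range(1000):
    T = set(rng.choice(n, size=rng.integers(1, n), replace=False).tolist())
    S = set(rng.choice(sorted(T), size=rng.integers(0, len(T) + 1), replace=False).tolist())
    i = int(rng.integers(n))
    if i in T:
        continue
    # monotonicity: adding an experiment never decreases the information gain
    assert V(T | {i}) >= V(T) - 1e-9
    # submodularity (diminishing returns): the marginal gain of experiment i
    # is no larger on the superset T than on the subset S
    assert V(S | {i}) - V(S) >= V(T | {i}) - V(T) - 1e-9

print("V is monotone and submodular on all sampled (S, T, i) triples")

The 1e-9 slack only absorbs floating-point error; the two inequalities are exactly the monotonicity and diminishing-returns properties that the lemma and the corollary rely on.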
